From d100af9d0db9f710a81b80988056e1a47fbb141e Mon Sep 17 00:00:00 2001
From: Junli Gu <Junli.Gu@gmail.com>
Date: Sat, 11 Jul 2015 01:58:10 +0800
Subject: [PATCH 001/124] This is yibing's first patch. removed all cuda files
 and added device file

---
 Makefile                                      |  16 +-
 include/caffe/common.hpp                      |  25 +-
 include/caffe/device.hpp                      |  47 ++
 include/caffe/util/math_functions.hpp         | 280 +++++++-----
 include/caffe/util/math_functions.hpp.protect | 280 ++++++++++++
 src/caffe/common.cpp                          |  22 +-
 src/caffe/device.cpp                          | 421 +++++++++++++++++
 src/caffe/layers/conv_layer.cpp               |  10 +
 .../layers/{ => cufiles}/absval_layer.cu      |   0
 .../layers/{ => cufiles}/base_data_layer.cu   |   0
 src/caffe/layers/{ => cufiles}/bnll_layer.cu  |   0
 .../layers/{ => cufiles}/concat_layer.cu      |   0
 .../{ => cufiles}/contrastive_loss_layer.cu   |   0
 src/caffe/layers/{ => cufiles}/conv_layer.cu  |   0
 .../layers/{ => cufiles}/cudnn_conv_layer.cu  |   0
 .../{ => cufiles}/cudnn_pooling_layer.cu      |   0
 .../layers/{ => cufiles}/cudnn_relu_layer.cu  |   0
 .../{ => cufiles}/cudnn_sigmoid_layer.cu      |   0
 .../{ => cufiles}/cudnn_softmax_layer.cu      |   0
 .../layers/{ => cufiles}/cudnn_tanh_layer.cu  |   0
 .../layers/{ => cufiles}/deconv_layer.cu      |   0
 .../layers/{ => cufiles}/dropout_layer.cu     |   0
 .../layers/{ => cufiles}/eltwise_layer.cu     |   0
 .../{ => cufiles}/euclidean_loss_layer.cu     |   0
 src/caffe/layers/{ => cufiles}/exp_layer.cu   |   0
 .../layers/{ => cufiles}/filter_layer.cu      |   0
 .../layers/{ => cufiles}/hdf5_data_layer.cu   |   0
 .../layers/{ => cufiles}/hdf5_output_layer.cu |   0
 .../layers/{ => cufiles}/im2col_layer.cu      |   0
 .../{ => cufiles}/inner_product_layer.cu      |   0
 src/caffe/layers/{ => cufiles}/log_layer.cu   |   0
 src/caffe/layers/{ => cufiles}/lrn_layer.cu   |   0
 src/caffe/layers/{ => cufiles}/mvn_layer.cu   |   0
 .../layers/{ => cufiles}/pooling_layer.cu     |   0
 src/caffe/layers/{ => cufiles}/power_layer.cu |   0
 src/caffe/layers/{ => cufiles}/prelu_layer.cu |   0
 .../layers/{ => cufiles}/reduction_layer.cu   |   0
 src/caffe/layers/{ => cufiles}/relu_layer.cu  |   0
 .../sigmoid_cross_entropy_loss_layer.cu       |   0
 .../layers/{ => cufiles}/sigmoid_layer.cu     |   0
 .../layers/{ => cufiles}/silence_layer.cu     |   0
 src/caffe/layers/{ => cufiles}/slice_layer.cu |   0
 .../layers/{ => cufiles}/softmax_layer.cu     |   0
 .../{ => cufiles}/softmax_loss_layer.cu       |   0
 src/caffe/layers/{ => cufiles}/split_layer.cu |   0
 src/caffe/layers/{ => cufiles}/tanh_layer.cu  |   0
 .../layers/{ => cufiles}/threshold_layer.cu   |   0
 src/caffe/layers/dropout_layer.cpp            |  10 +
 src/caffe/layers/pooling_layer.cpp            |   9 +
 src/caffe/layers/relu_layer.cpp               |  10 +
 src/caffe/syncedmem.cpp                       |  11 +-
 src/caffe/util/math_functions.cpp             | 430 ++++++++++++++----
 src/caffe/util/math_functions.cpp.protect     | 413 +++++++++++++++++
 53 files changed, 1771 insertions(+), 213 deletions(-)
 create mode 100644 include/caffe/device.hpp
 create mode 100644 include/caffe/util/math_functions.hpp.protect
 create mode 100644 src/caffe/device.cpp
 rename src/caffe/layers/{ => cufiles}/absval_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/base_data_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/bnll_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/concat_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/contrastive_loss_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/conv_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/cudnn_conv_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/cudnn_pooling_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/cudnn_relu_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/cudnn_sigmoid_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/cudnn_softmax_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/cudnn_tanh_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/deconv_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/dropout_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/eltwise_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/euclidean_loss_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/exp_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/filter_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/hdf5_data_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/hdf5_output_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/im2col_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/inner_product_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/log_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/lrn_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/mvn_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/pooling_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/power_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/prelu_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/reduction_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/relu_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/sigmoid_cross_entropy_loss_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/sigmoid_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/silence_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/slice_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/softmax_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/softmax_loss_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/split_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/tanh_layer.cu (100%)
 rename src/caffe/layers/{ => cufiles}/threshold_layer.cu (100%)
 create mode 100644 src/caffe/util/math_functions.cpp.protect

diff --git a/Makefile b/Makefile
index 05b783af..80c5642d 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,7 @@ DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so
 # CXX_SRCS are the source files excluding the test ones.
 CXX_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cpp" -name "*.cpp")
 # CU_SRCS are the cuda source files
-CU_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cu" -name "*.cu")
+#CU_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cu" -name "*.cu")
 # TEST_SRCS are the test source files
 TEST_MAIN_SRC := src/$(PROJECT)/test/test_caffe_main.cpp
 TEST_SRCS := $(shell find src/$(PROJECT) -name "test_*.cpp")
@@ -525,13 +525,13 @@ $(PROTO_BUILD_DIR)/%.pb.o: $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_GEN_HEADER) \
 		|| (cat $@.$(WARNS_EXT); exit 1)
 	@ cat $@.$(WARNS_EXT)
 
-$(BUILD_DIR)/cuda/%.o: %.cu | $(ALL_BUILD_DIRS)
-	@ echo NVCC $<
-	$(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
-		-odir $(@D)
-	$(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 2> $@.$(WARNS_EXT) \
-		|| (cat $@.$(WARNS_EXT); exit 1)
-	@ cat $@.$(WARNS_EXT)
+#$(BUILD_DIR)/cuda/%.o: %.cu | $(ALL_BUILD_DIRS)
+#	@ echo NVCC $<
+#	$(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
+#		-odir $(@D)
+#	$(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 2> $@.$(WARNS_EXT) \
+#		|| (cat $@.$(WARNS_EXT); exit 1)
+#	@ cat $@.$(WARNS_EXT)
 
 $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \
 		| $(DYNAMIC_NAME) $(TEST_BIN_DIR)
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 5f86bc26..b1528474 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -1,6 +1,7 @@
 #ifndef CAFFE_COMMON_HPP_
 #define CAFFE_COMMON_HPP_
 
+#include <CL/cl_ext.h>
 #include <boost/shared_ptr.hpp>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
@@ -15,7 +16,10 @@
 #include <string>
 #include <utility>  // pair
 #include <vector>
+#include <clBLAS.h>
+#include <CL/cl.h>
 
+#include "caffe/device.hpp"
 #include "caffe/util/device_alternate.hpp"
 
 // gflags 2.1 issue: namespace google was changed to gflags without warning.
@@ -65,6 +69,25 @@ private:\
 // is executed we will see a fatal log.
 #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet"
 
+// OpenCL: various checks for different function calls.
+#define OCL_CHECK(condition) \
+  do { \
+    cl_int error = condition; \
+    CHECK_EQ(error, CL_SUCCESS) << " " << error; \
+    if(CL_SUCCESS != error){ \
+       LOG(INFO) << "failed";\
+    } \
+  } while (0)
+
+#define CLBLAS_CHECK(flag) \
+  do { \
+     cl_int error = flag; \
+     CHECK_EQ(error, clblasSuccess) << " " << error; \
+     if (error != clblasSuccess){ \
+         LOG(INFO) << "clBlas Function Failed! Error Code:" << error; \
+     } \
+ } while(0)
+
 // See PR #1236
 namespace cv { class Mat; }
 
@@ -104,7 +127,7 @@ class Caffe {
     }
     return *singleton_;
   }
-  enum Brew { CPU, GPU };
+  enum Brew { CPU, GPU, APU };
 
   // This random number generator facade hides boost and CUDA rng
   // implementation from one another (for cross-platform compatibility).
diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
new file mode 100644
index 00000000..07e65848
--- /dev/null
+++ b/include/caffe/device.hpp
@@ -0,0 +1,47 @@
+#ifndef CAFFE_DEVICE_HPP
+#define CAFFE_DEVICE_HPP
+#include <CL/cl.h>
+#include <string>
+#include <fstream>
+#include "caffe/common.hpp"
+namespace caffe {
+
+class Device{
+public:
+    Device():numPlatforms(0),numDevices(0){}
+    ~Device();
+    cl_uint numPlatforms;
+    cl_platform_id * platformIDs;
+    char platformName[64];
+    char openclVersion[64];
+    cl_uint numDevices;
+    cl_device_id * DeviceIDs;
+    cl_context Context;
+    cl_command_queue CommandQueue;
+    cl_command_queue CommandQueue_helper;
+    cl_program Program; 
+    clblasOrder col;
+    clblasOrder row;
+
+     
+    cl_int Init(); 
+    cl_int ConvertToString(const char *pFileName,std::string &Str);
+    void DisplayPlatformInfo();
+    void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
+
+    void GetDeviceInfo();
+
+    template <typename T>
+    void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str);
+    template <typename T>
+    void appendBitfield(T info, T value, std::string name, std::string &str);
+    
+
+};
+extern char* buildOption;
+extern Device amdDevice;
+
+}  // namespace caffe
+
+#endif //CAFFE_DEVICE_HPP
+
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 2cacd8e7..bcafeb89 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -1,18 +1,19 @@
+// Copyright 2014 BVLC and contributors.
+
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
 
 #include <stdint.h>
 #include <cmath>  // for std::fabs and std::signbit
-
+#include <CL/cl.h>
+#include <clBLAS.h>
 #include "glog/logging.h"
 
-#include "caffe/common.hpp"
-#include "caffe/util/device_alternate.hpp"
 #include "caffe/util/mkl_alternate.hpp"
 
 namespace caffe {
 
-// Caffe gemm provides a simpler interface to the gemm functions, with the
+// Decaf gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
 template <typename Dtype>
 void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA,
@@ -20,35 +21,97 @@ void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA,
     const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
     Dtype* C);
 
+// Decaf gpu gemm provides an interface that is almost the same as the cpu
+// gemm function - following the c convention and calling the fortran-order
+// gpu code under the hood.
+template <typename Dtype>
+void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+    Dtype* C);
+
+template <typename Dtype>
+cl_event caffe_gpu_gemmex( cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta,
+    Dtype* C, const int offC);
+/*This is Yuan Gao's sgemm_ex*/
+template <typename Dtype>
+void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+    Dtype* C, const int offset1, const int offset2, const int offset3);
+
+
+template <typename Dtype>
+cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta,
+    Dtype* C, const int offC);
+
 template <typename Dtype>
 void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
     const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
     Dtype* y);
 
+template <typename Dtype>
+void caffe_gpu_gemvv(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda,
+    const Dtype * x, size_t offx, const Dtype beta, int incx,
+    Dtype* y, size_t offy, int incy);
+
+template <typename Dtype>
+void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
+    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+    Dtype* y);
+
+
 template <typename Dtype>
 void caffe_axpy(const int N, const Dtype alpha, const Dtype* X,
     Dtype* Y);
 
+template <typename Dtype>
+void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X,
+    Dtype* Y);
+
 template <typename Dtype>
 void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
     const Dtype beta, Dtype* Y);
 
+template <typename Dtype>
+void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X,
+    const Dtype beta, Dtype* Y);
+
 template <typename Dtype>
 void caffe_copy(const int N, const Dtype *X, Dtype *Y);
 
 template <typename Dtype>
 void caffe_set(const int N, const Dtype alpha, Dtype *X);
 
-inline void caffe_memset(const size_t N, const int alpha, void* X) {
-  memset(X, alpha, N);  // NOLINT(caffe/alt_fn)
-}
+template <typename Dtype>
+void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X);
+
+template <typename Dtype>
+void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y);
 
 template <typename Dtype>
 void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
+template <typename Dtype>
+void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
+
+template <typename Dtype>
+void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X);
+
+template <typename Dtype>
+void caffe_gpu_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X);
+
 template <typename Dtype>
 void caffe_scal(const int N, const Dtype alpha, Dtype *X);
 
+template <typename Dtype>
+void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X);
+
 template <typename Dtype>
 void caffe_sqr(const int N, const Dtype* a, Dtype* y);
 
@@ -61,12 +124,35 @@ void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 template <typename Dtype>
 void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
+//template <typename Dtype>
+//void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y, Dtype* scratch_buf);
+//CUDA version, need to be deleted
+template <typename Dtype>
+void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_mul(cl_kernel Kernel, const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
 template <typename Dtype>
 void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
+//CUDA version, need to be deleted
+template <typename Dtype>
+void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
 template <typename Dtype>
 void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
+//CUDA version, need to be deleted
+template <typename Dtype>
+void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_powx(cl_kernel Kernel, const int n, const Dtype* a, const Dtype b, Dtype* y);
+
 unsigned int caffe_rng_rand();
 
 template <typename Dtype>
@@ -75,43 +161,62 @@ Dtype caffe_nextafter(const Dtype b);
 template <typename Dtype>
 void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
 
+// caffe_gpu_rng_uniform with two arguments generates integers in the range
+// [0, UINT_MAX].
+void caffe_gpu_rng_uniform(const int n, unsigned int* r);
+
+// caffe_gpu_rng_uniform with four arguments generates floats in the range
+// (a, b] (strictly greater than a, less than or equal to b) due to the
+// specification of curandGenerateUniform.  With a = 0, b = 1, just calls
+// curandGenerateUniform; with other limits will shift and scale the outputs
+// appropriately after calling curandGenerateUniform.
+template <typename Dtype>
+void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
+
 template <typename Dtype>
 void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
                         Dtype* r);
 
 template <typename Dtype>
-void caffe_rng_bernoulli(const int n, const Dtype p, int* r);
+void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
+                            Dtype* r);
 
 template <typename Dtype>
-void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r);
+void caffe_rng_bernoulli(const int n, const Dtype p, int* r);
 
 template <typename Dtype>
-void caffe_exp(const int n, const Dtype* a, Dtype* y);
+void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r);
 
 template <typename Dtype>
-void caffe_log(const int n, const Dtype* a, Dtype* y);
+void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r);
 
 template <typename Dtype>
-void caffe_abs(const int n, const Dtype* a, Dtype* y);
+void caffe_exp(const int n, const Dtype* a, Dtype* y);
 
 template <typename Dtype>
 Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y);
 
 template <typename Dtype>
-Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx,
-    const Dtype* y, const int incy);
+void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
 
 template <typename Dtype>
 int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y);
 
+template <typename Dtype>
+uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x,
+                                    const Dtype* y);
+
 // Returns the sum of the absolute values of the elements of vector x
 template <typename Dtype>
 Dtype caffe_cpu_asum(const int n, const Dtype* x);
 
+template <typename Dtype>
+void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
+
 // the branchless, type-safe version from
 // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c
 template<typename Dtype>
-inline int8_t caffe_sign(Dtype val) {
+inline char caffe_sign(Dtype val) {
   return (Dtype(0) < val) - (val < Dtype(0));
 }
 
@@ -130,63 +235,57 @@ inline int8_t caffe_sign(Dtype val) {
     } \
   }
 
+#define INSTANTIATE_CAFFE_CPU_UNARY_FUNC(name) \
+  template <> \
+  void caffe_cpu_##name<float>(const int n, const float* x, float* y); \
+  template <> \
+  void caffe_cpu_##name<double>(const int n, const double* x, double* y)
+
+/*
+#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \
+template<typename Dtype> \
+__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \
+  CUDA_KERNEL_LOOP(index, n) { \
+    operation; \
+  } \
+} \
+template <> \
+void caffe_gpu_##name<float>(const int n, const float* x, float* y) { \
+   NOLINT_NEXT_LINE(whitespace/operators)  \
+  name##_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
+      n, x, y); \
+} \
+template <> \
+void caffe_gpu_##name<double>(const int n, const double* x, double* y) { \
+   NOLINT_NEXT_LINE(whitespace/operators)  \
+  name##_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
+      n, x, y); \
+}
+*/
 // output is 1 for the positives, 0 for zero, and -1 for the negatives
 DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
 
-// This returns a nonzero value if the input has its sign bit set.
-// The name sngbit is meant to avoid conflicts with std::signbit in the macro.
-// The extra parens are needed because CUDA < 6.5 defines signbit as a macro,
-// and we don't want that to expand here when CUDA headers are also included.
-DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \
-    y[i] = static_cast<bool>((std::signbit)(x[i])));
-
-DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
-
-template <typename Dtype>
-void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
-
-#ifndef CPU_ONLY  // GPU
-
-// Decaf gpu gemm provides an interface that is almost the same as the cpu
-// gemm function - following the c convention and calling the fortran-order
-// gpu code under the hood.
-template <typename Dtype>
-void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-    Dtype* C);
-
-template <typename Dtype>
-void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
-    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
-    Dtype* y);
+template<typename Dtype>
+void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
 
-template <typename Dtype>
-void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X,
-    Dtype* Y);
+// This returns a nonzero value if the input has its sign bit set.
+// The name sngbit is meant to avoid conflicts with std::signbit in the macro
+using std::signbit;
+DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, y[i] = signbit(x[i]));
 
-template <typename Dtype>
-void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X,
-    const Dtype beta, Dtype* Y);
+template<typename Dtype>
+void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y);
 
-void caffe_gpu_memcpy(const size_t N, const void *X, void *Y);
+DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
 
 template <typename Dtype>
-void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X);
-
-inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
-#ifndef CPU_ONLY
-  CUDA_CHECK(cudaMemset(X, alpha, N));  // NOLINT(caffe/alt_fn)
-#else
-  NO_GPU;
-#endif
-}
+void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
+void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X);
+void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
 
 template <typename Dtype>
 void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
@@ -212,69 +311,20 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y);
 template <typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
-// caffe_gpu_rng_uniform with two arguments generates integers in the range
-// [0, UINT_MAX].
-void caffe_gpu_rng_uniform(const int n, unsigned int* r);
-
-// caffe_gpu_rng_uniform with four arguments generates floats in the range
-// (a, b] (strictly greater than a, less than or equal to b) due to the
-// specification of curandGenerateUniform.  With a = 0, b = 1, just calls
-// curandGenerateUniform; with other limits will shift and scale the outputs
-// appropriately after calling curandGenerateUniform.
-template <typename Dtype>
-void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
-
 template <typename Dtype>
-void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
-                            Dtype* r);
-
-template <typename Dtype>
-void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r);
-
-template <typename Dtype>
-void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
+void caffe_exp(const int n, const Dtype* a, Dtype* y);
 
-template <typename Dtype>
-uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x,
-                                    const Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
-
-template<typename Dtype>
-void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
-
-template<typename Dtype>
-void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y);
+void caffe_abs(const int n, const Dtype* a, Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y);
+void caffe_log(const int n, const Dtype* a, Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
-
-#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \
-template<typename Dtype> \
-__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \
-  CUDA_KERNEL_LOOP(index, n) { \
-    operation; \
-  } \
-} \
-template <> \
-void caffe_gpu_##name<float>(const int n, const float* x, float* y) { \
-  /* NOLINT_NEXT_LINE(whitespace/operators) */ \
-  name##_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
-      n, x, y); \
-} \
-template <> \
-void caffe_gpu_##name<double>(const int n, const double* x, double* y) { \
-  /* NOLINT_NEXT_LINE(whitespace/operators) */ \
-  name##_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
-      n, x, y); \
-}
-
-#endif  // !CPU_ONLY
-
+Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx,
+    const Dtype* y, const int incy);
 }  // namespace caffe
 
+
 #endif  // CAFFE_UTIL_MATH_FUNCTIONS_H_
diff --git a/include/caffe/util/math_functions.hpp.protect b/include/caffe/util/math_functions.hpp.protect
new file mode 100644
index 00000000..2cacd8e7
--- /dev/null
+++ b/include/caffe/util/math_functions.hpp.protect
@@ -0,0 +1,280 @@
+#ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
+#define CAFFE_UTIL_MATH_FUNCTIONS_H_
+
+#include <stdint.h>
+#include <cmath>  // for std::fabs and std::signbit
+
+#include "glog/logging.h"
+
+#include "caffe/common.hpp"
+#include "caffe/util/device_alternate.hpp"
+#include "caffe/util/mkl_alternate.hpp"
+
+namespace caffe {
+
+// Caffe gemm provides a simpler interface to the gemm functions, with the
+// limitation that the data has to be contiguous in memory.
+template <typename Dtype>
+void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+    Dtype* C);
+
+template <typename Dtype>
+void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
+    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+    Dtype* y);
+
+template <typename Dtype>
+void caffe_axpy(const int N, const Dtype alpha, const Dtype* X,
+    Dtype* Y);
+
+template <typename Dtype>
+void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
+    const Dtype beta, Dtype* Y);
+
+template <typename Dtype>
+void caffe_copy(const int N, const Dtype *X, Dtype *Y);
+
+template <typename Dtype>
+void caffe_set(const int N, const Dtype alpha, Dtype *X);
+
+inline void caffe_memset(const size_t N, const int alpha, void* X) {
+  memset(X, alpha, N);  // NOLINT(caffe/alt_fn)
+}
+
+template <typename Dtype>
+void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X);
+
+template <typename Dtype>
+void caffe_scal(const int N, const Dtype alpha, Dtype *X);
+
+template <typename Dtype>
+void caffe_sqr(const int N, const Dtype* a, Dtype* y);
+
+template <typename Dtype>
+void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
+
+unsigned int caffe_rng_rand();
+
+template <typename Dtype>
+Dtype caffe_nextafter(const Dtype b);
+
+template <typename Dtype>
+void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
+
+template <typename Dtype>
+void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
+                        Dtype* r);
+
+template <typename Dtype>
+void caffe_rng_bernoulli(const int n, const Dtype p, int* r);
+
+template <typename Dtype>
+void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r);
+
+template <typename Dtype>
+void caffe_exp(const int n, const Dtype* a, Dtype* y);
+
+template <typename Dtype>
+void caffe_log(const int n, const Dtype* a, Dtype* y);
+
+template <typename Dtype>
+void caffe_abs(const int n, const Dtype* a, Dtype* y);
+
+template <typename Dtype>
+Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y);
+
+template <typename Dtype>
+Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx,
+    const Dtype* y, const int incy);
+
+template <typename Dtype>
+int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y);
+
+// Returns the sum of the absolute values of the elements of vector x
+template <typename Dtype>
+Dtype caffe_cpu_asum(const int n, const Dtype* x);
+
+// the branchless, type-safe version from
+// http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c
+template<typename Dtype>
+inline int8_t caffe_sign(Dtype val) {
+  return (Dtype(0) < val) - (val < Dtype(0));
+}
+
+// The following two macros are modifications of DEFINE_VSL_UNARY_FUNC
+//   in include/caffe/util/mkl_alternate.hpp authored by @Rowland Depp.
+// Please refer to commit 7e8ef25c7 of the boost-eigen branch.
+// Git cherry picking that commit caused a conflict hard to resolve and
+//   copying that file in convenient for code reviewing.
+// So they have to be pasted here temporarily.
+#define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \
+  template<typename Dtype> \
+  void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \
+    CHECK_GT(n, 0); CHECK(x); CHECK(y); \
+    for (int i = 0; i < n; ++i) { \
+      operation; \
+    } \
+  }
+
+// output is 1 for the positives, 0 for zero, and -1 for the negatives
+DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
+
+// This returns a nonzero value if the input has its sign bit set.
+// The name sngbit is meant to avoid conflicts with std::signbit in the macro.
+// The extra parens are needed because CUDA < 6.5 defines signbit as a macro,
+// and we don't want that to expand here when CUDA headers are also included.
+DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \
+    y[i] = static_cast<bool>((std::signbit)(x[i])));
+
+DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
+
+template <typename Dtype>
+void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
+
+#ifndef CPU_ONLY  // GPU
+
+// Decaf gpu gemm provides an interface that is almost the same as the cpu
+// gemm function - following the c convention and calling the fortran-order
+// gpu code under the hood.
+template <typename Dtype>
+void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+    Dtype* C);
+
+template <typename Dtype>
+void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
+    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+    Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X,
+    Dtype* Y);
+
+template <typename Dtype>
+void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X,
+    const Dtype beta, Dtype* Y);
+
+void caffe_gpu_memcpy(const size_t N, const void *X, void *Y);
+
+template <typename Dtype>
+void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X);
+
+inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
+#ifndef CPU_ONLY
+  CUDA_CHECK(cudaMemset(X, alpha, N));  // NOLINT(caffe/alt_fn)
+#else
+  NO_GPU;
+#endif
+}
+
+template <typename Dtype>
+void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
+
+template <typename Dtype>
+void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X);
+
+template <typename Dtype>
+void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_log(const int n, const Dtype* a, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
+
+// caffe_gpu_rng_uniform with two arguments generates integers in the range
+// [0, UINT_MAX].
+void caffe_gpu_rng_uniform(const int n, unsigned int* r);
+
+// caffe_gpu_rng_uniform with four arguments generates floats in the range
+// (a, b] (strictly greater than a, less than or equal to b) due to the
+// specification of curandGenerateUniform.  With a = 0, b = 1, just calls
+// curandGenerateUniform; with other limits will shift and scale the outputs
+// appropriately after calling curandGenerateUniform.
+template <typename Dtype>
+void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
+
+template <typename Dtype>
+void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
+                            Dtype* r);
+
+template <typename Dtype>
+void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r);
+
+template <typename Dtype>
+void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
+
+template <typename Dtype>
+uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x,
+                                    const Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
+
+template<typename Dtype>
+void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
+
+template<typename Dtype>
+void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
+
+#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \
+template<typename Dtype> \
+__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \
+  CUDA_KERNEL_LOOP(index, n) { \
+    operation; \
+  } \
+} \
+template <> \
+void caffe_gpu_##name<float>(const int n, const float* x, float* y) { \
+  /* NOLINT_NEXT_LINE(whitespace/operators) */ \
+  name##_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
+      n, x, y); \
+} \
+template <> \
+void caffe_gpu_##name<double>(const int n, const double* x, double* y) { \
+  /* NOLINT_NEXT_LINE(whitespace/operators) */ \
+  name##_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
+      n, x, y); \
+}
+
+#endif  // !CPU_ONLY
+
+}  // namespace caffe
+
+#endif  // CAFFE_UTIL_MATH_FUNCTIONS_H_
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index af96cac4..e53a5c0d 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -85,7 +85,8 @@ void* Caffe::RNG::generator() {
 #else  // Normal GPU + CPU Caffe.
 
 Caffe::Caffe()
-    : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(),
+{
+/*    : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(),
     mode_(Caffe::CPU) {
   // Try to create a cublas handler, and report an error if failed (but we will
   // keep the program running as one might just want to run CPU code).
@@ -99,18 +100,20 @@ Caffe::Caffe()
       != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
   }
+*/
 }
 
 Caffe::~Caffe() {
-  if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
+ /* if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
   if (curand_generator_) {
     CURAND_CHECK(curandDestroyGenerator(curand_generator_));
   }
+*/
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
   // Curand seed
-  static bool g_curand_availability_logged = false;
+ /* static bool g_curand_availability_logged = false;
   if (Get().curand_generator_) {
     CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(),
         seed));
@@ -124,10 +127,11 @@ void Caffe::set_random_seed(const unsigned int seed) {
   }
   // RNG seed
   Get().random_generator_.reset(new RNG(seed));
+*/
 }
 
 void Caffe::SetDevice(const int device_id) {
-  int current_device;
+ /* int current_device;
   CUDA_CHECK(cudaGetDevice(&current_device));
   if (current_device == device_id) {
     return;
@@ -144,10 +148,11 @@ void Caffe::SetDevice(const int device_id) {
       CURAND_RNG_PSEUDO_DEFAULT));
   CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
       cluster_seedgen()));
+*/
 }
 
 void Caffe::DeviceQuery() {
-  cudaDeviceProp prop;
+  /*cudaDeviceProp prop;
   int device;
   if (cudaSuccess != cudaGetDevice(&device)) {
     printf("No cuda device present.\n");
@@ -179,6 +184,7 @@ void Caffe::DeviceQuery() {
   LOG(INFO) << "Kernel execution timeout:      "
       << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
   return;
+*/
 }
 
 
@@ -205,7 +211,7 @@ void* Caffe::RNG::generator() {
 }
 
 const char* cublasGetErrorString(cublasStatus_t error) {
-  switch (error) {
+ /* switch (error) {
   case CUBLAS_STATUS_SUCCESS:
     return "CUBLAS_STATUS_SUCCESS";
   case CUBLAS_STATUS_NOT_INITIALIZED:
@@ -231,11 +237,12 @@ const char* cublasGetErrorString(cublasStatus_t error) {
     return "CUBLAS_STATUS_LICENSE_ERROR";
 #endif
   }
+*/
   return "Unknown cublas status";
 }
 
 const char* curandGetErrorString(curandStatus_t error) {
-  switch (error) {
+  /*switch (error) {
   case CURAND_STATUS_SUCCESS:
     return "CURAND_STATUS_SUCCESS";
   case CURAND_STATUS_VERSION_MISMATCH:
@@ -263,6 +270,7 @@ const char* curandGetErrorString(curandStatus_t error) {
   case CURAND_STATUS_INTERNAL_ERROR:
     return "CURAND_STATUS_INTERNAL_ERROR";
   }
+*/
   return "Unknown curand status";
 }
 
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
new file mode 100644
index 00000000..7c564589
--- /dev/null
+++ b/src/caffe/device.cpp
@@ -0,0 +1,421 @@
+#include "caffe/common.hpp"
+#include "caffe/device.hpp"
+#include <stdio.h>
+#include <fstream>
+#include <iostream>
+#include <malloc.h>
+namespace caffe {
+//delete it after test, Yibing
+cl_mem test_alloc_mem[10];
+extern long long unsigned device_mem_consumption;
+
+Device amdDevice;
+char* buildOption = "-x clc++ ";
+
+Device::~Device(){
+    //clAmdBlasTeardown(); 
+     free((void*)platformIDs);
+     free(DeviceIDs);
+     clReleaseProgram(Program);
+     clReleaseCommandQueue(CommandQueue);
+     clReleaseCommandQueue(CommandQueue_helper);
+     clReleaseContext(Context);
+     LOG(INFO) << "device destructor";
+}
+
+
+cl_int Device::Init(){
+
+    //Get Platform Infomation
+    DisplayPlatformInfo();
+  
+    clGetPlatformIDs(0, NULL, &numPlatforms);
+    cl_platform_id PlatformIDs[numPlatforms];
+    clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
+    
+    size_t nameLen;
+    cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen);
+    if(res != CL_SUCCESS){
+        fprintf(stderr, "Err: Failed to Get Platform Info\n", res);
+        return 0;
+    }
+    platformName[nameLen] = 0;
+
+    //Get OpenCL Information 
+    //res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_VERSION, 64, openclVersion, &nameLen);
+    //if(res != CL_SUCCESS) {
+    //    fprintf(stderr, "Err: Get OpenCL Info failed!\n", res);
+    //    return 0;
+    //}
+    //openclVersion[nameLen] = 0;
+    //printf("%s %s\n", platformName, openclVersion);
+  
+    GetDeviceInfo();
+    cl_device_id * pDevices;
+    cl_uint uiNumDevices;
+    cl_bool unified_memory = false;
+    switch(Caffe::mode()) {
+    case Caffe::GPU:
+         //choose_gpu();
+      clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+      uiNumDevices = numDevices;
+      if(0 == uiNumDevices){
+        LOG(FATAL) << "Err: No GPU devices";
+       }
+       else{
+        pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
+        OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices));
+        for (int i = 0; i < (int)uiNumDevices; i++){
+          clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL);
+          if(unified_memory) //skip iGPU
+            continue;
+          else {//we pick the first GPU we found
+           pDevices[0] = pDevices[i];
+            }
+         }
+       }
+         LOG(INFO) << "picked device type: GPU";
+         break;
+    case Caffe::CPU:
+         //choose_cpu();
+         clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);
+         uiNumDevices = numDevices;
+        if(0 == uiNumDevices){
+          LOG(FATAL) << "Err: No CPU devices";
+          }
+         pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
+         OCL_CHECK( clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 1, pDevices, NULL) );
+         LOG(INFO) << "picked device type: CPU";
+         break;
+    case Caffe::APU:
+        clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+        uiNumDevices = numDevices;
+        if(0 == uiNumDevices){
+          LOG(FATAL) << "Err: No GPU devices";
+         }
+         else{
+          pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
+          OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices));
+          for (int i = 0; i < (int)uiNumDevices; i++){
+            clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL);
+             if(unified_memory) //we pick the first GPU we found
+              pDevices[0] = pDevices[i];
+             else {//skip dGPU
+               continue;
+               }
+         }
+       }
+         LOG(INFO) << "picked device type: APU";
+         break;
+    default:
+         LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+    }
+
+    //Create Context
+    Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
+    if(NULL == Context){
+        fprintf(stderr,"Err: Failed to Create Context\n");
+        return 0;
+    }
+
+    //Create CommandQueue
+    CommandQueue = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
+    CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
+    if(NULL == CommandQueue || NULL == CommandQueue_helper){
+        fprintf(stderr,"Err: Failed to Create Commandqueue\n");
+        return 0;
+    }
+
+    //Read our own kernel file
+    const char *pFileName = "../../src/caffe/OCL_kernel.cl";
+    const char *pSource;
+    std::string strSource = "";
+    ConvertToString(pFileName, strSource);
+    pSource = strSource.c_str();
+    size_t uiArrSourceSize[] = {0};
+    uiArrSourceSize[0] = strlen(pSource);
+    Program = NULL;
+    Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL);
+    if(NULL == Program){
+        fprintf(stderr,"Err: Failed to create program\n");
+    }
+
+    //Build Program
+    cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL);
+    LOG(INFO) << "Build Program";
+    if(CL_SUCCESS != iStatus){
+        fprintf(stderr,"Err: Failed to build program\n");
+        char szBuildLog[16384];
+        clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL);
+        std::cout << szBuildLog;
+        clReleaseProgram(Program);
+    }
+
+    /*
+    //Setup AmdBlas;
+    cl_int err;
+    err = clAmdBlasSetup();
+    if(err != CL_SUCCESS){
+        printf("clAmdBlasSetup() failed with %d\n", err);
+    }
+    */
+    row = clblasRowMajor;
+    col = clblasColumnMajor;
+	
+	/* 
+	//delete after test the large buffer allocation, Yibing	
+	long long global_mem_size_limit = 1024*1024; //4*1024*1024*1024;
+	global_mem_size_limit *= (long long)(0.0*1024.0);
+	//global_mem_size_limit = 16834887680/2;
+	long long available_global_mem_size = 1024*1024;
+        available_global_mem_size *= 20*1024; 
+	
+	long long global_mem_malloc_size_limit = 1024*1024;
+	while(available_global_mem_size > global_mem_size_limit){
+		long long size_;
+		if((available_global_mem_size - global_mem_size_limit) >= global_mem_malloc_size_limit){
+			size_ = global_mem_malloc_size_limit;
+		}else{
+			size_ = available_global_mem_size - global_mem_size_limit;
+		}
+		available_global_mem_size = available_global_mem_size - size_;
+		int *tmpData = (int *)malloc(size_);
+		cl_int err;
+                int i = 0;
+		test_alloc_mem[i] = clCreateBuffer(Context, CL_MEM_READ_WRITE, size_, NULL, &err);
+        	err = clEnqueueWriteBuffer(CommandQueue, test_alloc_mem[i], CL_TRUE, 0, size_, tmpData, 0, NULL, NULL);
+		i++;
+                device_mem_consumption += size_;
+                //printf("self alloc, device_mem_consumption = %lu\n", device_mem_consumption);
+		if(err != CL_SUCCESS) {
+                	printf("Large Buffer Allocation  failed! error_code = %d\n", err);
+                	printf("self alloc, device_mem_consumption = %llu\n", device_mem_consumption);
+                	exit(1);
+        	}
+                
+		cl_ulong free_mem_size, mem_size;
+                cl_int err1 = clGetDeviceInfo(pDevices[0],CL_DEVICE_GLOBAL_FREE_MEMORY_AMD,sizeof(cl_ulong),&free_mem_size,NULL);
+                cl_int err2 = clGetDeviceInfo(pDevices[0],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&mem_size,NULL);
+                //std::cout<<"free memory size after allocation = "<<free_mem_size<<",err_code ="<<err1<<std::endl;
+                //std::cout<<"global memory size = "<<mem_size<<",err_code ="<<err2<<std::endl;
+        	
+		free(tmpData);
+	}*/
+
+    return 0;
+}
+
+
+//Use to read OpenCL source code
+cl_int Device::ConvertToString(const char *pFileName,std::string &Str){
+    size_t uiSize=0;
+    size_t uiFileSize=0;
+    char *pStr=NULL;
+    std::fstream fFile(pFileName,(std::fstream::in|std::fstream::binary));
+    if(fFile.is_open()){
+        fFile.seekg(0,std::fstream::end);
+        uiSize=uiFileSize=(size_t)fFile.tellg();
+        fFile.seekg(0,std::fstream::beg);
+        pStr=new char[uiSize+1];
+
+        if(NULL==pStr){
+            fFile.close();
+            return 0;
+        }
+        fFile.read(pStr,uiFileSize);
+        fFile.close();
+        pStr[uiSize]='\0';
+        Str=pStr;
+        delete[] pStr;
+        return 0;
+    }
+    LOG(ERROR) << "Err: Failed to open cl file!";
+    return -1;
+}
+
+void Device::DisplayPlatformInfo(){
+   cl_int err;
+   size_t size;
+
+   err = clGetPlatformIDs (0, NULL, &numPlatforms);
+   if(err != CL_SUCCESS || numPlatforms <=0)
+   {
+      LOG(ERROR) << "Failed to find any OpenCL platform.";
+      return;
+   }
+
+   platformIDs = (cl_platform_id *) malloc (sizeof(cl_platform_id) * numPlatforms);
+   err = clGetPlatformIDs (numPlatforms, platformIDs, NULL);
+   if(err != CL_SUCCESS)
+   {
+      LOG(ERROR) << "Failed to find any OpenCL platform.";
+      return;
+   }
+
+   LOG(INFO) << "Number of platforms found:" << numPlatforms;
+
+  //iterate through the list of platforms displaying platform information
+  for (cl_uint i = 0; i < numPlatforms; i++ ){
+  DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME");
+  DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE");
+  DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION");
+  DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR");
+  DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS");
+  }
+   
+}
+
+void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str){
+    cl_int err;
+    std::size_t paramValueSize;
+
+    err = clGetPlatformInfo(id, name, 0, NULL, &paramValueSize);  
+   if(err != CL_SUCCESS)
+   {
+      LOG(ERROR) << "Failed to find OpenCL platform:" << str;
+      return;
+   }
+   
+   char * info = (char *) alloca (sizeof(char) * paramValueSize);
+   err = clGetPlatformInfo(id, name, paramValueSize, info, NULL);
+   if(err != CL_SUCCESS)
+   {
+      LOG(ERROR) << "Failed to find OpenCL platform:" << str;
+      return;
+   }
+
+   LOG(INFO) << "\t" << str << "\t" << info;
+}
+
+void Device::GetDeviceInfo(){
+    cl_int err;
+    //by default, we select the first platform. can be extended for more platforms
+    //query GPU device for now
+    err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+    // we allow program run if no GPU is found. Just return. No error reported.
+    if (numDevices < 1)
+    {
+      LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0];
+      LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0];
+      return;
+    }
+    
+    DeviceIDs = (cl_device_id *) malloc (sizeof(cl_device_id) * numDevices);
+    err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, DeviceIDs, NULL);
+    if(err != CL_SUCCESS)
+    {
+      LOG(INFO) << "Failed to find any GPU devices.";
+      return;
+    }
+
+    LOG(INFO) << "Number of devices found:" << numDevices;
+    for(cl_uint i = 0; i < numDevices; i++){
+    LOG(INFO) << "\t" << "DeviceID" << ":\t" <<DeviceIDs[i];
+    DisplayDeviceInfo<cl_device_type>(DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
+    DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
+    DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
+    DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
+    DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
+    DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
+    DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
+    DisplayDeviceInfo<size_t>(DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
+    DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
+    DisplayDeviceInfo<size_t *>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "Max work item sizes");
+    DisplayDeviceInfo<cl_command_queue_properties>(DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
+    DisplayDeviceInfo<cl_device_exec_capabilities>(DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
+    DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
+    DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
+    DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
+    }
+    
+    
+}
+
+template <typename T>
+void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str){
+    cl_int err;
+    std::size_t paramValueSize;
+
+    err = clGetDeviceInfo(id, name, 0, NULL, &paramValueSize);  
+   if(err != CL_SUCCESS)
+   {
+      LOG(ERROR) << "Failed to find OpenCL device info:" << str;
+      return;
+   }
+  
+   std::string content; 
+   T * info = (T *) alloca (sizeof(T) * paramValueSize);
+   err = clGetDeviceInfo(id, name, paramValueSize, info, NULL);
+   if(err != CL_SUCCESS)
+   {
+      LOG(ERROR) << "Failed to find OpenCL device info:" << str;
+      return;
+   }
+
+
+   switch(name)
+{
+    case CL_DEVICE_TYPE:
+    {
+        std::string deviceType;
+        appendBitfield<cl_device_type>(
+        *(reinterpret_cast<cl_device_type*>(info)),CL_DEVICE_TYPE_CPU,"CL_DEVICE_TYPE_CPU",deviceType);
+
+        appendBitfield<cl_device_type>(
+        *(reinterpret_cast<cl_device_type*>(info)),CL_DEVICE_TYPE_GPU,"CL_DEVICE_TYPE_GPU",deviceType);
+
+        appendBitfield<cl_device_type>(
+        *(reinterpret_cast < cl_device_type*>(info)),CL_DEVICE_TYPE_ACCELERATOR,"CL_DEVICE_TYPE_ACCELERATOR",deviceType);
+
+        appendBitfield<cl_device_type>(
+        *(reinterpret_cast < cl_device_type*>(info)),CL_DEVICE_TYPE_DEFAULT,"CL_DEVICE_TYPE_DEFAULT",deviceType);
+        
+	LOG(INFO) << "\t " << str << ":\t" << deviceType;
+    }
+        break;
+    case CL_DEVICE_EXECUTION_CAPABILITIES:
+    {
+        std::string memType;
+        appendBitfield<cl_device_exec_capabilities>(
+        *(reinterpret_cast<cl_device_exec_capabilities*>(info)),CL_EXEC_KERNEL,"CL_EXEC_KERNEL",memType);
+
+        appendBitfield<cl_device_exec_capabilities>(
+        *(reinterpret_cast<cl_device_exec_capabilities*>(info)),CL_EXEC_NATIVE_KERNEL,"CL_EXEC_NATIVE_KERNEL",memType);
+
+        LOG(INFO) << "\t " << str << ":\t" << memType;
+
+    }
+       break;
+    case CL_DEVICE_QUEUE_PROPERTIES:
+        {
+            std::string memType;
+            appendBitfield<cl_device_exec_capabilities>(*(reinterpret_cast<cl_device_exec_capabilities*>(info)),CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,"CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE",memType);
+
+            appendBitfield<cl_device_exec_capabilities>(*(reinterpret_cast<cl_device_exec_capabilities*>(info)),CL_QUEUE_PROFILING_ENABLE,"CL_QUEUE_PROFILING_ENABLE",memType);
+
+            LOG(INFO) << "\t " << str << ":\t" << memType;
+        }
+        break;
+    default:
+        LOG(INFO) << "\t" << str << ":\t" << *info;
+        break;
+}
+
+}
+
+template<typename T>
+void Device::appendBitfield(T info, T value , std::string name , std::string &str)
+{
+    if(info & value)
+    {
+        if (str.length() > 0)
+        {
+            str.append(" | ");
+        }
+        str.append(name);
+    }
+}
+
+
+}  // namespace caffe
+
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 928ef5ee..b73f1a93 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -67,6 +67,16 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(ConvolutionLayer);
 #endif
diff --git a/src/caffe/layers/absval_layer.cu b/src/caffe/layers/cufiles/absval_layer.cu
similarity index 100%
rename from src/caffe/layers/absval_layer.cu
rename to src/caffe/layers/cufiles/absval_layer.cu
diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/cufiles/base_data_layer.cu
similarity index 100%
rename from src/caffe/layers/base_data_layer.cu
rename to src/caffe/layers/cufiles/base_data_layer.cu
diff --git a/src/caffe/layers/bnll_layer.cu b/src/caffe/layers/cufiles/bnll_layer.cu
similarity index 100%
rename from src/caffe/layers/bnll_layer.cu
rename to src/caffe/layers/cufiles/bnll_layer.cu
diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/cufiles/concat_layer.cu
similarity index 100%
rename from src/caffe/layers/concat_layer.cu
rename to src/caffe/layers/cufiles/concat_layer.cu
diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/cufiles/contrastive_loss_layer.cu
similarity index 100%
rename from src/caffe/layers/contrastive_loss_layer.cu
rename to src/caffe/layers/cufiles/contrastive_loss_layer.cu
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/cufiles/conv_layer.cu
similarity index 100%
rename from src/caffe/layers/conv_layer.cu
rename to src/caffe/layers/cufiles/conv_layer.cu
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cufiles/cudnn_conv_layer.cu
similarity index 100%
rename from src/caffe/layers/cudnn_conv_layer.cu
rename to src/caffe/layers/cufiles/cudnn_conv_layer.cu
diff --git a/src/caffe/layers/cudnn_pooling_layer.cu b/src/caffe/layers/cufiles/cudnn_pooling_layer.cu
similarity index 100%
rename from src/caffe/layers/cudnn_pooling_layer.cu
rename to src/caffe/layers/cufiles/cudnn_pooling_layer.cu
diff --git a/src/caffe/layers/cudnn_relu_layer.cu b/src/caffe/layers/cufiles/cudnn_relu_layer.cu
similarity index 100%
rename from src/caffe/layers/cudnn_relu_layer.cu
rename to src/caffe/layers/cufiles/cudnn_relu_layer.cu
diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cu b/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
similarity index 100%
rename from src/caffe/layers/cudnn_sigmoid_layer.cu
rename to src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
diff --git a/src/caffe/layers/cudnn_softmax_layer.cu b/src/caffe/layers/cufiles/cudnn_softmax_layer.cu
similarity index 100%
rename from src/caffe/layers/cudnn_softmax_layer.cu
rename to src/caffe/layers/cufiles/cudnn_softmax_layer.cu
diff --git a/src/caffe/layers/cudnn_tanh_layer.cu b/src/caffe/layers/cufiles/cudnn_tanh_layer.cu
similarity index 100%
rename from src/caffe/layers/cudnn_tanh_layer.cu
rename to src/caffe/layers/cufiles/cudnn_tanh_layer.cu
diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/cufiles/deconv_layer.cu
similarity index 100%
rename from src/caffe/layers/deconv_layer.cu
rename to src/caffe/layers/cufiles/deconv_layer.cu
diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/cufiles/dropout_layer.cu
similarity index 100%
rename from src/caffe/layers/dropout_layer.cu
rename to src/caffe/layers/cufiles/dropout_layer.cu
diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/cufiles/eltwise_layer.cu
similarity index 100%
rename from src/caffe/layers/eltwise_layer.cu
rename to src/caffe/layers/cufiles/eltwise_layer.cu
diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/cufiles/euclidean_loss_layer.cu
similarity index 100%
rename from src/caffe/layers/euclidean_loss_layer.cu
rename to src/caffe/layers/cufiles/euclidean_loss_layer.cu
diff --git a/src/caffe/layers/exp_layer.cu b/src/caffe/layers/cufiles/exp_layer.cu
similarity index 100%
rename from src/caffe/layers/exp_layer.cu
rename to src/caffe/layers/cufiles/exp_layer.cu
diff --git a/src/caffe/layers/filter_layer.cu b/src/caffe/layers/cufiles/filter_layer.cu
similarity index 100%
rename from src/caffe/layers/filter_layer.cu
rename to src/caffe/layers/cufiles/filter_layer.cu
diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/cufiles/hdf5_data_layer.cu
similarity index 100%
rename from src/caffe/layers/hdf5_data_layer.cu
rename to src/caffe/layers/cufiles/hdf5_data_layer.cu
diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/cufiles/hdf5_output_layer.cu
similarity index 100%
rename from src/caffe/layers/hdf5_output_layer.cu
rename to src/caffe/layers/cufiles/hdf5_output_layer.cu
diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/cufiles/im2col_layer.cu
similarity index 100%
rename from src/caffe/layers/im2col_layer.cu
rename to src/caffe/layers/cufiles/im2col_layer.cu
diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/cufiles/inner_product_layer.cu
similarity index 100%
rename from src/caffe/layers/inner_product_layer.cu
rename to src/caffe/layers/cufiles/inner_product_layer.cu
diff --git a/src/caffe/layers/log_layer.cu b/src/caffe/layers/cufiles/log_layer.cu
similarity index 100%
rename from src/caffe/layers/log_layer.cu
rename to src/caffe/layers/cufiles/log_layer.cu
diff --git a/src/caffe/layers/lrn_layer.cu b/src/caffe/layers/cufiles/lrn_layer.cu
similarity index 100%
rename from src/caffe/layers/lrn_layer.cu
rename to src/caffe/layers/cufiles/lrn_layer.cu
diff --git a/src/caffe/layers/mvn_layer.cu b/src/caffe/layers/cufiles/mvn_layer.cu
similarity index 100%
rename from src/caffe/layers/mvn_layer.cu
rename to src/caffe/layers/cufiles/mvn_layer.cu
diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/cufiles/pooling_layer.cu
similarity index 100%
rename from src/caffe/layers/pooling_layer.cu
rename to src/caffe/layers/cufiles/pooling_layer.cu
diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/cufiles/power_layer.cu
similarity index 100%
rename from src/caffe/layers/power_layer.cu
rename to src/caffe/layers/cufiles/power_layer.cu
diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/cufiles/prelu_layer.cu
similarity index 100%
rename from src/caffe/layers/prelu_layer.cu
rename to src/caffe/layers/cufiles/prelu_layer.cu
diff --git a/src/caffe/layers/reduction_layer.cu b/src/caffe/layers/cufiles/reduction_layer.cu
similarity index 100%
rename from src/caffe/layers/reduction_layer.cu
rename to src/caffe/layers/cufiles/reduction_layer.cu
diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/cufiles/relu_layer.cu
similarity index 100%
rename from src/caffe/layers/relu_layer.cu
rename to src/caffe/layers/cufiles/relu_layer.cu
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
similarity index 100%
rename from src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
rename to src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/cufiles/sigmoid_layer.cu
similarity index 100%
rename from src/caffe/layers/sigmoid_layer.cu
rename to src/caffe/layers/cufiles/sigmoid_layer.cu
diff --git a/src/caffe/layers/silence_layer.cu b/src/caffe/layers/cufiles/silence_layer.cu
similarity index 100%
rename from src/caffe/layers/silence_layer.cu
rename to src/caffe/layers/cufiles/silence_layer.cu
diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/cufiles/slice_layer.cu
similarity index 100%
rename from src/caffe/layers/slice_layer.cu
rename to src/caffe/layers/cufiles/slice_layer.cu
diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/cufiles/softmax_layer.cu
similarity index 100%
rename from src/caffe/layers/softmax_layer.cu
rename to src/caffe/layers/cufiles/softmax_layer.cu
diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/cufiles/softmax_loss_layer.cu
similarity index 100%
rename from src/caffe/layers/softmax_loss_layer.cu
rename to src/caffe/layers/cufiles/softmax_loss_layer.cu
diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/cufiles/split_layer.cu
similarity index 100%
rename from src/caffe/layers/split_layer.cu
rename to src/caffe/layers/cufiles/split_layer.cu
diff --git a/src/caffe/layers/tanh_layer.cu b/src/caffe/layers/cufiles/tanh_layer.cu
similarity index 100%
rename from src/caffe/layers/tanh_layer.cu
rename to src/caffe/layers/cufiles/tanh_layer.cu
diff --git a/src/caffe/layers/threshold_layer.cu b/src/caffe/layers/cufiles/threshold_layer.cu
similarity index 100%
rename from src/caffe/layers/threshold_layer.cu
rename to src/caffe/layers/cufiles/threshold_layer.cu
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index ec1256fd..7f1ac8f6 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -67,6 +67,16 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 
 #ifdef CPU_ONLY
 STUB_GPU(DropoutLayer);
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index c8d41499..d5207889 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -309,6 +309,15 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
 
 #ifdef CPU_ONLY
 STUB_GPU(PoolingLayer);
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index cc00319a..e05080bf 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -36,6 +36,16 @@ void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 
 #ifdef CPU_ONLY
 STUB_GPU(ReLULayer);
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 7617ccfb..200ca657 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -7,7 +7,7 @@
 namespace caffe {
 
 SyncedMemory::~SyncedMemory() {
-  if (cpu_ptr_ && own_cpu_data_) {
+/*  if (cpu_ptr_ && own_cpu_data_) {
     CaffeFreeHost(cpu_ptr_);
   }
 
@@ -16,10 +16,11 @@ SyncedMemory::~SyncedMemory() {
     CUDA_CHECK(cudaFree(gpu_ptr_));
   }
 #endif  // CPU_ONLY
+*/
 }
 
 inline void SyncedMemory::to_cpu() {
-  switch (head_) {
+ /* switch (head_) {
   case UNINITIALIZED:
     CaffeMallocHost(&cpu_ptr_, size_);
     caffe_memset(size_, 0, cpu_ptr_);
@@ -42,9 +43,11 @@ inline void SyncedMemory::to_cpu() {
   case SYNCED:
     break;
   }
+*/
 }
 
 inline void SyncedMemory::to_gpu() {
+/*
 #ifndef CPU_ONLY
   switch (head_) {
   case UNINITIALIZED:
@@ -66,6 +69,7 @@ inline void SyncedMemory::to_gpu() {
 #else
   NO_GPU;
 #endif
+*/
 }
 
 const void* SyncedMemory::cpu_data() {
@@ -74,13 +78,14 @@ const void* SyncedMemory::cpu_data() {
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
-  CHECK(data);
+  /*CHECK(data);
   if (own_cpu_data_) {
     CaffeFreeHost(cpu_ptr_);
   }
   cpu_ptr_ = data;
   head_ = HEAD_AT_CPU;
   own_cpu_data_ = false;
+*/
 }
 
 const void* SyncedMemory::gpu_data() {
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 0aab6b17..6cbf208d 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,3 +1,5 @@
+// Copyright 2014 BVLC and contributors.
+
 #include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
 
@@ -7,6 +9,9 @@
 #include "caffe/util/math_functions.hpp"
 #include "caffe/util/rng.hpp"
 
+static const clblasOrder order = clblasColumnMajor;
+#define pi 3.1415926
+
 namespace caffe {
 
 template<>
@@ -31,6 +36,92 @@ void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
       ldb, beta, C, N);
 }
 
+template <>
+void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    int lda = (TransA == CblasNoTrans) ? K : M;
+    int ldb = (TransB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+    CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+}
+
+template <>
+void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    int lda = (TransA == CblasNoTrans) ? K : M;
+    int ldb = (TransB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+}
+
+template <>
+cl_event caffe_gpu_gemm_ex<float>(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) {
+    cl_event event;
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    int lda = (TransA == CblasNoTrans) ? K : M;
+    int ldb = (TransB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event) );
+    return event;
+}
+
+template <>
+cl_event caffe_gpu_gemm_ex<double>(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) {
+    cl_event event;
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    int lda = (TransA == CblasNoTrans) ? K : M;
+    int ldb = (TransB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event) );
+    return event;
+}
+
+
+template <>
+cl_event caffe_gpu_gemmex<float>(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) {
+    cl_event event;
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    int lda = (TransA == CblasNoTrans) ? K : M;
+    int ldb = (TransB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    //AMDBLAS_CHECK( clAmdBlasSgemmEx(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, NULL) );
+    CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) );
+    return event;
+ }
+
+template <>
+cl_event caffe_gpu_gemmex<double>(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) {
+    cl_event event;
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    int lda = (TransA == CblasNoTrans) ? K : M;
+    int ldb = (TransB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    //AMDBLAS_CHECK( clAmdBlasSgemmEx(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, NULL) );
+    CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) );
+    return event;
+}
+
 template <>
 void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
     const int N, const float alpha, const float* A, const float* x,
@@ -45,6 +136,42 @@ void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
   cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
+template <>
+void caffe_gpu_gemvv<float>(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const float alpha, const float* A, size_t offA, int lda, 
+    const float* x, size_t offx, const float beta, int incx, 
+    float* y, size_t offy, int incy) {
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA,
+                                  M, N, (cl_float)alpha, (cl_mem)A, offA, lda,
+                                  (cl_mem)x, offx, incx, (cl_float)beta, 
+                                  (cl_mem)y, offy, incy,
+                                  1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+}
+
+template <>
+void caffe_gpu_gemvv<double>(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const double alpha, const double* A, size_t offA, int lda,
+    const double* x, size_t offx, const double beta, int incx,
+    double* y, size_t offy, int incy) {
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, offA, lda, (cl_mem)x, offx, incx, (cl_double)beta, (cl_mem)y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+
+}
+
+
+template <>
+void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const float alpha, const float* A, const float* x,
+    const float beta, float* y) {
+}
+
+template <>
+void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const double alpha, const double* A, const double* x,
+    const double beta, double* y) {
+}
+
 template <>
 void caffe_axpy<float>(const int N, const float alpha, const float* X,
     float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); }
@@ -53,10 +180,22 @@ template <>
 void caffe_axpy<double>(const int N, const double alpha, const double* X,
     double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); }
 
-template <typename Dtype>
-void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
+template <>
+void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
+    float* Y) {
+    CLBLAS_CHECK( clblasSaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) );
+}
+
+template <>
+void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
+    double* Y) {
+    CLBLAS_CHECK( clblasDaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) );
+}
+
+template <>
+void caffe_set(const int N, const float alpha, float* Y) {
   if (alpha == 0) {
-    memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    memset(Y, 0, sizeof(float) * N);
     return;
   }
   for (int i = 0; i < N; ++i) {
@@ -64,9 +203,16 @@ void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
   }
 }
 
-template void caffe_set<int>(const int N, const int alpha, int* Y);
-template void caffe_set<float>(const int N, const float alpha, float* Y);
-template void caffe_set<double>(const int N, const double alpha, double* Y);
+template <>
+void caffe_set(const int N, const double alpha, double* Y) {
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(double) * N);
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
+}
 
 template <>
 void caffe_add_scalar(const int N, const float alpha, float* Y) {
@@ -82,27 +228,26 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) {
   }
 }
 
-template <typename Dtype>
-void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
-  if (X != Y) {
-    if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-      // NOLINT_NEXT_LINE(caffe/alt_fn)
-      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
-#else
-      NO_GPU;
-#endif
-    } else {
-      memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
-    }
-  }
+template <>
+void caffe_copy<float>(const int N, const float* X, float* Y) {
+  cblas_scopy(N, X, 1, Y, 1);
 }
 
-template void caffe_copy<int>(const int N, const int* X, int* Y);
-template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
-    unsigned int* Y);
-template void caffe_copy<float>(const int N, const float* X, float* Y);
-template void caffe_copy<double>(const int N, const double* X, double* Y);
+template <>
+void caffe_copy<double>(const int N, const double* X, double* Y) {
+  cblas_dcopy(N, X, 1, Y, 1);
+}
+
+template <>
+void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
+  CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+
+}
+
+template <>
+void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
+  CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+}
 
 template <>
 void caffe_scal<float>(const int N, const float alpha, float *X) {
@@ -114,6 +259,30 @@ void caffe_scal<double>(const int N, const double alpha, double *X) {
   cblas_dscal(N, alpha, X, 1);
 }
 
+template <>
+void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
+   CLBLAS_CHECK(clblasSscal(N, alpha, (cl_mem)X, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+}
+
+template <>
+void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
+  CLBLAS_CHECK(clblasDscal(N, alpha, (cl_mem)X, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+}
+
+template <>
+void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
+    const float beta, float* Y) {
+  caffe_gpu_scal<float>(N, beta, Y);
+  caffe_gpu_axpy<float>(N, alpha, X, Y);
+}
+
+template <>
+void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
+    const double beta, double* Y) {
+  caffe_gpu_scal<double>(N, beta, Y);
+  caffe_gpu_axpy<double>(N, alpha, X, Y);
+}
+
 template <>
 void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
                             const float beta, float* Y) {
@@ -206,26 +375,6 @@ void caffe_exp<double>(const int n, const double* a, double* y) {
   vdExp(n, a, y);
 }
 
-template <>
-void caffe_log<float>(const int n, const float* a, float* y) {
-  vsLn(n, a, y);
-}
-
-template <>
-void caffe_log<double>(const int n, const double* a, double* y) {
-  vdLn(n, a, y);
-}
-
-template <>
-void caffe_abs<float>(const int n, const float* a, float* y) {
-    vsAbs(n, a, y);
-}
-
-template <>
-void caffe_abs<double>(const int n, const double* a, double* y) {
-    vdAbs(n, a, y);
-}
-
 unsigned int caffe_rng_rand() {
   return (*caffe_rng())();
 }
@@ -253,6 +402,8 @@ void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
   for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();
   }
+
+  //LOG(INFO) << "caffe_rng_uniform";
 }
 
 template
@@ -272,9 +423,11 @@ void caffe_rng_gaussian(const int n, const Dtype a,
   boost::normal_distribution<Dtype> random_distribution(a, sigma);
   boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> >
       variate_generator(caffe_rng(), random_distribution);
+      //variate_generator(37, random_distribution);
   for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();
   }
+  //LOG(INFO) << "caffe_rng_guassian";
 }
 
 template
@@ -297,6 +450,7 @@ void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
   for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();
   }
+  //LOG(INFO) << "caffe_rng_bernoulli";
 }
 
 template
@@ -304,50 +458,31 @@ void caffe_rng_bernoulli<double>(const int n, const double p, int* r);
 
 template
 void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
-
-template <typename Dtype>
-void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_GE(p, 0);
-  CHECK_LE(p, 1);
-  boost::bernoulli_distribution<Dtype> random_distribution(p);
-  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
-      variate_generator(caffe_rng(), random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = static_cast<unsigned int>(variate_generator());
-  }
+//
+template <>
+float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
+  return cblas_sdot(n, x, 1, y, 1);
 }
 
-template
-void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
-
-template
-void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
-
 template <>
-float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
-    const float* y, const int incy) {
-  return cblas_sdot(n, x, incx, y, incy);
+double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
+  return cblas_ddot(n, x, 1, y, 1);
 }
 
 template <>
-double caffe_cpu_strided_dot<double>(const int n, const double* x,
-    const int incx, const double* y, const int incy) {
-  return cblas_ddot(n, x, incx, y, incy);
+void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
+    float* out) {
+  //need to pass in scratchBuff
+  //AMDBLAS_CHECK(clAmdBlasSdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template <typename Dtype>
-Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) {
-  return caffe_cpu_strided_dot(n, x, 1, y, 1);
+template <>
+void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
+    double * out) {
+  //need to pass in scratchBuff
+  //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template
-float caffe_cpu_dot<float>(const int n, const float* x, const float* y);
-
-template
-double caffe_cpu_dot<double>(const int n, const double* x, const double* y);
-
 template <>
 int caffe_cpu_hamming_distance<float>(const int n, const float* x,
                                   const float* y) {
@@ -380,6 +515,18 @@ double caffe_cpu_asum<double>(const int n, const double* x) {
   return cblas_dasum(n, x, 1);
 }
 
+template <>
+void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
+}
+
+template <>
+void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
+}
+
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs);
+
 template <>
 void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
                             float* y) {
@@ -394,4 +541,129 @@ void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
   cblas_dscal(n, alpha, y, 1);
 }
 
+template <>
+void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
+                            float* y) {
+}
+
+template <>
+void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
+                             double* y) {
+}
+
+template <typename Dtype>
+void set_kernel(const int n, const Dtype alpha, Dtype* y) {
+}
+
+template <>
+void caffe_gpu_set(const int N, const float alpha, float* Y) {
+  if (alpha == 0) {
+    return;
+  }
+}
+
+template <>
+void caffe_gpu_set(const int N, const double alpha, double* Y) {
+  if (alpha == 0) {
+    return;
+  }
+}
+
+template <typename Dtype>
+void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) {
+}
+
+template <>
+void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
+}
+
+template <>
+void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
+}
+
+template <typename Dtype>
+void mul_kernel(const int n, const Dtype* a,
+    const Dtype* b, Dtype* y) {
+}
+
+template <>
+void caffe_gpu_mul<float>(const int N, const float* a,
+    const float* b, float* y) {
+}
+
+template <>
+void caffe_gpu_mul<double>(const int N, const double* a,
+    const double* b, double* y) {
+}
+
+template <typename Dtype>
+void div_kernel(const int n, const Dtype* a,
+    const Dtype* b, Dtype* y) {
+}
+
+template <>
+void caffe_gpu_div<float>(const int N, const float* a,
+    const float* b, float* y) {
+}
+
+template <>
+void caffe_gpu_div<double>(const int N, const double* a,
+    const double* b, double* y) {
+}
+
+template <typename Dtype>
+void powx_kernel(const int n, const Dtype* a,
+    const Dtype alpha, Dtype* y) {
+}
+
+template <>
+void caffe_gpu_powx<float>(const int N, const float* a,
+    const float alpha, float* y) {
+}
+
+template <>
+void caffe_gpu_powx<double>(const int N, const double* a,
+    const double alpha, double* y) {
+}
+
+
+void popc_kernel(const int n, const float* a,
+    const float* b, uint8_t* y) {
+}
+
+void popcll_kernel(const int n, const double* a,
+    const double* b, uint8_t* y) {
+}
+
+template <>
+uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
+                                  const float* y) {
+}
+
+template <>
+uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
+                                   const double* y) {
+}
+
+void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
+}
+
+template <>
+void caffe_gpu_rng_uniform<float>(const int n, const float a, const float b,
+                                  float* r) {
+}
+template <>
+void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
+                                   double* r) {
+}
+
+template <>
+void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma,
+                            float* r) {
+}
+
+template <>
+void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
+                            double* r) {
+}
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp.protect b/src/caffe/util/math_functions.cpp.protect
new file mode 100644
index 00000000..166b709a
--- /dev/null
+++ b/src/caffe/util/math_functions.cpp.protect
@@ -0,0 +1,413 @@
+#include <boost/math/special_functions/next.hpp>
+#include <boost/random.hpp>
+
+#include <limits>
+#include <clBLAS.h>
+
+#include "caffe/common.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/util/rng.hpp"
+
+
+namespace caffe {
+
+template<>
+void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
+      ldb, beta, C, N);
+}
+
+template<>
+void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
+      ldb, beta, C, N);
+}
+
+template <>
+void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const float alpha, const float* A, const float* x,
+    const float beta, float* y) {
+  cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+}
+
+template <>
+void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const double alpha, const double* A, const double* x,
+    const double beta, double* y) {
+  cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+}
+
+template <>
+void caffe_axpy<float>(const int N, const float alpha, const float* X,
+    float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); }
+
+template <>
+void caffe_axpy<double>(const int N, const double alpha, const double* X,
+    double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); }
+
+template <typename Dtype>
+void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
+}
+
+template void caffe_set<int>(const int N, const int alpha, int* Y);
+template void caffe_set<float>(const int N, const float alpha, float* Y);
+template void caffe_set<double>(const int N, const double alpha, double* Y);
+
+template <>
+void caffe_add_scalar(const int N, const float alpha, float* Y) {
+  for (int i = 0; i < N; ++i) {
+    Y[i] += alpha;
+  }
+}
+
+template <>
+void caffe_add_scalar(const int N, const double alpha, double* Y) {
+  for (int i = 0; i < N; ++i) {
+    Y[i] += alpha;
+  }
+}
+
+template <typename Dtype>
+void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
+  if (X != Y) {
+    if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+      // NOLINT_NEXT_LINE(caffe/alt_fn)
+      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
+#else
+      NO_GPU;
+#endif
+    } else {
+      memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    }
+  }
+}
+
+template void caffe_copy<int>(const int N, const int* X, int* Y);
+template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
+    unsigned int* Y);
+template void caffe_copy<float>(const int N, const float* X, float* Y);
+template void caffe_copy<double>(const int N, const double* X, double* Y);
+
+template <>
+void caffe_scal<float>(const int N, const float alpha, float *X) {
+  cblas_sscal(N, alpha, X, 1);
+}
+
+template <>
+void caffe_scal<double>(const int N, const double alpha, double *X) {
+  cblas_dscal(N, alpha, X, 1);
+}
+
+template <>
+void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
+                            const float beta, float* Y) {
+  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+}
+
+template <>
+void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
+                             const double beta, double* Y) {
+  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+}
+
+template <>
+void caffe_add<float>(const int n, const float* a, const float* b,
+    float* y) {
+  vsAdd(n, a, b, y);
+}
+
+template <>
+void caffe_add<double>(const int n, const double* a, const double* b,
+    double* y) {
+  vdAdd(n, a, b, y);
+}
+
+template <>
+void caffe_sub<float>(const int n, const float* a, const float* b,
+    float* y) {
+  vsSub(n, a, b, y);
+}
+
+template <>
+void caffe_sub<double>(const int n, const double* a, const double* b,
+    double* y) {
+  vdSub(n, a, b, y);
+}
+
+template <>
+void caffe_mul<float>(const int n, const float* a, const float* b,
+    float* y) {
+  vsMul(n, a, b, y);
+}
+
+template <>
+void caffe_mul<double>(const int n, const double* a, const double* b,
+    double* y) {
+  vdMul(n, a, b, y);
+}
+
+template <>
+void caffe_div<float>(const int n, const float* a, const float* b,
+    float* y) {
+  vsDiv(n, a, b, y);
+}
+
+template <>
+void caffe_div<double>(const int n, const double* a, const double* b,
+    double* y) {
+  vdDiv(n, a, b, y);
+}
+
+template <>
+void caffe_powx<float>(const int n, const float* a, const float b,
+    float* y) {
+  vsPowx(n, a, b, y);
+}
+
+template <>
+void caffe_powx<double>(const int n, const double* a, const double b,
+    double* y) {
+  vdPowx(n, a, b, y);
+}
+
+template <>
+void caffe_sqr<float>(const int n, const float* a, float* y) {
+  vsSqr(n, a, y);
+}
+
+template <>
+void caffe_sqr<double>(const int n, const double* a, double* y) {
+  vdSqr(n, a, y);
+}
+
+template <>
+void caffe_exp<float>(const int n, const float* a, float* y) {
+  vsExp(n, a, y);
+}
+
+template <>
+void caffe_exp<double>(const int n, const double* a, double* y) {
+  vdExp(n, a, y);
+}
+
+template <>
+void caffe_log<float>(const int n, const float* a, float* y) {
+  vsLn(n, a, y);
+}
+
+template <>
+void caffe_log<double>(const int n, const double* a, double* y) {
+  vdLn(n, a, y);
+}
+
+template <>
+void caffe_abs<float>(const int n, const float* a, float* y) {
+    vsAbs(n, a, y);
+}
+
+template <>
+void caffe_abs<double>(const int n, const double* a, double* y) {
+    vdAbs(n, a, y);
+}
+
+unsigned int caffe_rng_rand() {
+  return (*caffe_rng())();
+}
+
+template <typename Dtype>
+Dtype caffe_nextafter(const Dtype b) {
+  return boost::math::nextafter<Dtype>(
+      b, std::numeric_limits<Dtype>::max());
+}
+
+template
+float caffe_nextafter(const float b);
+
+template
+double caffe_nextafter(const double b);
+
+template <typename Dtype>
+void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_LE(a, b);
+  boost::uniform_real<Dtype> random_distribution(a, caffe_nextafter<Dtype>(b));
+  boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> >
+      variate_generator(caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
+}
+
+template
+void caffe_rng_uniform<float>(const int n, const float a, const float b,
+                              float* r);
+
+template
+void caffe_rng_uniform<double>(const int n, const double a, const double b,
+                               double* r);
+
+template <typename Dtype>
+void caffe_rng_gaussian(const int n, const Dtype a,
+                        const Dtype sigma, Dtype* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GT(sigma, 0);
+  boost::normal_distribution<Dtype> random_distribution(a, sigma);
+  boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> >
+      variate_generator(caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
+}
+
+template
+void caffe_rng_gaussian<float>(const int n, const float mu,
+                               const float sigma, float* r);
+
+template
+void caffe_rng_gaussian<double>(const int n, const double mu,
+                                const double sigma, double* r);
+
+template <typename Dtype>
+void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
+  boost::bernoulli_distribution<Dtype> random_distribution(p);
+  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
+      variate_generator(caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
+}
+
+template
+void caffe_rng_bernoulli<double>(const int n, const double p, int* r);
+
+template
+void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
+
+template <typename Dtype>
+void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
+  boost::bernoulli_distribution<Dtype> random_distribution(p);
+  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
+      variate_generator(caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = static_cast<unsigned int>(variate_generator());
+  }
+}
+
+template
+void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
+
+template
+void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
+
+template <>
+float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
+    const float* y, const int incy) {
+  return cblas_sdot(n, x, incx, y, incy);
+}
+
+template <>
+double caffe_cpu_strided_dot<double>(const int n, const double* x,
+    const int incx, const double* y, const int incy) {
+  return cblas_ddot(n, x, incx, y, incy);
+}
+
+template <typename Dtype>
+Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) {
+  return caffe_cpu_strided_dot(n, x, 1, y, 1);
+}
+
+template
+float caffe_cpu_dot<float>(const int n, const float* x, const float* y);
+
+template
+double caffe_cpu_dot<double>(const int n, const double* x, const double* y);
+
+template <>
+int caffe_cpu_hamming_distance<float>(const int n, const float* x,
+                                  const float* y) {
+  int dist = 0;
+  for (int i = 0; i < n; ++i) {
+    dist += __builtin_popcount(static_cast<uint32_t>(x[i]) ^
+                               static_cast<uint32_t>(y[i]));
+  }
+  return dist;
+}
+
+template <>
+int caffe_cpu_hamming_distance<double>(const int n, const double* x,
+                                   const double* y) {
+  int dist = 0;
+  for (int i = 0; i < n; ++i) {
+    dist += __builtin_popcountl(static_cast<uint64_t>(x[i]) ^
+                                static_cast<uint64_t>(y[i]));
+  }
+  return dist;
+}
+
+template <>
+float caffe_cpu_asum<float>(const int n, const float* x) {
+  return cblas_sasum(n, x, 1);
+}
+
+template <>
+double caffe_cpu_asum<double>(const int n, const double* x) {
+  return cblas_dasum(n, x, 1);
+}
+
+template <>
+void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
+                            float* y) {
+  cblas_scopy(n, x, 1, y, 1);
+  cblas_sscal(n, alpha, y, 1);
+}
+
+template <>
+void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
+                             double* y) {
+  cblas_dcopy(n, x, 1, y, 1);
+  cblas_dscal(n, alpha, y, 1);
+}
+
+template <>
+void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    int lda = (TransA == CblasNoTrans) ? K : M;
+    int ldb = (TransB == CblasNoTrans) ? N : K;
+    int ldc = N;
+    //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+    CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+}
+
+}  // namespace caffe

From 3965d0c242d9754e594054ffe784996ca08a51cd Mon Sep 17 00:00:00 2001
From: Yibing <yibing.liu@amd.com>
Date: Sat, 11 Jul 2015 14:14:07 +0800
Subject: [PATCH 002/124] Synced memory changes

---
 include/caffe/blob.hpp               |    1 +
 include/caffe/syncedmem.hpp          |   18 +-
 src/caffe/OCL_kernel.cl              | 1416 ++++++++++++++++++++++++++
 src/caffe/blob.cpp                   |    6 +
 src/caffe/common.cpp                 |    5 +
 src/caffe/layers/base_data_layer.cpp |   20 +
 src/caffe/syncedmem.cpp              |   88 +-
 7 files changed, 1531 insertions(+), 23 deletions(-)
 create mode 100644 src/caffe/OCL_kernel.cl

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index 472cc184..160539aa 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -220,6 +220,7 @@ class Blob {
   const Dtype* cpu_data() const;
   void set_cpu_data(Dtype* data);
   const Dtype* gpu_data() const;
+  const Dtype* gpu_cache_data() const;
   const Dtype* cpu_diff() const;
   const Dtype* gpu_diff() const;
   Dtype* mutable_cpu_data();
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 1b726de9..0bcad1dc 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -42,29 +42,41 @@ class SyncedMemory {
  public:
   SyncedMemory()
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-        own_cpu_data_(false) {}
+        own_cpu_data_(false), is_data_layer_(false) {
+        ocl_setup();
+        }
   explicit SyncedMemory(size_t size)
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
-        own_cpu_data_(false) {}
+        own_cpu_data_(false), data_layer_(false) {
+        ocl_setup();
+        }
+
   ~SyncedMemory();
   const void* cpu_data();
   void set_cpu_data(void* data);
   const void* gpu_data();
+  //const void* gpu_cache_data();
   void* mutable_cpu_data();
   void* mutable_gpu_data();
   enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
   SyncedHead head() { return head_; }
   size_t size() { return size_; }
+  void set_data_layer(){ data_layer_ = true; }
+ private:
+   void ocl_setup();
+ protected:
+   cl_kernel oclmem_kernel;
 
  private:
   void to_cpu();
   void to_gpu();
   void* cpu_ptr_;
   void* gpu_ptr_;
+  void* gpu_cache_ptr_;
   size_t size_;
   SyncedHead head_;
   bool own_cpu_data_;
-
+  bool data_layer_;
   DISABLE_COPY_AND_ASSIGN(SyncedMemory);
 };  // class SyncedMemory
 
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
new file mode 100644
index 00000000..980dc37c
--- /dev/null
+++ b/src/caffe/OCL_kernel.cl
@@ -0,0 +1,1416 @@
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+//beginning of the looooooong gpu_random_generator kernel 
+//we use the open sourced threefry's GPU implementation
+typedef uint uint32_t;
+
+struct r123array4x32 {	uint32_t v[4]; };
+
+enum r123_enum_threefry32x4 
+{
+	R_32x4_0_0 = 10, R_32x4_0_1 = 26,
+	R_32x4_1_0 = 11, R_32x4_1_1 = 21,
+	R_32x4_2_0 = 13, R_32x4_2_1 = 27,
+	R_32x4_3_0 = 23, R_32x4_3_1 =  5,
+	R_32x4_4_0 =  6, R_32x4_4_1 = 20,
+	R_32x4_5_0 = 17, R_32x4_5_1 = 11,
+	R_32x4_6_0 = 25, R_32x4_6_1 = 10,
+	R_32x4_7_0 = 18, R_32x4_7_1 = 20
+};
+
+inline uint32_t	RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline));
+inline uint32_t RotL_32(uint32_t x, unsigned int N)
+{
+	return (x << (N & 31)) | (x >> ((32 - N) & 31));
+}
+
+typedef struct r123array4x32 threefry4x32_ctr_t;
+typedef struct r123array4x32 threefry4x32_key_t;
+typedef struct r123array4x32 threefry4x32_ukey_t;
+
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline));
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)
+{
+	threefry4x32_ctr_t	X;
+	uint32_t			ks[4 + 1];
+	int					i;
+	ks[4] = 0x1BD11BDA;
+	/*
+	for (i = 0; i < 4; i++)
+	{
+		ks[i] = k.v[i];
+		X.v[i] = in.v[i];
+		ks[4] ^= k.v[i];
+	}*/ 
+	{
+		ks[0] = k.v[0];
+		X.v[0] = in.v[0];
+		ks[4] ^= k.v[0];
+
+		ks[1] = k.v[1];
+		X.v[1] = in.v[1];
+		ks[4] ^= k.v[1];
+
+		ks[2] = k.v[2];
+		X.v[2] = in.v[2];
+		ks[4] ^= k.v[2];
+
+		ks[3] = k.v[3];
+		X.v[3] = in.v[3];
+		ks[4] ^= k.v[3];
+	}
+	X.v[0] += ks[0];
+	X.v[1] += ks[1];
+	X.v[2] += ks[2];
+	X.v[3] += ks[3];
+	if (Nrounds > 0) 
+	{
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 1) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 2) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 3) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 3) {
+		X.v[0] += ks[1];
+		X.v[1] += ks[2];
+		X.v[2] += ks[3];
+		X.v[3] += ks[4];
+		X.v[4 - 1] += 1;
+	} if (Nrounds > 4) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 5) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 6) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 7) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 7) {
+		X.v[0] += ks[2];
+		X.v[1] += ks[3];
+		X.v[2] += ks[4];
+		X.v[3] += ks[0];
+		X.v[4 - 1] += 2;
+	} if (Nrounds > 8) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 9) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 10) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 11) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 11) {
+		X.v[0] += ks[3];
+		X.v[1] += ks[4];
+		X.v[2] += ks[0];
+		X.v[3] += ks[1];
+		X.v[4 - 1] += 3;
+	} if (Nrounds > 12) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 13) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 14) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 15) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 15) {
+		X.v[0] += ks[4];
+		X.v[1] += ks[0];
+		X.v[2] += ks[1];
+		X.v[3] += ks[2];
+		X.v[4 - 1] += 4;
+	} if (Nrounds > 16) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 17) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 18) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 19) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 19) {
+		X.v[0] += ks[0];
+		X.v[1] += ks[1];
+		X.v[2] += ks[2];
+		X.v[3] += ks[3];
+		X.v[4 - 1] += 5;
+	} if (Nrounds > 20) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 21) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 22) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 23) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 23) {
+		X.v[0] += ks[1];
+		X.v[1] += ks[2];
+		X.v[2] += ks[3];
+		X.v[3] += ks[4];
+		X.v[4 - 1] += 6;
+	} if (Nrounds > 24) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 25) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 26) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 27) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 27) {
+		X.v[0] += ks[2];
+		X.v[1] += ks[3];
+		X.v[2] += ks[4];
+		X.v[3] += ks[0];
+		X.v[4 - 1] += 7;
+	} if (Nrounds > 28) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 29) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 30) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 31) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 31) {
+		X.v[0] += ks[3];
+		X.v[1] += ks[4];
+		X.v[2] += ks[0];
+		X.v[3] += ks[1];
+		X.v[4 - 1] += 8;
+	} if (Nrounds > 32) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 33) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 34) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 35) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 35) {
+		X.v[0] += ks[4];
+		X.v[1] += ks[0];
+		X.v[2] += ks[1];
+		X.v[3] += ks[2];
+		X.v[4 - 1] += 9;
+	} if (Nrounds > 36) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 37) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 38) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 39) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 39) {
+		X.v[0] += ks[0];
+		X.v[1] += ks[1];
+		X.v[2] += ks[2];
+		X.v[3] += ks[3];
+		X.v[4 - 1] += 10;
+	} if (Nrounds > 40) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 41) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 42) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 43) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 43) {
+		X.v[0] += ks[1];
+		X.v[1] += ks[2];
+		X.v[2] += ks[3];
+		X.v[3] += ks[4];
+		X.v[4 - 1] += 11;
+	} if (Nrounds > 44) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 45) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 46) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 47) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 47) {
+		X.v[0] += ks[2];
+		X.v[1] += ks[3];
+		X.v[2] += ks[4];
+		X.v[3] += ks[0];
+		X.v[4 - 1] += 12;
+	} if (Nrounds > 48) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 49) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 50) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 51) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 51) {
+		X.v[0] += ks[3];
+		X.v[1] += ks[4];
+		X.v[2] += ks[0];
+		X.v[3] += ks[1];
+		X.v[4 - 1] += 13;
+	} if (Nrounds > 52) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 53) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 54) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 55) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 55) {
+		X.v[0] += ks[4];
+		X.v[1] += ks[0];
+		X.v[2] += ks[1];
+		X.v[3] += ks[2];
+		X.v[4 - 1] += 14;
+	} if (Nrounds > 56) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 57) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 58) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 59) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 59) {
+		X.v[0] += ks[0];
+		X.v[1] += ks[1];
+		X.v[2] += ks[2];
+		X.v[3] += ks[3];
+		X.v[4 - 1] += 15;
+	} if (Nrounds > 60) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 61) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 62) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 63) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 63) {
+		X.v[0] += ks[1];
+		X.v[1] += ks[2];
+		X.v[2] += ks[3];
+		X.v[3] += ks[4];
+		X.v[4 - 1] += 16;
+	} if (Nrounds > 64) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 65) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 66) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 67) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 67) {
+		X.v[0] += ks[2];
+		X.v[1] += ks[3];
+		X.v[2] += ks[4];
+		X.v[3] += ks[0];
+		X.v[4 - 1] += 17;
+	} if (Nrounds > 68) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 69) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 70) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 71) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 71) {
+		X.v[0] += ks[3];
+		X.v[1] += ks[4];
+		X.v[2] += ks[0];
+		X.v[3] += ks[1];
+		X.v[4 - 1] += 18;
+	} 
+	return X;
+} 
+
+template <class T>
+__kernel void PRNG_threefry4x32(
+        __global uint4 *randomnumber,
+        threefry4x32_ctr_t ctr_i,
+        T inf,
+        T sup,
+        T threshold,
+        uint nrounds,
+        uint numrandom
+){
+        size_t  gdx = get_global_id(0);
+
+        uint maxUint = 0;
+        maxUint--;
+        float r = (float)maxUint;
+
+        threefry4x32_ctr_t      ctr = ctr_i; 
+        threefry4x32_ukey_t ukey;
+
+        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+        threefry4x32_ctr_t  random4;
+
+        if ( gdx < numrandom )
+        {
+                random4 = threefry4x32_R(nrounds, ctr, ukey);
+                uint4 frnd;
+				
+                frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+				
+                randomnumber[gdx] = frnd;
+        }
+}
+
+
+template __attribute__((mangled_name(RNGBernoulliFloat))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
+
+template __attribute__((mangled_name(RNGBernoulliDouble))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
+
+//end of the looooooong gpu_random_generator kernel 
+
+
+template <class T>
+__kernel void OCL_memset(__global T* buffer, const T value, const int size){
+	int gdx = get_global_id(0);
+	if(gdx < size){
+		buffer[gdx] = value;	
+	}
+}
+
+template __attribute__((mangled_name(oclmemfloat))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
+template __attribute__((mangled_name(oclmemdouble))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
+
+__kernel void OCL_memset2(__global int* buffer, const int value, const int size){
+        int gdx = get_global_id(0);
+        if(gdx < size){
+                buffer[gdx] = value;    
+        }
+}
+
+template <class T>
+__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){
+    int index=get_global_id(0);
+    data_im = data_im + img_offset;
+    data_col =  data_col + col_offset;
+    if(index < n){
+        int w_out=index %width_col;
+        index /= width_col;
+        int h_out=index%height_col;
+        int channel_in = index/height_col;
+        int channel_out=channel_in *ksize *ksize;
+        int h_in = h_out *stride-pad;
+        int w_in = w_out *stride-pad;
+        data_col +=(channel_out *height_col + h_out) *width_col + w_out;
+        data_im +=(channel_in * height + h_in) *width + w_in;
+        int i=0,j=0;
+        for(i=0;i<ksize;++i){
+            for(j=0;j<ksize;++j){
+                int h = h_in+i;
+                int w = w_in+j;
+                if(h >= 0 && w >= 0 && h < height && w < width)
+                    *data_col=data_im[i * width + j];
+                else *data_col=0;
+                data_col +=height_col *width_col;
+            }
+        }
+    }
+}
+
+template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); 
+template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); 
+
+template <class T>
+__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum){
+
+    int index = get_global_id(0);
+
+    data_im = data_im + img_offset;
+    data_col = data_col + col_offset;
+
+    int x_out = index % width_col;
+    int y_out = (index / width_col) % height_col;
+    int channel_in = (index / width_col / height_col) % channels;
+    int channel_out = channel_in * ksize * ksize;
+    int im_id = index / width_col / height_col / channels;
+
+    int y_in = y_out * stride - pad;
+    int x_in = x_out * stride - pad;
+    int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col;
+    int offset_im = im_id * channels * height * width + channel_in * height * width;
+
+    for(int k_h = 0; k_h < ksize; k_h++){
+        for(int k_w = 0; k_w < ksize; k_w++){
+            int x_im = x_in + k_w;
+            int y_im = y_in + k_h;
+            int index_im = y_im * width + x_im;
+            int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
+            if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
+                data_col[offset_col + index_col] = data_im[offset_im + index_im];
+            else
+                data_col[offset_col + index_col] = 0;
+        }
+    }
+}
+
+template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); 
+template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); 
+
+
+template <class T>
+__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){
+    int index = get_global_id(0);
+    data_col = data_col + col_offset;
+    data_im = data_im + img_offset;
+    if(index < n){
+      T val = 0;
+      int w = index % width + pad;
+      int h = (index / width) % height + pad;
+      int c = index / (width * height);
+      // compute the start and end of the output
+      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+      int w_col_end = min(w / stride + 1, width_col);
+      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+      int h_col_end = min(h / stride + 1, height_col);
+      // equivalent implementation
+      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
+      int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
+      int coeff_w_col = (1 - stride * height_col * width_col);
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+      data_im[index] = val;
+  }
+}
+template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); 
+template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); 
+
+template <class T>
+__kernel void im2col_yuan(const int n,__global T* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index;index<n;index+=tmp){
+        int w_out=index %width_col;
+        index /= width_col;
+        int h_out=index%height_col;
+        int channel_in = index/height_col;
+        int channel_out=channel_in *ksize *ksize;
+        int h_in = h_out *stride-pad;
+        int w_in = w_out *stride-pad;
+        data_col +=(channel_out *height_col + h_out) *width_col + w_out;
+        data_im +=(channel_in * height + h_in) *width + w_in;
+        int i=0,j=0;
+        for(i=0;i<ksize;++i){
+            for(j=0;j<ksize;++j){
+                int h = h_in+i;
+                int w = w_in+j;
+                if(h >= 0 && w >= 0 && h < height && w < width)
+                    *data_col=data_im[i * width + j];
+                else *data_col=0;
+                data_col += height_col *width_col;
+            }
+        }
+    }
+}
+
+template __attribute__((mangled_name(im2colfloat_yuan))) __kernel void im2col_yuan(const int n,__global float* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col); 
+template __attribute__((mangled_name(im2coldouble_yuan))) __kernel void im2col_yuan(const int n,__global double* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col); 
+
+template <class T>
+__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){
+    int index = get_global_id(0);
+    data_col = data_col + col_offset;
+    data_im = data_im + img_offset;
+    if(index < n){
+      T val = 0;
+      int w = index % width + pad;
+      int h = (index / width) % height + pad;
+      int c = index / (width * height) % channels;
+      int im = index / width / height / channels;
+      // compute the start and end of the output
+      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+      int w_col_end = min(w / stride + 1, width_col);
+      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+      int h_col_end = min(h / stride + 1, height_col);
+      // equivalent implementation
+      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
+      int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
+      int coeff_w_col = (1 - stride * height_col * width_col * optnum);
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+      data_im[index] = val;
+  }
+}
+template __attribute__((mangled_name(col2im_optfloat))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); 
+template __attribute__((mangled_name(col2im_optdouble))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); 
+
+
+template <class T>
+__kernel void col2im_yuan(const int n,__global T* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index; index < n; index += tmp){
+      T val = 0;
+      int w = index % width + pad;
+      int h = (index / width) % height + pad;
+      int c = index / (width * height);
+      // compute the start and end of the output
+      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+      int w_col_end = min(w / stride + 1, width_col);
+      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+      int h_col_end = min(h / stride + 1, height_col);
+      // equivalent implementation
+      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
+      int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
+      int coeff_w_col = (1 - stride * height_col * width_col);
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+      data_im[index] = val;
+  }
+}
+template __attribute__((mangled_name(col2imfloat_yuan))) __kernel void col2im_yuan(const int n,__global float* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im); 
+template __attribute__((mangled_name(col2imdouble_yuan))) __kernel void col2im_yuan(const int n,__global double* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im); 
+
+template <class T>
+__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){
+
+    int index = get_global_id(0);
+    data_opt = data_opt + opt_offset;
+    data_im = data_im + im_offset;
+    if(index < n){
+      int w = index % width;
+      int h = (index / width) % height;
+      int c = index / (width * height) % channels;
+      int im = index / width / height / channels;
+
+      int opt_index = c * height * optnum * width + h * optnum * width + im * width + w;
+      data_opt[opt_index] = data_im[index];
+    }
+}
+template __attribute__((mangled_name(opttransfloat))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); 
+template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); 
+
+
+template <class T>
+__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global T* top_data){
+     int index = get_global_id(0);
+     int tmp = get_global_size(0);
+     for(index; index < nthreads; index += tmp){
+         int pw = index % pooled_width;
+         int ph = (index / pooled_width) % pooled_height;
+         int c = (index / pooled_width / pooled_height) % channels;
+         int n = index / pooled_width / pooled_height / channels;
+         int hstart = ph * stride;
+         int hend = min(hstart + kernel_size, height);
+         int wstart = pw * stride;
+         int wend = min(wstart + kernel_size, width);
+         T maxval = -99999999;
+         bottom_data += (n * channels + c) * height * width;
+         for (int h = hstart; h < hend; ++h) {
+           for (int w = wstart; w < wend; ++w) {
+             maxval = max(maxval, bottom_data[h * width + w]);
+           }   
+         }
+         top_data[index] = maxval;
+     }
+
+}
+template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global float* top_data);
+template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_size, const int stride, __global double* top_data);
+
+
+template <class T>
+__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global T* top_data){
+    int index=get_global_id(0);
+    int tmp=get_global_size(0);
+    for(index;index<nthreads;index+=tmp){
+        int pw = index % pooled_width;
+        int ph = (index / pooled_width) % pooled_height;
+        int c = (index / pooled_width / pooled_height) % channels;
+        int n = index / pooled_width / pooled_height / channels;
+        int hstart = ph * stride - pad;
+        int wstart = pw * stride - pad;
+        int hend = min(hstart + kernel_size, height + pad);
+        int wend = min(wstart + kernel_size, width + pad);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        hstart = max(hstart, 0);
+        wstart = max(wstart, 0);
+        hend = min(hend, height);
+        wend = min(wend, width);
+        T aveval = 0;
+        bottom_data += (n * channels + c) * height * width;
+        for (int h = hstart; h < hend; ++h) {
+          for (int w = wstart; w < wend; ++w) {
+            aveval += bottom_data[h * width + w];
+          }
+        }
+        top_data[index] = aveval / pool_size;
+    }
+
+}
+template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global float* top_data);
+template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_size, const int stride, const int pad, __global double* top_data);
+
+template <class T>
+__kernel void MaxPoolBackward(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* top_diff,
+const int num, const int channels, const int height,
+const int width, const int pooled_height, const int pooled_width,
+const int kernel_size, const int stride, __global T* bottom_diff){
+    int index = get_global_id(0);
+    int total = get_global_size(0);
+    for(index; index < nthreads; index += total){
+        // find out the local index
+        // find out the local offset
+        int w = index % width;
+        int h = (index / width) % height;
+        int c = (index / width / height) % channels;
+        int n = index / width / height / channels;
+        int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1;
+        int phend = min(h / stride + 1, pooled_height);
+        int pwstart = (w < kernel_size) ? 0 : (w - kernel_size) / stride + 1;
+        int pwend = min(w / stride + 1, pooled_width);
+        T gradient = 0;
+        T bottom_datum =
+            bottom_data[((n * channels + c) * height + h) * width + w];
+        top_data += (n * channels + c) * pooled_height * pooled_width;
+        top_diff += (n * channels + c) * pooled_height * pooled_width;
+        for (int ph = phstart; ph < phend; ++ph) {
+            for (int pw = pwstart; pw < pwend; ++pw) {
+                gradient += top_diff[ph * pooled_width + pw] *
+                    (bottom_datum == top_data[ph * pooled_width + pw]);
+            }
+        }
+        bottom_diff[index] = gradient;
+
+    }
+
+}
+template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global float* bottom_diff);
+template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global double* bottom_diff);
+
+
+template <class T>
+__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global T* bottom_diff){
+     int index = get_global_id(0);
+     int total = get_global_size(0);
+     for(index; index < nthreads; index += total){
+	    int w = index % width + pad;
+	    int h = (index / width) % height + pad;
+	    int c = (index / width / height) % channels;
+	    int n = index / width / height / channels;
+	    int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1;
+	    int phend = min(h / stride + 1, pooled_height);
+	    int pwstart = (w < kernel_size) ? 0 : (w - kernel_size) / stride + 1;
+	    int pwend = min(w / stride + 1, pooled_width);
+	    T gradient = 0;
+	    top_diff += (n * channels + c) * pooled_height * pooled_width;
+	    for (int ph = phstart; ph < phend; ++ph) {
+	      for (int pw = pwstart; pw < pwend; ++pw) {
+		// figure out the pooling size
+		int hstart = ph * stride - pad;
+		int wstart = pw * stride - pad;
+		int hend = min(hstart + kernel_size, height + pad);
+		int wend = min(wstart + kernel_size, width + pad);
+		int pool_size = (hend - hstart) * (wend - wstart);
+           gradient += top_diff[ph * pooled_width + pw] / pool_size;
+      }
+    }
+    bottom_diff[index] = gradient;
+
+   }
+}
+
+template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global float* bottom_diff);
+template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_size, const int stride, const int pad, __global double* bottom_diff);
+
+template <class T>
+__kernel void ReLUForward(const int count, __global T* in, __global T* out){
+	int index = get_global_id(0);
+	if(index < count)
+		out[index] = in[index] > 0? in[index]:0;
+}
+
+//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out);
+template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out);
+template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out);
+
+template <class T>
+__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff){
+	int index = get_global_id(0);
+        if(index < count)
+		out_diff[index] = in_diff[index] * (in_data[index] > 0);
+}
+
+template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff);
+template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff);
+
+template <class T>
+__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
+     int index = get_global_id(0);
+     if (index < num) {
+	T maxval = -FLT_MAX;
+        for (int i = 0; i <  dim; i++)
+	maxval = max( data[index*dim + i], maxval );
+        out[index] = maxval;
+      }
+}
+
+template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out);
+template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out);
+
+template <class T>
+__kernel void exp (const int num, __global T* data, __global T* out){
+        int index = get_global_id(0);
+        if (index < num) 
+        out[index] = exp(data[index]);
+}
+
+template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out);
+template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out);
+
+template <class T>
+__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data){
+        //printf("softmax_div\n");
+        int index = get_global_id(0);
+        int total = get_global_size(0);
+        for(index; index < num*dim; index +=  total){
+        int n = index / dim;
+        data[index] /= scale[n];
+        }
+}
+
+template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data);
+template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* scale, __global double* data);
+
+template <class T>
+__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){
+    
+    int gid = get_global_id(0);
+    int size = get_global_size(0);
+    
+    resultScratch[gid] = 0.0;
+    for(int i = gid; i < num; i += size){
+    	resultScratch[gid] += -log(prob_data[i * dim + static_cast<int>(label[i])]);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    if(gid < 128)
+    	resultScratch[gid] += resultScratch[gid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(gid < 64)
+    	resultScratch[gid] += resultScratch[gid + 64];
+    if(gid < 32)
+    	resultScratch[gid] += resultScratch[gid + 32];
+    if(gid < 16)
+    	resultScratch[gid] += resultScratch[gid + 16];
+    if(gid < 8)
+    	resultScratch[gid] += resultScratch[gid + 8];
+    if(gid < 4)
+    	resultScratch[gid] += resultScratch[gid + 4];
+    if(gid < 2)
+    	resultScratch[gid] += resultScratch[gid + 2];
+    if(gid < 1){
+    	resultScratch[gid] += resultScratch[gid + 1];
+    	loss[0] = resultScratch[gid];
+    }
+
+}
+
+template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
+template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
+
+
+template <class T>
+__kernel void diff (const int num, const int dim, __global T* data, __global T* label){
+        int index = get_global_id(0);
+        int total = get_global_size(0);
+        int offset;
+	for(index; index < num; index +=  total){
+  	offset = (int) label[index];
+        data[index * dim + offset] -= 1;
+        }
+}
+
+template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label);
+template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label);
+
+template <class T>
+__kernel void scal (const int num, const T alpha, __global T* data){
+        int index = get_global_id(0);
+        int total = get_global_size(0);
+        for(index; index < num; index +=  total){
+        data[index] = data[index] * alpha;
+        }
+}
+
+template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha,  __global float* data);
+template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha,  __global double* data);
+
+template <class T>
+__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){
+	int index = get_global_id(0);
+        if (index < n)
+        y[index] = a[index] / b[index];
+}
+
+template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y);
+//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y);
+
+template <class T>
+__kernel void add_scalar (const int n, const T alpha, __global T* y){
+        int index = get_global_id(0);
+        if (index < n)
+        y[index] += alpha;
+}
+
+template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y);
+template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y);
+
+template <class T>
+__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){
+        int index = get_global_id(0);
+       if (index < n)
+        y[index] = a[index] * b[index];
+}
+
+template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y);
+template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y);
+
+
+template <class T>
+__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){
+        int index = get_global_id(0);
+        if (index < n)
+//           y[index] = a[index] + alpha;
+           y[index] = pow(a[index], alpha);
+}
+
+template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); 
+template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); 
+
+template <class T>
+__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){
+    int index = get_global_id(0);
+    if (index < n)
+        out[index] = in[index] * scale * mask[index];
+}
+template __attribute__((mangled_name(DropoutForwardfloat))) __kernel void DropoutForward(const int n, __global float* in,  __global const int* mask, const float scale, __global float* out); 
+template __attribute__((mangled_name(DropoutForwarddouble))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out);
+
+
+template <class T>
+__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff){
+    int index = get_global_id(0);
+    if (index < n)
+        out_diff[index] = in_diff[index] * scale * mask[index];
+}
+template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void DropoutBackward(const int n, __global float* in_diff,  __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); 
+template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
+
+template <class T>
+__kernel void LRNFillScale(const int nthreads, __global const T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, __global T* scale) {
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    // find out the local offset
+    int w = index % width;
+    int h = (index / width) % height;
+    int n = index / width / height;
+    int offset = (n * channels * height + h) * width + w;
+    int step = height * width;
+    in += offset;
+    scale += offset;
+    int head = 0;
+    int pre_pad = (size - 1) / 2;
+    int post_pad = size - pre_pad - 1;
+    T accum_scale = 0;
+    // fill the scale at [n, :, h, w]
+    // accumulate values
+    while (head < post_pad) {
+      accum_scale += in[head * step] * in[head * step];
+      ++head;
+    }
+    // until we reach size, nothing needs to be subtracted
+    while (head < size) {
+      accum_scale += in[head * step] * in[head * step];
+      scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size;
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_scale += in[head * step] * in[head * step];
+      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
+      scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
+      scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size;
+      ++head;
+    }
+  }
+}
+template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global const float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, __global float* scale);
+template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global const double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, __global double* scale);
+
+template <class T>
+__kernel void LRNComputeOutput(const int nthreads, __global const T* in, __global const T* scale, const T negative_beta, __global T* out) {
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) 
+    out[index] = in[index] * pow(scale[index], negative_beta);
+}
+template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global const float* in, __global const float* scale, const float negative_beta, __global float* out);
+template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global const double* in, __global const double* scale, const double negative_beta, __global double* out);
+
+template <class T>
+__kernel void LRNComputeDiff(const int nthreads, __global const T* bottom_data, __global const T* top_data, __global const T* scale, __global const T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) {
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    int w = index % width;
+    int h = (index / width) % height;
+    int n = index / width / height;
+    int offset = (n * channels * height + h) * width + w;
+    int step = height * width;
+    bottom_data += offset;
+    top_data += offset;
+    scale += offset;
+    top_diff += offset;
+    bottom_diff += offset;
+    int head = 0;
+    int pre_pad = size - (size + 1) / 2;
+    int post_pad = size - pre_pad - 1;
+    T accum_ratio = 0;
+    // accumulate values
+    while (head < post_pad) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      ++head;
+    }
+    // until we reach size, nothing needs to be subtracted
+    while (head < size) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      accum_ratio -= top_diff[(head - size) * step] *
+          top_data[(head - size) * step] / scale[(head - size) * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      accum_ratio -= top_diff[(head - size) * step] *
+          top_data[(head - size) * step] / scale[(head - size) * step];
+      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
+          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
+          bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+  }
+}
+
+template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global const float* bottom_data, __global const float* top_data, __global const float* scale, __global const float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
+template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global const double* bottom_data, __global const double* top_data, __global const double* scale, __global const double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff);
+
+template <class T>
+__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){
+     int gidx = get_global_id(0);
+     int gidy = get_global_id(1);
+     int gidyy = gidy;
+     int index = gidy / height;
+     int offset = index * width * height;
+     gidy = gidy % height;
+     if( gidx < width && gidyy < height * optnum )
+         dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
+}
+template __attribute__((mangled_name(transposefloat))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); 
+template __attribute__((mangled_name(transposedouble))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
+
+template <class T>
+__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){
+     int gidx = get_global_id(0);
+     int index;
+     index = (optnum==1) ? 0: gidx % optnum;
+     dst = dst + top_offset; // now we point at (*top)[n]
+     int offset = gidx / optnum;
+     int i = 0;
+     for(i = 0 ; i < width; i++)
+         dst[(index * height + offset)* width + i] = src[gidx * width + i];
+}
+template __attribute__((mangled_name(transformfloat))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); 
+template __attribute__((mangled_name(transformdouble))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); 
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 94fdcc35..4cec89ae 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -85,6 +85,12 @@ const Dtype* Blob<Dtype>::gpu_data() const {
   return (const Dtype*)data_->gpu_data();
 }
 
+template <typename Dtype>
+const Dtype* Blob<Dtype>::gpu_cache_data() const {
+  CHECK(data_);
+  return (const Dtype*)data_->gpu_cache_data();
+}
+
 template <typename Dtype>
 const Dtype* Blob<Dtype>::cpu_diff() const {
   CHECK(diff_);
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index e53a5c0d..052281d4 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -11,6 +11,8 @@ shared_ptr<Caffe> Caffe::singleton_;
 
 // random seeding
 int64_t cluster_seedgen(void) {
+ //To fix: for now we use fixed seed to get same result each time
+  /*
   int64_t s, seed, pid;
   FILE* f = fopen("/dev/urandom", "rb");
   if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) {
@@ -27,6 +29,9 @@ int64_t cluster_seedgen(void) {
   s = time(NULL);
   seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729);
   return seed;
+  */
+  LOG(WARNING) << "return fixed seed 37";
+  return 37;
 }
 
 
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 26a11182..fa4fe30f 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -78,6 +78,26 @@ void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
   DLOG(INFO) << "CreatePrefetchThread";
   CreatePrefetchThread();
 }
+template <typename Dtype>
+Dtype DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      vector<Blob<Dtype>*>* top) {
+  // First, join the thread
+  JoinPrefetchThread();
+  // Copy the data from prefetch thread to data_layer
+   //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) );
+   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)(*top)[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_->count(), prefetch_data_->cpu_data(), 0, NULL, NULL) );
+  if (output_labels_) {
+   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)(*top)[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_->count(), prefetch_label_->cpu_data(), 0, NULL, NULL) );
+   //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) );
+   }
+  clFinish(amdDevice.CommandQueue);
+#ifdef Track_data_transfer
+#endif
+  // Start a new prefetch thread
+  DLOG(INFO) << "CreatePrefetchThread";
+  CreatePrefetchThread();
+  return Dtype(0.);
+}
 
 #ifdef CPU_ONLY
 STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward);
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 200ca657..ce11aa03 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -4,9 +4,23 @@
 #include "caffe/syncedmem.hpp"
 #include "caffe/util/math_functions.hpp"
 
+#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6)//specific for AMD devices
+
 namespace caffe {
 
 SyncedMemory::~SyncedMemory() {
+if (cpu_ptr_ && own_cpu_data_) {
+    OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL) );
+    clFinish(amdDevice.CommandQueue);
+  }
+  if(gpu_cache_ptr_ && own_cpu_data_)  {
+    OCL_CHECK( clReleaseMemObject((cl_mem)gpu_cache_ptr_) );
+  }
+  if (gpu_ptr_) {
+    OCL_CHECK( clReleaseMemObject((cl_mem)gpu_ptr_) );
+  }
+
+  clReleaseKernel(oclmem_kernel);
 /*  if (cpu_ptr_ && own_cpu_data_) {
     CaffeFreeHost(cpu_ptr_);
   }
@@ -17,51 +31,84 @@ SyncedMemory::~SyncedMemory() {
   }
 #endif  // CPU_ONLY
 */
+}	
+
+void SyncedMemory::ocl_setup() {
+  cl_int err=0;
+  oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
+  OCL_CHECK(err);
 }
 
 inline void SyncedMemory::to_cpu() {
- /* switch (head_) {
+switch (head_) {
   case UNINITIALIZED:
-    CaffeMallocHost(&cpu_ptr_, size_);
-    caffe_memset(size_, 0, cpu_ptr_);
+    //allocate pre-pinned memory
+    //pinned_buffer_ptr_
+   // if(data_layer_){
+   // gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_USE_PERSISTENT_MEM_AMD, size_, NULL, NULL);
+   // }
+   // else{
+      gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
+    //}
+    cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL);
+    memset(cpu_ptr_, 0, size_);
     head_ = HEAD_AT_CPU;
     own_cpu_data_ = true;
     break;
-  case HEAD_AT_GPU:
+  case HEAD_AT_GPU:{
 #ifndef CPU_ONLY
     if (cpu_ptr_ == NULL) {
-      CaffeMallocHost(&cpu_ptr_, size_);
+      gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
+      cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL);
       own_cpu_data_ = true;
     }
-    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
+    OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)gpu_ptr_, (cl_mem)gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
     head_ = SYNCED;
 #else
     NO_GPU;
+#endif
+#ifdef Track_data_transfer
+    LOG(WARNING) << "sync: data from GPU to CPU";
 #endif
     break;
+  }
   case HEAD_AT_CPU:
   case SYNCED:
     break;
   }
-*/
 }
 
 inline void SyncedMemory::to_gpu() {
-/*
 #ifndef CPU_ONLY
-  switch (head_) {
-  case UNINITIALIZED:
-    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-    caffe_gpu_memset(size_, 0, gpu_ptr_);
+switch (head_) {
+  case UNINITIALIZED:{
+    cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, NULL, NULL);
+    if(NULL == tmpMem){
+      fprintf(stderr,"Failed to create memory object\n");
+      break;
+    }
+    ocl_memset(oclmem_kernel, tmpMem, (int)0, (int)(size_/sizeof(int)));
+    gpu_ptr_ = (void*)tmpMem;
     head_ = HEAD_AT_GPU;
     break;
-  case HEAD_AT_CPU:
+  }
+  case HEAD_AT_CPU:{
     if (gpu_ptr_ == NULL) {
-      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
+      cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, NULL, NULL);
+      if(NULL == tmpMem){
+        fprintf(stderr,"Failed to create memory object\n");
+      }
+      gpu_ptr_ = (void*)tmpMem;
     }
-    caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
+    OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, (cl_mem)gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
     head_ = SYNCED;
+#ifdef Track_data_transfer
+    LOG(WARNING) << "sync: data from CPU to GPU";
+#endif
     break;
+  }
   case HEAD_AT_GPU:
   case SYNCED:
     break;
@@ -69,7 +116,6 @@ inline void SyncedMemory::to_gpu() {
 #else
   NO_GPU;
 #endif
-*/
 }
 
 const void* SyncedMemory::cpu_data() {
@@ -78,14 +124,16 @@ const void* SyncedMemory::cpu_data() {
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
-  /*CHECK(data);
+CHECK(data);
   if (own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
+  OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL));
+  OCL_CHECK( clReleaseMemObject((cl_mem) gpu_cache_ptr_));
+  clFinish(amdDevice.CommandQueue); //is this necessary?
   }
-  cpu_ptr_ = data;
+  gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_USE_HOST_PTR, size_, data, NULL);
+  cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL);
   head_ = HEAD_AT_CPU;
   own_cpu_data_ = false;
-*/
 }
 
 const void* SyncedMemory::gpu_data() {

From 8a7c2b25492901d0f7d4a45aac375184fa048c74 Mon Sep 17 00:00:00 2001
From: Yibing <Junli.Gu@amd.com>
Date: Sat, 11 Jul 2015 14:45:32 +0800
Subject: [PATCH 003/124] update data layer for AMD_PERSISTENT_MEM

---
 include/caffe/blob.hpp          | 1 +
 include/caffe/syncedmem.hpp     | 2 +-
 src/caffe/layers/data_layer.cpp | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index 160539aa..12854689 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -263,6 +263,7 @@ class Blob {
    * shared_ptr calls its destructor when reset with the "=" operator.
    */
   void ShareDiff(const Blob& other);
+  void set_data_layer(){data_->set_data_layer(); diff_->set_data_layer();};
 
   bool ShapeEquals(const BlobProto& other);
 
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 0bcad1dc..61336d7e 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -42,7 +42,7 @@ class SyncedMemory {
  public:
   SyncedMemory()
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-        own_cpu_data_(false), is_data_layer_(false) {
+        own_cpu_data_(false), data_layer_(false) {
         ocl_setup();
         }
   explicit SyncedMemory(size_t size)
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index 161a75e0..26eae788 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -48,6 +48,7 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   top_shape[0] = this->layer_param_.data_param().batch_size();
   this->prefetch_data_.Reshape(top_shape);
   top[0]->ReshapeLike(this->prefetch_data_);
+  prefetch_data_->set_data_layer();
 
   LOG(INFO) << "output data size: " << top[0]->num() << ","
       << top[0]->channels() << "," << top[0]->height() << ","
@@ -57,6 +58,7 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
     vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
     top[1]->Reshape(label_shape);
     this->prefetch_label_.Reshape(label_shape);
+    prefetch_label_->set_data_layer();
   }
 }
 

From 13cd87f9f0b6366dde8fee8fb2d31648fed93872 Mon Sep 17 00:00:00 2001
From: Yibing <yibing.liu@amd.com>
Date: Sun, 12 Jul 2015 14:30:12 +0800
Subject: [PATCH 004/124] add Forward_gpu and Backward_gpu for more layers;
 update math functions, Makefile and im2col.cpp

---
 Makefile                                      |  29 +++++
 include/caffe/util/math_functions.hpp         |  14 +--
 src/caffe/layers/absval_layer.cpp             |  10 ++
 src/caffe/layers/base_data_layer.cpp          |   7 ++
 src/caffe/layers/bnll_layer.cpp               |  10 ++
 src/caffe/layers/concat_layer.cpp             |  10 ++
 src/caffe/layers/contrastive_loss_layer.cpp   |  10 ++
 src/caffe/layers/deconv_layer.cpp             |  12 ++
 src/caffe/layers/eltwise_layer.cpp            |  12 ++
 src/caffe/layers/euclidean_loss_layer.cpp     |  12 ++
 src/caffe/layers/exp_layer.cpp                |  10 ++
 src/caffe/layers/filter_layer.cpp             |  11 ++
 src/caffe/layers/hdf5_data_layer.cpp          |   6 +
 src/caffe/layers/hdf5_output_layer.cpp        |  11 ++
 src/caffe/layers/im2col_layer.cpp             |  10 ++
 src/caffe/layers/inner_product_layer.cpp      |  10 ++
 src/caffe/layers/log_layer.cpp                |  12 ++
 src/caffe/layers/lrn_layer.cpp                |  20 ++++
 src/caffe/layers/mvn_layer.cpp                |  10 ++
 src/caffe/layers/power_layer.cpp              |  10 ++
 src/caffe/layers/prelu_layer.cpp              |  11 ++
 src/caffe/layers/reduction_layer.cpp          |  10 ++
 .../sigmoid_cross_entropy_loss_layer.cpp      |   5 +
 src/caffe/layers/sigmoid_layer.cpp            |  10 ++
 src/caffe/layers/silence_layer.cpp            |  11 ++
 src/caffe/layers/slice_layer.cpp              |  10 ++
 src/caffe/layers/softmax_layer.cpp            |  11 ++
 src/caffe/layers/softmax_loss_layer.cpp       |  12 ++
 src/caffe/layers/split_layer.cpp              |   9 ++
 src/caffe/layers/tanh_layer.cpp               |  12 ++
 src/caffe/layers/threshold_layer.cpp          |   5 +
 src/caffe/util/im2col.cpp                     |  39 ++++++
 src/caffe/util/math_functions.cpp             | 111 +++++++++++++++++-
 33 files changed, 480 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index 80c5642d..f0ac9e06 100644
--- a/Makefile
+++ b/Makefile
@@ -163,11 +163,40 @@ ifneq ("$(wildcard $(CUDA_DIR)/lib64)","")
 endif
 CUDA_LIB_DIR += $(CUDA_DIR)/lib
 
+#################################
+# OpenCL include and library 
+#################################
+OCL_INCLUDE_DIR := $(OCL_DIR)/include
+CLBLAS_INCLUDE_DIR := ${CLBLAS_DIR}/include
+
+OCL_LIB_DIR := 
+CLBLAS_LIB_DIR :=
+# add <OCL>/lib/x86_64 only if it exists
+ifneq ("$(wildcard $(OCL_LIB_DIR)/lib/x86_64)","")
+        OCL_LIB_DIR += $(OCL_DIR)/lib/x86_64
+endif
+OCL_LIB_DIR += $(OCL_DIR)/lib/x86
+
+# add <CLBLAS_DIR>/lib/ only if it exists
+ifneq ("$(wildcard $(CLBLAS_DIR)/lib)","")
+        CLBLAS_LIB_DIR += $(CLBLAS_LIB_DIR)/lib
+endif
+
+# add <CLBLAS_DIR>/lib64/ only if it exists
+ifneq ("$(wildcard $(CLBLAS_DIR)/lib64)","")
+        CLBLAS_LIB_DIR += $(CLBLAS_LIB_DIR)/lib64
+endif
+
 INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include
 ifneq ($(CPU_ONLY), 1)
 	INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
 	LIBRARY_DIRS += $(CUDA_LIB_DIR)
 	LIBRARIES := cudart cublas curand
+        
+        INCLUDE_DIRS += $(OCL_INCLUDE_DIR) + $(CLBLAS_INCLUDE_DIR)
+        LIBRARY_DIRS += $(OCL_LIB_DIR) + $(CLBLAS_LIB_DIR)
+        LIBRARIES += OpenCL clBLAS
+
 endif
 LIBRARIES += glog gflags protobuf leveldb snappy \
 	lmdb boost_system hdf5_hl hdf5 m \
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index bcafeb89..2cbbf1f0 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -241,27 +241,19 @@ inline char caffe_sign(Dtype val) {
   template <> \
   void caffe_cpu_##name<double>(const int n, const double* x, double* y)
 
-/*
+
 #define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \
 template<typename Dtype> \
-__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \
-  CUDA_KERNEL_LOOP(index, n) { \
+void name##_kernel(const int n, const Dtype* x, Dtype* y) { \
     operation; \
-  } \
 } \
 template <> \
 void caffe_gpu_##name<float>(const int n, const float* x, float* y) { \
-   NOLINT_NEXT_LINE(whitespace/operators)  \
-  name##_kernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
-      n, x, y); \
 } \
 template <> \
 void caffe_gpu_##name<double>(const int n, const double* x, double* y) { \
-   NOLINT_NEXT_LINE(whitespace/operators)  \
-  name##_kernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>( \
-      n, x, y); \
 }
-*/
+
 // output is 1 for the positives, 0 for zero, and -1 for the negatives
 DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
 
diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index 5ce28c9e..30422737 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -35,6 +35,16 @@ void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(AbsValLayer);
 #endif
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index fa4fe30f..917059b8 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -99,6 +99,13 @@ Dtype DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   return Dtype(0.);
 }
 
+template <typename Dtype>
+void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+
+
 #ifdef CPU_ONLY
 STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward);
 #endif
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index 9ba0ea9a..09e2bc89 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -38,6 +38,16 @@ void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(BNLLLayer);
 #endif
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 1cac8fc3..6af287a9 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -88,6 +88,16 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(ConcatLayer);
 #endif
diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index 25e16781..aad4cab3 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -111,6 +111,16 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void ContrastiveLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(ContrastiveLossLayer);
 #endif
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index a4612963..e8937238 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -69,6 +69,18 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
+
 #ifdef CPU_ONLY
 STUB_GPU(DeconvolutionLayer);
 #endif
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index a8070073..cffc743d 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -151,6 +151,18 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
+
 #ifdef CPU_ONLY
 STUB_GPU(EltwiseLayer);
 #endif
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index 80efa31b..9c37c18b 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -47,6 +47,18 @@ void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
+
 #ifdef CPU_ONLY
 STUB_GPU(EuclideanLossLayer);
 #endif
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index c7e7c60c..547fca6a 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -59,6 +59,16 @@ void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(ExpLayer);
 #endif
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index be1db32d..4d004ad4 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -117,6 +117,17 @@ void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
 #ifdef CPU_ONLY
 STUB_GPU(FilterLayer);
 #endif
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 8a782f7e..649dc020 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -157,6 +157,12 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   }
 }
 
+template <typename Dtype>
+void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+
 #ifdef CPU_ONLY
 STUB_GPU_FORWARD(HDF5DataLayer, Forward);
 #endif
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index f63375c3..7d1ca097 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -67,6 +67,17 @@ void HDF5OutputLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   return;
 }
 
+template <typename Dtype>
+void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  return;
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(HDF5OutputLayer);
 #endif
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index 1c802714..ddf6c989 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -85,6 +85,16 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(Im2colLayer);
 #endif
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 83c3235e..4d25215a 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -119,6 +119,16 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(InnerProductLayer);
 #endif
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index 55a227f6..9d3977a7 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -77,6 +77,18 @@ void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   caffe_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
+template <typename Dtype>
+void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
+
 #ifdef CPU_ONLY
 STUB_GPU(LogLayer);
 #endif
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 36c1ace4..47fa5ed5 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -247,6 +247,26 @@ void LRNLayer<Dtype>::WithinChannelBackward(
   }
 }
 
+template <typename Dtype>
+void LRNLayer<Dtype>::CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void LRNLayer<Dtype>::CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+template <typename Dtype>
+void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(LRNLayer);
 STUB_GPU_FORWARD(LRNLayer, CrossChannelForward);
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index 3e79bddc..84701831 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -134,6 +134,16 @@ void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 
 #ifdef CPU_ONLY
 STUB_GPU(MVNLayer);
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index 4fe34c49..bc14fffb 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -94,6 +94,16 @@ void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(PowerLayer);
 #endif
diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp
index 81831755..4db0dc7c 100644
--- a/src/caffe/layers/prelu_layer.cpp
+++ b/src/caffe/layers/prelu_layer.cpp
@@ -129,6 +129,17 @@ void PReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
 
 #ifdef CPU_ONLY
 STUB_GPU(PReLULayer);
diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp
index 8ae6329e..c4a8b4e0 100644
--- a/src/caffe/layers/reduction_layer.cpp
+++ b/src/caffe/layers/reduction_layer.cpp
@@ -122,6 +122,16 @@ void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(ReductionLayer);
 #endif
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
index cc236fe1..1a4329da 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -70,6 +70,11 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
   }
 }
 
+template <typename Dtype>
+void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward);
 #endif
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index 48c38490..30ad9b0b 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -39,6 +39,16 @@ void SigmoidLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(SigmoidLayer);
 #endif
diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp
index 4abf9eff..ecd12d12 100644
--- a/src/caffe/layers/silence_layer.cpp
+++ b/src/caffe/layers/silence_layer.cpp
@@ -17,6 +17,17 @@ void SilenceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // Do nothing.
+}
+
+template <typename Dtype>
+void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(SilenceLayer);
 #endif
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index e4418c9c..76021faa 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -110,6 +110,16 @@ void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU(SliceLayer);
 #endif
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 04712c9e..488e836a 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -86,6 +86,17 @@ void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
 }
 
+template <typename Dtype>
+void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
 
 #ifdef CPU_ONLY
 STUB_GPU(SoftmaxLayer);
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index ba312f67..6380f264 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -120,6 +120,18 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void SoftmaxWithLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
+
 #ifdef CPU_ONLY
 STUB_GPU(SoftmaxWithLossLayer);
 #endif
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 272cb59c..932b240b 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -49,6 +49,15 @@ void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
 
 #ifdef CPU_ONLY
 STUB_GPU(SplitLayer);
diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp
index ee5ed773..abc09bbc 100644
--- a/src/caffe/layers/tanh_layer.cpp
+++ b/src/caffe/layers/tanh_layer.cpp
@@ -37,6 +37,18 @@ void TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+template <typename Dtype>
+void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top){
+}
+
+template <typename Dtype>
+void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+}
+
+
+
 #ifdef CPU_ONLY
 STUB_GPU(TanHLayer);
 #endif
diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp
index 2365e7b9..345fd6b7 100644
--- a/src/caffe/layers/threshold_layer.cpp
+++ b/src/caffe/layers/threshold_layer.cpp
@@ -24,6 +24,11 @@ void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   }
 }
 
+template <typename Dtype>
+void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top){
+}
+
 #ifdef CPU_ONLY
 STUB_GPU_FORWARD(ThresholdLayer, Forward);
 #endif
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index c48f31f3..6545d98c 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -80,4 +80,43 @@ template void col2im_cpu<double>(const double* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im);
 
+
+
+template <typename Dtype>
+void im2col_gpu(const Dtype* data_im, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_col) {
+}
+
+
+// Explicit instantiation
+template void im2col_gpu<float>(const float* data_im, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    float* data_col);
+template void im2col_gpu<double>(const double* data_im, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    double* data_col);
+
+
+template <typename Dtype>
+void col2im_gpu(const Dtype* data_col, const int channels,
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, Dtype* data_im) {
+}
+
+// Explicit instantiation
+template void col2im_gpu<float>(const float* data_col, const int channels,
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_im);
+template void col2im_gpu<double>(const double* data_col, const int channels,
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_im);
+
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 6cbf208d..364fbe11 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -450,7 +450,6 @@ void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
   for (int i = 0; i < n; ++i) {
     r[i] = variate_generator();
   }
-  //LOG(INFO) << "caffe_rng_bernoulli";
 }
 
 template
@@ -458,6 +457,26 @@ void caffe_rng_bernoulli<double>(const int n, const double p, int* r);
 
 template
 void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
+
+template <typename Dtype>
+void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
+  boost::bernoulli_distribution<Dtype> random_distribution(p);
+  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
+      variate_generator(caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = static_cast<unsigned int>(variate_generator());
+  }
+}
+
+template
+void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
+
+template
+void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
 //
 template <>
 float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
@@ -523,6 +542,10 @@ template <>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
 }
 
+DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
+                                      - (x[index] < Dtype(0)));
+DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
+
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign);
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit);
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs);
@@ -666,4 +689,90 @@ template <>
 void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
                             double* r) {
 }
+
+template <>
+void caffe_log<float>(const int n, const float* a, float* y) {
+  vsLn(n, a, y);
+}
+
+template <>
+void caffe_log<double>(const int n, const double* a, double* y) {
+  vdLn(n, a, y);
+}
+
+template <typename Dtype>
+void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
+  if (X != Y) {
+    if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+      // NOLINT_NEXT_LINE(caffe/alt_fn)
+      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
+#else
+      NO_GPU;
+#endif
+    } else {
+      memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    }
+  }
+}
+
+template void caffe_copy<int>(const int N, const int* X, int* Y);
+template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
+    unsigned int* Y);
+template void caffe_copy<float>(const int N, const float* X, float* Y);
+template void caffe_copy<double>(const int N, const double* X, double* Y);
+
+template <>
+void caffe_abs<float>(const int n, const float* a, float* y) {
+    vsAbs(n, a, y);
+}
+
+template <>
+void caffe_abs<double>(const int n, const double* a, double* y) {
+    vdAbs(n, a, y);
+}
+
+template <>
+void caffe_gpu_add<float>(const int N, const float* a, const float* b,
+    float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+ // add_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+   //   N, a, b, y);
+}
+
+template <>
+void caffe_gpu_add<double>(const int N, const double* a, const double* b,
+    double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+ // add_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+   //   N, a, b, y);
+}
+
+template <>
+float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
+    const float* y, const int incy) {
+  return cblas_sdot(n, x, incx, y, incy);
+}
+
+template <>
+double caffe_cpu_strided_dot<double>(const int n, const double* x,
+    const int incx, const double* y, const int incy) {
+  return cblas_ddot(n, x, incx, y, incy);
+}
+
+template <typename Dtype>
+void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
+}
+
+template void caffe_set<int>(const int N, const int alpha, int* Y);
+template void caffe_set<float>(const int N, const float alpha, float* Y);
+template void caffe_set<double>(const int N, const double alpha, double* Y);
+
 }  // namespace caffe

From 8e0713542041d908a1dfda85e2aa95b07532f162 Mon Sep 17 00:00:00 2001
From: Junli <Junli.gu@amd.com>
Date: Sun, 12 Jul 2015 15:05:21 +0800
Subject: [PATCH 005/124] Minor update ot syncedmem.hpp

---
 include/caffe/syncedmem.hpp       | 2 +-
 src/caffe/util/math_functions.cpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 61336d7e..2cb316fb 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -55,7 +55,7 @@ class SyncedMemory {
   const void* cpu_data();
   void set_cpu_data(void* data);
   const void* gpu_data();
-  //const void* gpu_cache_data();
+  const void* gpu_cache_data();
   void* mutable_cpu_data();
   void* mutable_gpu_data();
   enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 364fbe11..17c2b414 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -775,4 +775,5 @@ template void caffe_set<int>(const int N, const int alpha, int* Y);
 template void caffe_set<float>(const int N, const float alpha, float* Y);
 template void caffe_set<double>(const int N, const double alpha, double* Y);
 
+
 }  // namespace caffe

From 622a9bced3a5418864c5348e7c8cc80a24746519 Mon Sep 17 00:00:00 2001
From: Yibing <yibing.liu@amd.com>
Date: Mon, 13 Jul 2015 12:18:39 +0800
Subject: [PATCH 006/124] This patch debugged data layer, added ocl/util, etc.
 made run cpu alexnet

---
 examples/imagenet/train_alexnet.sh      |  4 ++
 examples/imagenet/train_alexnet_cpu.sh  |  4 ++
 examples/imagenet/train_caffenet_cpu.sh |  4 ++
 include/caffe/util/ocl_util.hpp         | 16 ++++++
 models/bvlc_alexnet/solver.prototxt     |  4 +-
 models/bvlc_alexnet/train_val.prototxt  |  8 +--
 src/caffe/device.cpp                    |  2 +-
 src/caffe/layers/base_data_layer.cpp    | 44 +++++++++++++---
 src/caffe/layers/data_layer.cpp         |  4 +-
 src/caffe/solver.cpp                    |  9 ++++
 src/caffe/syncedmem.cpp                 |  5 +-
 src/caffe/util/ocl_util.cpp             | 68 +++++++++++++++++++++++++
 12 files changed, 154 insertions(+), 18 deletions(-)
 create mode 100755 examples/imagenet/train_alexnet.sh
 create mode 100755 examples/imagenet/train_alexnet_cpu.sh
 create mode 100755 examples/imagenet/train_caffenet_cpu.sh
 create mode 100644 include/caffe/util/ocl_util.hpp
 create mode 100644 src/caffe/util/ocl_util.cpp

diff --git a/examples/imagenet/train_alexnet.sh b/examples/imagenet/train_alexnet.sh
new file mode 100755
index 00000000..98c05c59
--- /dev/null
+++ b/examples/imagenet/train_alexnet.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+
+./build/tools/caffe train \
+    --solver=models/bvlc_alexnet/solver.prototxt
diff --git a/examples/imagenet/train_alexnet_cpu.sh b/examples/imagenet/train_alexnet_cpu.sh
new file mode 100755
index 00000000..a86f75fe
--- /dev/null
+++ b/examples/imagenet/train_alexnet_cpu.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+
+./build/tools/caffe train \
+    --solver=models/bvlc_alexnet/solver_cpu.prototxt
diff --git a/examples/imagenet/train_caffenet_cpu.sh b/examples/imagenet/train_caffenet_cpu.sh
new file mode 100755
index 00000000..4bcebf36
--- /dev/null
+++ b/examples/imagenet/train_caffenet_cpu.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+
+./build/tools/caffe train \
+    --solver=models/bvlc_reference_caffenet/solver_cpu.prototxt
diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
new file mode 100644
index 00000000..55695070
--- /dev/null
+++ b/include/caffe/util/ocl_util.hpp
@@ -0,0 +1,16 @@
+// Copyright 2014 AMD DNN contributors.
+
+#ifndef _CAFFE_UTIL_OCL_UTIL_HPP_
+#define _CAFFE_UTIL_OCL_UTIL_HPP_
+
+namespace caffe {
+
+template <typename Dtype>
+void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count);
+
+void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count);
+
+void eventCallback(cl_event event, cl_int event_status, void * user_data);
+}  // namespace caffe
+
+#endif  // CAFFE_UTIL_OCL_UTIL_HPP_
diff --git a/models/bvlc_alexnet/solver.prototxt b/models/bvlc_alexnet/solver.prototxt
index 129265e6..6f23e9d1 100644
--- a/models/bvlc_alexnet/solver.prototxt
+++ b/models/bvlc_alexnet/solver.prototxt
@@ -1,11 +1,11 @@
 net: "models/bvlc_alexnet/train_val.prototxt"
-test_iter: 1000
+test_iter: 1
 test_interval: 1000
 base_lr: 0.01
 lr_policy: "step"
 gamma: 0.1
 stepsize: 100000
-display: 20
+display: 1
 max_iter: 450000
 momentum: 0.9
 weight_decay: 0.0005
diff --git a/models/bvlc_alexnet/train_val.prototxt b/models/bvlc_alexnet/train_val.prototxt
index 588b4ea7..1f9654be 100644
--- a/models/bvlc_alexnet/train_val.prototxt
+++ b/models/bvlc_alexnet/train_val.prototxt
@@ -10,10 +10,10 @@ layer {
   transform_param {
     mirror: true
     crop_size: 227
-    mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
   }
   data_param {
-    source: "examples/imagenet/ilsvrc12_train_lmdb"
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
     batch_size: 256
     backend: LMDB
   }
@@ -29,10 +29,10 @@ layer {
   transform_param {
     mirror: false
     crop_size: 227
-    mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
   }
   data_param {
-    source: "examples/imagenet/ilsvrc12_val_lmdb"
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
     batch_size: 50
     backend: LMDB
   }
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 7c564589..bce26316 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -127,7 +127,7 @@ cl_int Device::Init(){
     }
 
     //Read our own kernel file
-    const char *pFileName = "../../src/caffe/OCL_kernel.cl";
+    const char *pFileName = "./src/caffe/OCL_kernel.cl";
     const char *pSource;
     std::string strSource = "";
     ConvertToString(pFileName, strSource);
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 917059b8..7169d3fd 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -74,35 +74,63 @@ void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
     caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
                top[1]->mutable_cpu_data());
   }
+
+  //sample <=20 data from top_data and display
+  const Dtype *top_cpu_data = (top)[0]->cpu_data();
+  size_t top_cpu_data_count = (top)[0]->count();
+  size_t sample_interval = top_cpu_data_count/20;
+  if(sample_interval == 0){
+     sample_interval=1;
+  }
+  for(int i=0; i<top_cpu_data_count; i+=sample_interval){
+      printf("%f\t", top_cpu_data[i]);
+  }
+  printf("\n\n");
+
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
   CreatePrefetchThread();
+
 }
+
 template <typename Dtype>
-Dtype DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      vector<Blob<Dtype>*>* top) {
+void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+     const  vector<Blob<Dtype>*>& top) {
   // First, join the thread
   JoinPrefetchThread();
   // Copy the data from prefetch thread to data_layer
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) );
-   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)(*top)[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_->count(), prefetch_data_->cpu_data(), 0, NULL, NULL) );
-  if (output_labels_) {
-   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)(*top)[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_->count(), prefetch_label_->cpu_data(), 0, NULL, NULL) );
+   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) );
+  if (this->output_labels_) {
+   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) );
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) );
    }
   clFinish(amdDevice.CommandQueue);
 #ifdef Track_data_transfer
 #endif
+
+//sample <=20 data from top_data and display
+  const  Dtype *top_cpu_data = (top)[0]->cpu_data();
+  size_t top_cpu_data_count = (top)[0]->count();
+  size_t sample_interval = top_cpu_data_count/20;
+  if(sample_interval == 0){ 
+     sample_interval=1;
+  }
+  for(int i=0; i<top_cpu_data_count; i+=sample_interval){
+      printf("%f\t", top_cpu_data[i]);
+  }
+  printf("\n\n");
+
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
   CreatePrefetchThread();
-  return Dtype(0.);
+  //return Dtype(0.);
 }
 
-template <typename Dtype>
+/*template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
-}
+}*/
 
 
 
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index 26eae788..8ac9b8ee 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -48,7 +48,7 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   top_shape[0] = this->layer_param_.data_param().batch_size();
   this->prefetch_data_.Reshape(top_shape);
   top[0]->ReshapeLike(this->prefetch_data_);
-  prefetch_data_->set_data_layer();
+  this->prefetch_data_.set_data_layer();
 
   LOG(INFO) << "output data size: " << top[0]->num() << ","
       << top[0]->channels() << "," << top[0]->height() << ","
@@ -58,7 +58,7 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
     vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
     top[1]->Reshape(label_shape);
     this->prefetch_label_.Reshape(label_shape);
-    prefetch_label_->set_data_layer();
+    this->prefetch_label_.set_data_layer();
   }
 }
 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index aabe0ede..bbac8fb5 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -33,6 +33,14 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
             << param.DebugString();
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
+
+//#ifndef CPU_ONLY
+  //AMD device related initialization
+  amdDevice.Init();
+//#else
+//  NO_GPU;
+//#endif
+
   if (param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
   }
@@ -42,6 +50,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
   LOG(INFO) << "Solver scaffolding done.";
   iter_ = 0;
   current_step_ = 0;
+
 }
 
 template <typename Dtype>
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index ce11aa03..e98e6847 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -3,6 +3,7 @@
 #include "caffe/common.hpp"
 #include "caffe/syncedmem.hpp"
 #include "caffe/util/math_functions.hpp"
+#include "caffe/util/ocl_util.hpp"
 
 #define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6)//specific for AMD devices
 
@@ -161,6 +162,8 @@ void* SyncedMemory::mutable_gpu_data() {
 #endif
 }
 
-
+const void *SyncedMemory::gpu_cache_data()
+{
+}
 }  // namespace caffe
 
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
new file mode 100644
index 00000000..8feead82
--- /dev/null
+++ b/src/caffe/util/ocl_util.cpp
@@ -0,0 +1,68 @@
+// Copyright 2014 AMD DNN contributors.
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <stdlib.h>
+#include <stdio.h>
+#include "caffe/common.hpp"
+#include "caffe/util/ocl_util.hpp"
+namespace caffe {
+
+
+template <typename Dtype>
+void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count){
+    cl_int err=0;
+    //cl_kernel Kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", &err);
+    //if(NULL==Kernel){
+    //    fprintf(stderr, "Failed to create kernel %d\n", err);
+    //}   
+ 
+    err=clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer);
+    err|=clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&value);
+    err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count);
+    OCL_CHECK(err);
+ 
+    size_t Global_Work_Size[1] = {count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+
+}
+
+// Explicit instantiation
+template void ocl_memset<float>(cl_kernel Kernel, float* buffer, const float value, const int count);
+template void ocl_memset<double>(cl_kernel Kernel, double* buffer, const double value, const int count);
+
+
+void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){
+   cl_int err=0;
+  //  cl_kernel Kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
+   // if(NULL==Kernel){
+   //     fprintf(stderr, "Failed to create kernel %d\n", err);
+   // }
+
+    err =clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer);
+    err|=clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&value);
+    err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count);
+    OCL_CHECK(err);
+
+    size_t Global_Work_Size[] = {count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+
+}
+
+void eventCallback(cl_event event, cl_int event_status, void* user_data){
+    printf("The calling\n");
+    int err = 0;
+    cl_ulong ev_start_time = (cl_ulong)0;
+    cl_ulong ev_end_time = (cl_ulong)0;
+    double run_time;
+    err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL);
+    err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL);
+    run_time = (double)(ev_end_time - ev_start_time);
+    printf("The kernel's running time is %f s\n", run_time * 1.0e-9);
+}
+
+
+}  // namespace caffe

From 1a45bf189b4555a3ab2246e5bd07dd7f1445a018 Mon Sep 17 00:00:00 2001
From: Yibing <yibing.liu@amd.com>
Date: Thu, 16 Jul 2015 02:53:25 +0800
Subject: [PATCH 007/124] Conv layer FP and BP logic ported. Baseline scheme

---
 examples/imagenet/train_alexnet.sh            |   2 +-
 .../imagenet/train_alexnet_without_dropout.sh |   4 +
 .../train_alexnet_without_dropout_cpu.sh      |   4 +
 include/caffe/common.hpp                      |  17 +
 include/caffe/util/im2col.hpp                 |  35 +
 include/caffe/util/ocl_wrapper.hpp            |  67 ++
 include/caffe/vision_layers.hpp               |  29 +-
 src/caffe/OCL_kernel.cl                       |  11 +
 src/caffe/common.cpp                          |  15 +-
 src/caffe/data_transformer.cpp                |   2 +
 src/caffe/layers/base_conv_layer.cpp          | 107 ++-
 src/caffe/layers/base_data_layer.cpp          |  32 +-
 src/caffe/layers/conv_layer.cpp               |  67 +-
 src/caffe/layers/dropout_layer.cpp            |   2 +
 src/caffe/layers/inner_product_layer.cpp      |   2 +
 src/caffe/layers/lrn_layer.cpp                |   4 +
 src/caffe/layers/pooling_layer.cpp            |   2 +
 src/caffe/layers/relu_layer.cpp               |   2 +
 src/caffe/layers/softmax_layer.cpp            |   2 +
 src/caffe/layers/softmax_loss_layer.cpp       |   2 +
 src/caffe/layers/split_layer.cpp              |   2 +
 src/caffe/net.cpp                             | 852 ------------------
 src/caffe/solver.cpp                          |  29 +-
 src/caffe/util/benchmark.cpp                  |   8 +-
 src/caffe/util/im2col.cpp                     | 246 ++++-
 src/caffe/util/math_functions.cpp             |  18 +-
 src/caffe/util/ocl_wrapper.cpp                | 447 +++++++++
 27 files changed, 1102 insertions(+), 908 deletions(-)
 create mode 100755 examples/imagenet/train_alexnet_without_dropout.sh
 create mode 100755 examples/imagenet/train_alexnet_without_dropout_cpu.sh
 create mode 100644 include/caffe/util/ocl_wrapper.hpp
 delete mode 100644 src/caffe/net.cpp
 create mode 100644 src/caffe/util/ocl_wrapper.cpp

diff --git a/examples/imagenet/train_alexnet.sh b/examples/imagenet/train_alexnet.sh
index 98c05c59..e62279e2 100755
--- a/examples/imagenet/train_alexnet.sh
+++ b/examples/imagenet/train_alexnet.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env sh
 
-./build/tools/caffe train \
+GLOG_logtostderr=1 ./build/tools/caffe train \
     --solver=models/bvlc_alexnet/solver.prototxt
diff --git a/examples/imagenet/train_alexnet_without_dropout.sh b/examples/imagenet/train_alexnet_without_dropout.sh
new file mode 100755
index 00000000..5f3d3326
--- /dev/null
+++ b/examples/imagenet/train_alexnet_without_dropout.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+
+GLOG_logtostderr=1 ./build/tools/caffe train \
+    --solver=models/bvlc_alexnet/solver_without_dropout.prototxt
diff --git a/examples/imagenet/train_alexnet_without_dropout_cpu.sh b/examples/imagenet/train_alexnet_without_dropout_cpu.sh
new file mode 100755
index 00000000..15625f8a
--- /dev/null
+++ b/examples/imagenet/train_alexnet_without_dropout_cpu.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+
+GLOG_logtostderr=1 ./build/tools/caffe train \
+    --solver=models/bvlc_alexnet/solver_without_dropout_cpu.prototxt
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index b1528474..e0703056 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -21,6 +21,7 @@
 
 #include "caffe/device.hpp"
 #include "caffe/util/device_alternate.hpp"
+#include "caffe/util/ocl_wrapper.hpp"
 
 // gflags 2.1 issue: namespace google was changed to gflags without warning.
 // Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version
@@ -88,6 +89,22 @@ private:\
      } \
  } while(0)
 
+//sample #num data from Blob_
+#define CHECK_BLOB_DATA(Blob_, num, marker) \
+do{ \
+  const  Dtype *top_cpu_data = Blob_->cpu_data(); \
+  size_t top_cpu_data_count = Blob_->count(); \
+  size_t sample_interval = top_cpu_data_count/num; \
+  if(sample_interval == 0){ \
+     sample_interval=1; \
+  } \
+  printf("%s: ", marker); \
+  for(int i=0; i<top_cpu_data_count; i+=sample_interval){ \
+      printf("%f  ", top_cpu_data[i]); \
+  } \
+  printf("\n\n"); \
+}while(0)
+
 // See PR #1236
 namespace cv { class Mat; }
 
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index 0051e2fa..066eb2fc 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -15,6 +15,7 @@ void col2im_cpu(const Dtype* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_im);
 
+
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
@@ -27,6 +28,40 @@ void col2im_gpu(const Dtype* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_im);
 
+template <typename Dtype>
+void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col, const int col_offset);
+
+template <typename Dtype>
+void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col, const int col_offset);
+
+template <typename Dtype>
+void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col, const int col_offset, const int optnum);
+
+template <typename Dtype>
+void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
+    const int height, const int width, const int psize, const int pad,
+    const int stride, Dtype* data_im, const int img_offset);
+
+template <typename Dtype>
+void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_im, const int img_offset, const int optnum);
+
+template <typename Dtype>
+void col2im_gpu_ocl(cl_mem data_col, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_im, cl_kernel Kernel);
+
+template <typename Dtype>
+void im2col_gpu_ocl(cl_mem data_im, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col, cl_kernel Kernel);
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_IM2COL_HPP_
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
new file mode 100644
index 00000000..df9e855e
--- /dev/null
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -0,0 +1,67 @@
+// Copyright 2014 AMD DNN contributors.
+
+#ifndef _CAFFE_UTIL_OCL_WRAPPER_HPP_
+#define _CAFFE_UTIL_OCL_WRAPPER_HPP_
+
+namespace caffe {
+
+template <typename Dtype>
+void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num);
+
+template <typename Dtype>
+void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels,
+    const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum);
+
+template <typename Dtype>
+void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data);
+
+template <typename Dtype>
+void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* scale, Dtype* data);
+
+template <typename Dtype>
+Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* prob_data, const Dtype* label, cl_mem d_loss);
+
+template <typename Dtype>
+void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data);
+
+template <typename Dtype>
+void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, const Dtype* label);
+
+template <typename Dtype>
+void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data);
+
+template <typename Dtype>
+void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data);
+
+template <typename Dtype>
+void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff );
+
+template <typename Dtype>
+void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff);
+
+template <typename Dtype>
+void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data);
+
+template <typename Dtype>
+void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff);
+
+template <typename Dtype>
+void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data);
+
+template <typename Dtype>
+void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff);
+
+template <typename Dtype>
+void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
+
+template <typename Dtype>
+void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y );
+}  // namespace caffe
+
+#endif  // CAFFE_UTIL_OCL_UTIL_HPP_
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index a6bd86a9..21c72bba 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -25,6 +25,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
  public:
   explicit BaseConvolutionLayer(const LayerParameter& param)
       : Layer<Dtype>(param) {}
+  virtual  ~BaseConvolutionLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -46,6 +47,8 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
       weights);
   void backward_cpu_bias(Dtype* bias, const Dtype* input);
+//opencl related setup
+  void ocl_setup();
 
 #ifndef CPU_ONLY
   void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
@@ -88,12 +91,16 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   }
 #ifndef CPU_ONLY
   inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
-    im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
+//    im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
+//        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
+      im2col_gpu(im2col_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, 
+                conv_in_width_, kernel_h_, pad_h_, stride_h_, col_buff, 0);
   }
   inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
-    col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
+   // col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
+   //     kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
+      col2im_gpu(col2im_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_,
+                 kernel_h_, pad_h_, stride_h_, data, bottom_offset_);
   }
 #endif
 
@@ -109,6 +116,20 @@ class BaseConvolutionLayer : public Layer<Dtype> {
 
   Blob<Dtype> col_buffer_;
   Blob<Dtype> bias_multiplier_;
+
+//opencl related data structures
+protected:
+  cl_kernel im2col_kernel, col2im_kernel;
+  cl_kernel oclmem_kernel;
+  cl_kernel ocl_Kernel_im2colfloat, ocl_Kernel_col2imfloat;
+  cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform;
+  cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel;
+public:
+  static cl_mem subTopMem, transMem;
+  static size_t subtop_mem_size, trans_mem_size;
+
+public:
+  size_t top_offset_, bottom_offset_;
 };
 
 /**
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 980dc37c..d132efe8 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -743,6 +743,17 @@ __kernel void OCL_memset2(__global int* buffer, const int value, const int size)
         }
 }
 
+template <class T>
+__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
+     int gdx = get_global_id(0);
+     if(gdx < N){
+          Y[gdx] =((0.0<X[gdx])-(X[gdx]<0.0));
+     }
+}
+
+template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
+template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y);
+
 template <class T>
 __kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){
     int index=get_global_id(0);
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 052281d4..407668c9 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -47,9 +47,11 @@ void GlobalInit(int* pargc, char*** pargv) {
 #ifdef CPU_ONLY  // CPU-only Caffe.
 
 Caffe::Caffe()
-    : random_generator_(), mode_(Caffe::CPU) { }
+    : random_generator_(), mode_(Caffe::CPU) {
+ }
 
-Caffe::~Caffe() { }
+Caffe::~Caffe() { 
+}
 
 void Caffe::set_random_seed(const unsigned int seed) {
   // RNG seed
@@ -106,6 +108,14 @@ Caffe::Caffe()
     LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
   }
 */
+   cl_int err =  clblasSetup();
+   if(err != CL_SUCCESS){
+       LOG(ERROR) << "clBLAS setup failed "<<err;
+   }
+   else
+   {
+      printf("clBLAS setup succeed!\n");
+   }
 }
 
 Caffe::~Caffe() {
@@ -114,6 +124,7 @@ Caffe::~Caffe() {
     CURAND_CHECK(curandDestroyGenerator(curand_generator_));
   }
 */
+   clblasTeardown();
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index 22633922..2a3bc645 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -24,6 +24,7 @@ DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
     ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
     data_mean_.FromProto(blob_proto);
   }
+  printf("before if\n");
   // check if we want to use mean_value
   if (param_.mean_value_size() > 0) {
     CHECK(param_.has_mean_file() == false) <<
@@ -32,6 +33,7 @@ DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
       mean_values_.push_back(param_.mean_value(c));
     }
   }
+  printf("reaches here\n");
 }
 
 template<typename Dtype>
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index ccb3adc7..38d8952d 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -8,6 +8,63 @@
 
 namespace caffe {
 
+#ifdef use_packing_scheme
+template <typename Dtype> size_t BaseConvolutionLayer<Dtype>::subtop_mem_size = sizeof(Dtype);
+template <typename Dtype> size_t BaseConvolutionLayer<Dtype>::trans_mem_size =  sizeof(Dtype);
+template <typename Dtype> cl_mem BaseConvolutionLayer<Dtype>::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::subtop_mem_size, NULL, NULL);
+template <typename Dtype> cl_mem BaseConvolutionLayer<Dtype>::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::trans_mem_size, NULL, NULL);
+#endif
+
+template <typename Dtype>
+void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size)
+{
+  if(subtop_size > BaseConvolutionLayer<Dtype>::subtop_mem_size){
+      ConvolutionLayer<Dtype>::subtop_mem_size = subtop_size;
+      clReleaseMemObject(ConvolutionLayer<Dtype>::subTopMem);
+      ConvolutionLayer<Dtype>::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::subtop_mem_size, NULL, NULL);
+  }
+  if(trans_size > ConvolutionLayer<Dtype>::trans_mem_size){
+      ConvolutionLayer<Dtype>::trans_mem_size =  trans_size;
+      clReleaseMemObject(ConvolutionLayer<Dtype>::transMem);
+      ConvolutionLayer<Dtype>::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::trans_mem_size, NULL, NULL);
+  }
+}
+
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::ocl_setup() {
+  im2col_kernel = clCreateKernel(amdDevice.Program,"im2colfloat", NULL);
+  col2im_kernel = clCreateKernel(amdDevice.Program,"col2imfloat", NULL);
+  oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL);
+  im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL);
+  col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL);
+  opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL);
+  ocl_Kernel_im2colfloat = clCreateKernel(amdDevice.Program,"im2colfloat_yuan",NULL);
+  ocl_Kernel_col2imfloat = clCreateKernel(amdDevice.Program,"col2imfloat_yuan",NULL);
+  ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL);
+  ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL);
+
+#ifdef use_packing_scheme
+  size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
+  size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
+  Alloc_public_tmp_mem<Dtype>(subtop_size, trans_size);
+#endif
+}
+
+
+template <typename Dtype>
+ BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer(){
+  OCL_CHECK( clReleaseKernel(im2col_kernel) );
+  OCL_CHECK( clReleaseKernel(col2im_kernel) );
+  OCL_CHECK( clReleaseKernel(oclmem_kernel) );
+  OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) );
+  OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) );
+  OCL_CHECK( clReleaseKernel(ocl_Kernel_im2colfloat) );
+  OCL_CHECK( clReleaseKernel(ocl_Kernel_col2imfloat) );
+  OCL_CHECK( clReleaseKernel(im2col_opt_kernel) );
+  OCL_CHECK( clReleaseKernel(col2im_opt_kernel) );
+}
+
+
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -68,6 +125,10 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     conv_out_channels_ = num_output_;
     conv_in_channels_ = channels_;
   }
+
+  //initializa OpenCL kernels and cl_mem objects
+    ocl_setup();
+
   // Handle the parameters: weights and biases.
   // - blobs_[0] holds the filter weights
   // - blobs_[1] holds the biases (optional)
@@ -234,20 +295,31 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
     }
     col_buff = col_buffer_.gpu_data();
   }
+  
   for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
+    /*caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
         group_, conv_out_spatial_dim_, kernel_dim_ / group_,
         (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
         (Dtype)0., output + output_offset_ * g);
-  }
+    */
+    //printf("weights.count() = %d, col_buff.count() = %d, output = %d\n", weights.count(), col_buff.count(), output.count());   
+    caffe_gpu_gemmex<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
+          conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_,
+        (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
+        (Dtype)0., output,  top_offset_+output_offset_ * g);
+   }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
     const Dtype* bias) {
-  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
+  /*caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
       height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(),
-      (Dtype)1., output);
+      (Dtype)1., output);*/
+     caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
+          height_out_*width_out_, 1, (Dtype)1., bias, 0,
+          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
+          (Dtype)1., output, top_offset_);
 }
 
 template <typename Dtype>
@@ -258,13 +330,18 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
     col_buff = input;
   }
   for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_ / group_,
+   /* caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_ / group_,
         conv_out_spatial_dim_, conv_out_channels_ / group_,
         (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
         (Dtype)0., col_buff + col_offset_ * g);
+  */
+        caffe_gpu_gemmex<Dtype>(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
+          (Dtype)1., weights,  weight_offset_ * g,
+          output, top_offset_+output_offset_ * g,
+          (Dtype)0., col_buff, col_offset_ * g);
   }
   if (!is_1x1_) {
-    conv_col2im_gpu(col_buff, input);
+      conv_col2im_gpu(col_buff, input);
   }
 }
 
@@ -277,18 +354,26 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
     col_buff = col_buffer_.gpu_data();
   }
   for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
+   /* caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
         kernel_dim_ / group_, conv_out_spatial_dim_,
         (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
-        (Dtype)1., weights + weight_offset_ * g);
-  }
+        (Dtype)1., weights + weight_offset_ * g);*/
+      caffe_gpu_gemmex<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_,
+        (Dtype)1., output, top_offset_,
+        (Dtype*)col_buff, col_offset_ * g, (Dtype)1.,
+        (Dtype*)weights, weight_offset_ * g);
+ }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
     const Dtype* input) {
-  caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
-      input, bias_multiplier_.gpu_data(), 1., bias);
+ /* caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
+      input, bias_multiplier_.gpu_data(), 1., bias);*/
+      caffe_gpu_gemvv<Dtype>(CblasNoTrans, num_output_, height_out_*width_out_,
+          (Dtype)1., input, top_offset_, height_out_*width_out_,
+          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
+          bias, (size_t)0, 1);
 }
 
 #endif  // !CPU_ONLY
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 7169d3fd..b768f05f 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -75,17 +75,7 @@ void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
                top[1]->mutable_cpu_data());
   }
 
-  //sample <=20 data from top_data and display
-  const Dtype *top_cpu_data = (top)[0]->cpu_data();
-  size_t top_cpu_data_count = (top)[0]->count();
-  size_t sample_interval = top_cpu_data_count/20;
-  if(sample_interval == 0){
-     sample_interval=1;
-  }
-  for(int i=0; i<top_cpu_data_count; i+=sample_interval){
-      printf("%f\t", top_cpu_data[i]);
-  }
-  printf("\n\n");
+  CHECK_BLOB_DATA(top[0], 20, "top[0]");
 
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
@@ -100,26 +90,20 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bo
   JoinPrefetchThread();
   // Copy the data from prefetch thread to data_layer
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) );
-   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) );
+   top[0]->ReshapeLike(prefetch_data_);
+    OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) );
   if (this->output_labels_) {
+       // Reshape to loaded labels.
+    top[1]->ReshapeLike(prefetch_label_);
    OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) );
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) );
    }
   clFinish(amdDevice.CommandQueue);
+  
 #ifdef Track_data_transfer
 #endif
-
-//sample <=20 data from top_data and display
-  const  Dtype *top_cpu_data = (top)[0]->cpu_data();
-  size_t top_cpu_data_count = (top)[0]->count();
-  size_t sample_interval = top_cpu_data_count/20;
-  if(sample_interval == 0){ 
-     sample_interval=1;
-  }
-  for(int i=0; i<top_cpu_data_count; i+=sample_interval){
-      printf("%f\t", top_cpu_data[i]);
-  }
-  printf("\n\n");
+  
+  CHECK_BLOB_DATA(top[0], 20, "top[0]");  
 
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index b73f1a93..0e0ba213 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -32,6 +32,8 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       }
     }
   }
+
+  CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -65,16 +67,77 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       }
     }
   }
+
+  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+      const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+     //CHECK_BLOB_DATA(bottom[i],10,"bottom");
+
+    Dtype* top_data = top[i]->mutable_gpu_data();
+    for (int n = 0; n < this->num_; ++n) {
+       //two intermediate variables to pass offset
+       this->bottom_offset_ = bottom[i]->offset(n);
+       this->top_offset_ = top[i]->offset(n); 
+       this->forward_gpu_gemm(bottom_data, weight,
+            top_data);
+
+      if (this->bias_term_) {
+        const Dtype* bias = this->blobs_[1]->gpu_data();
+          this->forward_gpu_bias(top_data, bias);
+      }
+    }
+  }
+
+ //Forward_cpu(bottom, top);
+   CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+   CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+    
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+      for (int n = 0; n < this->num_; ++n) {
+       //
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        this->backward_gpu_bias(bias_diff, top_diff);
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      for (int n = 0; n < this->num_; ++n) {
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        // gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          this->weight_gpu_gemm(bottom_data,
+              top_diff, weight_diff);
+        }
+        // gradient w.r.t. bottom data, if necessary.
+        if (propagate_down[i]) {
+          this->backward_gpu_gemm(top_diff, weight,
+              bottom_diff);
+        }
+      }
+    }
+  }
+  
+  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 7f1ac8f6..4239443d 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -70,11 +70,13 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+     Forward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    Backward_cpu(top, propagate_down, bottom);
 }
 
 
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 4d25215a..8edd6148 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -122,11 +122,13 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+     Forward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+     Backward_cpu(top, propagate_down, bottom);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 47fa5ed5..e49e2963 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -250,21 +250,25 @@ void LRNLayer<Dtype>::WithinChannelBackward(
 template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+      CrossChannelForward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+     CrossChannelBackward_gpu(top,  propagate_down, bottom);
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+      Forward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    Backward_cpu(top, propagate_down, bottom);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index d5207889..97a5c150 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -312,11 +312,13 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+    Forward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    Backward_cpu(top, propagate_down, bottom);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index e05080bf..ce85b1cc 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -39,11 +39,13 @@ void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+    Forward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    Backward_cpu(top, propagate_down, bottom);
 }
 
 
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 488e836a..973db6e7 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -89,11 +89,13 @@ void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+    Forward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+   Backward_cpu(top, propagate_down, bottom);
 }
 
 
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 6380f264..072f9f71 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -123,11 +123,13 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+     Forward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    Backward_cpu(top, propagate_down, bottom);
 }
 
 
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 932b240b..1894d0f1 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -52,11 +52,13 @@ void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+    Forward_cpu(bottom, top);
 }
 
 template <typename Dtype>
 void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+     Backward_cpu(top, propagate_down, bottom);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
deleted file mode 100644
index a18ee638..00000000
--- a/src/caffe/net.cpp
+++ /dev/null
@@ -1,852 +0,0 @@
-#include <algorithm>
-#include <map>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/net.hpp"
-#include "caffe/proto/caffe.pb.h"
-#include "caffe/util/insert_splits.hpp"
-#include "caffe/util/io.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/util/upgrade_proto.hpp"
-
-#include "caffe/test/test_caffe_main.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-Net<Dtype>::Net(const NetParameter& param) {
-  Init(param);
-}
-
-template <typename Dtype>
-Net<Dtype>::Net(const string& param_file, Phase phase) {
-  NetParameter param;
-  ReadNetParamsFromTextFileOrDie(param_file, &param);
-  param.mutable_state()->set_phase(phase);
-  Init(param);
-}
-
-template <typename Dtype>
-void Net<Dtype>::Init(const NetParameter& in_param) {
-  // Set phase from the state.
-  phase_ = in_param.state().phase();
-  // Filter layers based on their include/exclude rules and
-  // the current NetState.
-  NetParameter filtered_param;
-  FilterNet(in_param, &filtered_param);
-  LOG(INFO) << "Initializing net from parameters: " << std::endl
-            << filtered_param.DebugString();
-  // Create a copy of filtered_param with splits added where necessary.
-  NetParameter param;
-  InsertSplits(filtered_param, &param);
-  // Basically, build all the layers and set up their connections.
-  name_ = param.name();
-  map<string, int> blob_name_to_idx;
-  set<string> available_blobs;
-  CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0)
-      << "Must specify either input_shape OR deprecated input_dim, not both.";
-  if (param.input_dim_size() > 0) {
-    // Deprecated 4D dimensions.
-    CHECK_EQ(param.input_size() * 4, param.input_dim_size())
-        << "Incorrect input blob dimension specifications.";
-  } else {
-    CHECK_EQ(param.input_size(), param.input_shape_size())
-        << "Exactly one input_shape must be specified per input.";
-  }
-  memory_used_ = 0;
-  // set the input blobs
-  for (int input_id = 0; input_id < param.input_size(); ++input_id) {
-    const int layer_id = -1;  // inputs have fake layer ID -1
-    AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
-  }
-  DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-  // For each layer, set up its input and output
-  bottom_vecs_.resize(param.layer_size());
-  top_vecs_.resize(param.layer_size());
-  bottom_id_vecs_.resize(param.layer_size());
-  param_id_vecs_.resize(param.layer_size());
-  top_id_vecs_.resize(param.layer_size());
-  bottom_need_backward_.resize(param.layer_size());
-  for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
-    // Inherit phase from net if unset.
-    if (!param.layer(layer_id).has_phase()) {
-      param.mutable_layer(layer_id)->set_phase(phase_);
-    }
-    // Setup layer.
-    const LayerParameter& layer_param = param.layer(layer_id);
-    if (layer_param.propagate_down_size() > 0) {
-      CHECK_EQ(layer_param.propagate_down_size(),
-          layer_param.bottom_size())
-          << "propagate_down param must be specified "
-          << "either 0 or bottom_size times ";
-    }
-    layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
-    layer_names_.push_back(layer_param.name());
-    LOG(INFO) << "Creating Layer " << layer_param.name();
-    bool need_backward = false;
-
-    // Figure out this layer's input and output
-    for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
-         ++bottom_id) {
-      const int blob_id = AppendBottom(param, layer_id, bottom_id,
-                                       &available_blobs, &blob_name_to_idx);
-      // If a blob needs backward, this layer should provide it.
-      need_backward |= blob_need_backward_[blob_id];
-    }
-    int num_top = layer_param.top_size();
-    for (int top_id = 0; top_id < num_top; ++top_id) {
-      AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx);
-    }
-    // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter
-    // specified fewer than the required number (as specified by
-    // ExactNumTopBlobs() or MinTopBlobs()), allocate them here.
-    Layer<Dtype>* layer = layers_[layer_id].get();
-    if (layer->AutoTopBlobs()) {
-      const int needed_num_top =
-          std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
-      for (; num_top < needed_num_top; ++num_top) {
-        // Add "anonymous" top blobs -- do not modify available_blobs or
-        // blob_name_to_idx as we don't want these blobs to be usable as input
-        // to other layers.
-        AppendTop(param, layer_id, num_top, NULL, NULL);
-      }
-    }
-    // After this layer is connected, set it up.
-    LOG(INFO) << "Setting up " << layer_names_[layer_id];
-    layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
-    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-      if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
-        blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
-      }
-      blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
-      LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
-      if (layer->loss(top_id)) {
-        LOG(INFO) << "    with loss weight " << layer->loss(top_id);
-      }
-      memory_used_ += top_vecs_[layer_id][top_id]->count();
-    }
-    DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-    const int param_size = layer_param.param_size();
-    const int num_param_blobs = layers_[layer_id]->blobs().size();
-    CHECK_LE(param_size, num_param_blobs)
-        << "Too many params specified for layer " << layer_param.name();
-    ParamSpec default_param_spec;
-    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
-      const ParamSpec* param_spec = (param_id < param_size) ?
-          &layer_param.param(param_id) : &default_param_spec;
-      const bool param_need_backward = param_spec->lr_mult() > 0;
-      need_backward |= param_need_backward;
-      layers_[layer_id]->set_param_propagate_down(param_id,
-                                                  param_need_backward);
-    }
-    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
-      AppendParam(param, layer_id, param_id);
-    }
-    // Finally, set the backward flag
-    layer_need_backward_.push_back(need_backward);
-    if (need_backward) {
-      for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) {
-        blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true;
-      }
-    }
-  }
-  // Go through the net backwards to determine which blobs contribute to the
-  // loss.  We can skip backward computation for blobs that don't contribute
-  // to the loss.
-  // Also checks if all bottom blobs don't need backward computation (possible
-  // because the skip_propagate_down param) and so we can skip bacward
-  // computation for the entire layer
-  set<string> blobs_under_loss;
-  set<string> blobs_skip_backp;
-  for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) {
-    bool layer_contributes_loss = false;
-    bool layer_skip_propagate_down = true;
-    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-      const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
-      if (layers_[layer_id]->loss(top_id) ||
-          (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
-        layer_contributes_loss = true;
-      }
-      if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) {
-        layer_skip_propagate_down = false;
-      }
-      if (layer_contributes_loss && !layer_skip_propagate_down)
-        break;
-    }
-    // If this layer can skip backward computation, also all his bottom blobs
-    // don't need backpropagation
-    if (layer_need_backward_[layer_id] && layer_skip_propagate_down) {
-      layer_need_backward_[layer_id] = false;
-      for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
-               ++bottom_id) {
-        bottom_need_backward_[layer_id][bottom_id] = false;
-      }
-    }
-    if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; }
-    if (layer_need_backward_[layer_id]) {
-      LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
-    } else {
-      LOG(INFO) << layer_names_[layer_id]
-                << " does not need backward computation.";
-    }
-    for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
-         ++bottom_id) {
-      if (layer_contributes_loss) {
-        const string& blob_name =
-            blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-        blobs_under_loss.insert(blob_name);
-      } else {
-        bottom_need_backward_[layer_id][bottom_id] = false;
-      }
-      if (!bottom_need_backward_[layer_id][bottom_id]) {
-        const string& blob_name =
-                   blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-        blobs_skip_backp.insert(blob_name);
-      }
-    }
-  }
-  // Handle force_backward if needed.
-  if (param.force_backward()) {
-    for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
-      layer_need_backward_[layer_id] = true;
-      for (int bottom_id = 0;
-           bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
-        bottom_need_backward_[layer_id][bottom_id] =
-            bottom_need_backward_[layer_id][bottom_id] ||
-            layers_[layer_id]->AllowForceBackward(bottom_id);
-        blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] =
-            blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] ||
-            bottom_need_backward_[layer_id][bottom_id];
-      }
-      for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-           ++param_id) {
-        layers_[layer_id]->set_param_propagate_down(param_id, true);
-      }
-    }
-  }
-  // In the end, all remaining blobs are considered output blobs.
-  for (set<string>::iterator it = available_blobs.begin();
-      it != available_blobs.end(); ++it) {
-    LOG(INFO) << "This network produces output " << *it;
-    net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
-    net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
-  }
-  for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) {
-    blob_names_index_[blob_names_[blob_id]] = blob_id;
-  }
-  for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) {
-    layer_names_index_[layer_names_[layer_id]] = layer_id;
-  }
-  GetLearningRateAndWeightDecay();
-  debug_info_ = param.debug_info();
-  LOG(INFO) << "Network initialization done.";
-  LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-}
-
-template <typename Dtype>
-void Net<Dtype>::FilterNet(const NetParameter& param,
-    NetParameter* param_filtered) {
-  NetState net_state(param.state());
-  param_filtered->CopyFrom(param);
-  param_filtered->clear_layer();
-  for (int i = 0; i < param.layer_size(); ++i) {
-    const LayerParameter& layer_param = param.layer(i);
-    const string& layer_name = layer_param.name();
-    CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0)
-          << "Specify either include rules or exclude rules; not both.";
-    // If no include rules are specified, the layer is included by default and
-    // only excluded if it meets one of the exclude rules.
-    bool layer_included = (layer_param.include_size() == 0);
-    for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) {
-      if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) {
-        layer_included = false;
-      }
-    }
-    for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) {
-      if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) {
-        layer_included = true;
-      }
-    }
-    if (layer_included) {
-      param_filtered->add_layer()->CopyFrom(layer_param);
-    }
-  }
-}
-
-template <typename Dtype>
-bool Net<Dtype>::StateMeetsRule(const NetState& state,
-    const NetStateRule& rule, const string& layer_name) {
-  // Check whether the rule is broken due to phase.
-  if (rule.has_phase()) {
-      if (rule.phase() != state.phase()) {
-        LOG(INFO) << "The NetState phase (" << state.phase()
-          << ") differed from the phase (" << rule.phase()
-          << ") specified by a rule in layer " << layer_name;
-        return false;
-      }
-  }
-  // Check whether the rule is broken due to min level.
-  if (rule.has_min_level()) {
-    if (state.level() < rule.min_level()) {
-      LOG(INFO) << "The NetState level (" << state.level()
-          << ") is above the min_level (" << rule.min_level()
-          << ") specified by a rule in layer " << layer_name;
-      return false;
-    }
-  }
-  // Check whether the rule is broken due to max level.
-  if (rule.has_max_level()) {
-    if (state.level() > rule.max_level()) {
-      LOG(INFO) << "The NetState level (" << state.level()
-          << ") is above the max_level (" << rule.max_level()
-          << ") specified by a rule in layer " << layer_name;
-      return false;
-    }
-  }
-  // Check whether the rule is broken due to stage. The NetState must
-  // contain ALL of the rule's stages to meet it.
-  for (int i = 0; i < rule.stage_size(); ++i) {
-    // Check that the NetState contains the rule's ith stage.
-    bool has_stage = false;
-    for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
-      if (rule.stage(i) == state.stage(j)) { has_stage = true; }
-    }
-    if (!has_stage) {
-      LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
-                << "' specified by a rule in layer " << layer_name;
-      return false;
-    }
-  }
-  // Check whether the rule is broken due to not_stage. The NetState must
-  // contain NONE of the rule's not_stages to meet it.
-  for (int i = 0; i < rule.not_stage_size(); ++i) {
-    // Check that the NetState contains the rule's ith not_stage.
-    bool has_stage = false;
-    for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
-      if (rule.not_stage(i) == state.stage(j)) { has_stage = true; }
-    }
-    if (has_stage) {
-      LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
-                << "' specified by a rule in layer " << layer_name;
-      return false;
-    }
-  }
-  return true;
-}
-
-// Helper for Net::Init: add a new input or top blob to the net.  (Inputs have
-// layer_id == -1, tops have layer_id >= 0.)
-template <typename Dtype>
-void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
-                           const int top_id, set<string>* available_blobs,
-                           map<string, int>* blob_name_to_idx) {
-  shared_ptr<LayerParameter> layer_param((layer_id >= 0) ?
-    (new LayerParameter(param.layer(layer_id))) : NULL);
-  const string& blob_name = layer_param ?
-      (layer_param->top_size() > top_id ?
-          layer_param->top(top_id) : "(automatic)") : param.input(top_id);
-  // Check if we are doing in-place computation
-  if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
-      blob_name == layer_param->bottom(top_id)) {
-    // In-place computation
-    LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
-    top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
-    top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
-  } else if (blob_name_to_idx &&
-             blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
-    // If we are not doing in-place computation but have duplicated blobs,
-    // raise an error.
-    LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
-  } else {
-    // Normal output.
-    if (layer_param) {
-      LOG(INFO) << layer_param->name() << " -> " << blob_name;
-    } else {
-      LOG(INFO) << "Input " << top_id << " -> " << blob_name;
-    }
-    shared_ptr<Blob<Dtype> > blob_pointer(new Blob<Dtype>());
-    const int blob_id = blobs_.size();
-    blobs_.push_back(blob_pointer);
-    blob_names_.push_back(blob_name);
-    blob_need_backward_.push_back(false);
-    if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; }
-    if (layer_id == -1) {
-      // Set the (explicitly specified) dimensions of the input blob.
-      if (param.input_dim_size() > 0) {
-        blob_pointer->Reshape(param.input_dim(top_id * 4),
-                              param.input_dim(top_id * 4 + 1),
-                              param.input_dim(top_id * 4 + 2),
-                              param.input_dim(top_id * 4 + 3));
-      } else {
-        blob_pointer->Reshape(param.input_shape(top_id));
-      }
-      net_input_blob_indices_.push_back(blob_id);
-      net_input_blobs_.push_back(blob_pointer.get());
-    } else {
-      top_id_vecs_[layer_id].push_back(blob_id);
-      top_vecs_[layer_id].push_back(blob_pointer.get());
-    }
-  }
-  if (available_blobs) { available_blobs->insert(blob_name); }
-}
-
-// Helper for Net::Init: add a new bottom blob to the net.
-template <typename Dtype>
-int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
-    const int bottom_id, set<string>* available_blobs,
-    map<string, int>* blob_name_to_idx) {
-  const LayerParameter& layer_param = param.layer(layer_id);
-  const string& blob_name = layer_param.bottom(bottom_id);
-  if (available_blobs->find(blob_name) == available_blobs->end()) {
-    LOG(FATAL) << "Unknown blob input " << blob_name
-               << " (at index " << bottom_id << ") to layer " << layer_id;
-  }
-  const int blob_id = (*blob_name_to_idx)[blob_name];
-  LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
-  bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
-  bottom_id_vecs_[layer_id].push_back(blob_id);
-  available_blobs->erase(blob_name);
-  bool propagate_down = true;
-  // Check if the backpropagation on bottom_id should be skipped
-  if (layer_param.propagate_down_size() > 0)
-    propagate_down = layer_param.propagate_down(bottom_id);
-  const bool need_backward = blob_need_backward_[blob_id] &&
-                          propagate_down;
-  bottom_need_backward_[layer_id].push_back(need_backward);
-  return blob_id;
-}
-
-template <typename Dtype>
-void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
-                             const int param_id) {
-  const LayerParameter& layer_param = layers_[layer_id]->layer_param();
-  const int param_size = layer_param.param_size();
-  string param_name =
-      (param_size > param_id) ? layer_param.param(param_id).name() : "";
-  if (param_name.size()) {
-    param_display_names_.push_back(param_name);
-  } else {
-    ostringstream param_display_name;
-    param_display_name << param_id;
-    param_display_names_.push_back(param_display_name.str());
-  }
-  const int net_param_id = params_.size();
-  params_.push_back(layers_[layer_id]->blobs()[param_id]);
-  param_id_vecs_[layer_id].push_back(net_param_id);
-  param_layer_indices_.push_back(make_pair(layer_id, param_id));
-  if (!param_size || !param_name.size() || (param_name.size() &&
-      param_names_index_.find(param_name) == param_names_index_.end())) {
-    // This layer "owns" this parameter blob -- it is either anonymous
-    // (i.e., not given a param_name) or explicitly given a name that we
-    // haven't already seen.
-    param_owners_.push_back(-1);
-    if (param_name.size()) {
-      param_names_index_[param_name] = net_param_id;
-    }
-  } else {
-    // Named param blob with name we've seen before: share params
-    const int owner_net_param_id = param_names_index_[param_name];
-    param_owners_.push_back(owner_net_param_id);
-    const pair<int, int>& owner_index =
-        param_layer_indices_[owner_net_param_id];
-    const int owner_layer_id = owner_index.first;
-    const int owner_param_id = owner_index.second;
-    LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
-              << "layer '" << layer_names_[owner_layer_id] << "', param "
-              << "index " << owner_param_id;
-    Blob<Dtype>* this_blob = layers_[layer_id]->blobs()[param_id].get();
-    Blob<Dtype>* owner_blob =
-        layers_[owner_layer_id]->blobs()[owner_param_id].get();
-    const int param_size = layer_param.param_size();
-    if (param_size > param_id && (layer_param.param(param_id).share_mode() ==
-                                  ParamSpec_DimCheckMode_PERMISSIVE)) {
-      // Permissive dimension checking -- only check counts are the same.
-      CHECK_EQ(this_blob->count(), owner_blob->count())
-          << "Shared parameter blobs must have the same count.";
-    } else {
-      // Strict dimension checking -- all dims must be the same.
-      CHECK(this_blob->shape() == owner_blob->shape());
-    }
-    layers_[layer_id]->blobs()[param_id]->ShareData(
-        *layers_[owner_layer_id]->blobs()[owner_param_id]);
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::GetLearningRateAndWeightDecay() {
-  LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
-  ParamSpec default_param_spec;
-  for (int i = 0; i < layers_.size(); ++i) {
-    vector<shared_ptr<Blob<Dtype> > >& layer_blobs = layers_[i]->blobs();
-    for (int j = 0; j < layer_blobs.size(); ++j) {
-      const ParamSpec* param_spec =
-          (layers_[i]->layer_param().param_size() > j) ?
-          &layers_[i]->layer_param().param(j) : &default_param_spec;
-      params_lr_.push_back(param_spec->lr_mult());
-      params_weight_decay_.push_back(param_spec->decay_mult());
-    }
-  }
-}
-
-template <typename Dtype>
-Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
-  CHECK_GE(start, 0);
-  CHECK_LT(end, layers_.size());
-  Dtype loss = 0;
-  if (debug_info_) {
-    for (int i = 0; i < net_input_blobs_.size(); ++i) {
-      InputDebugInfo(i);
-    }
-  }
-  for (int i = start; i <= end; ++i) {
-    // LOG(ERROR) << "Forwarding " << layer_names_[i];
-    Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
-    loss += layer_loss;
-    if (debug_info_) { ForwardDebugInfo(i); }
-  }
-  return loss;
-}
-
-template <typename Dtype>
-Dtype Net<Dtype>::ForwardFrom(int start) {
-  return ForwardFromTo(start, layers_.size() - 1);
-}
-
-template <typename Dtype>
-Dtype Net<Dtype>::ForwardTo(int end) {
-  return ForwardFromTo(0, end);
-}
-
-template <typename Dtype>
-const vector<Blob<Dtype>*>& Net<Dtype>::ForwardPrefilled(Dtype* loss) {
-  if (loss != NULL) {
-    *loss = ForwardFromTo(0, layers_.size() - 1);
-  } else {
-    ForwardFromTo(0, layers_.size() - 1);
-  }
-  return net_output_blobs_;
-}
-
-template <typename Dtype>
-const vector<Blob<Dtype>*>& Net<Dtype>::Forward(
-    const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
-  // Copy bottom to internal bottom
-  for (int i = 0; i < bottom.size(); ++i) {
-    net_input_blobs_[i]->CopyFrom(*bottom[i]);
-  }
-  return ForwardPrefilled(loss);
-}
-
-template <typename Dtype>
-string Net<Dtype>::Forward(const string& input_blob_protos, Dtype* loss) {
-  BlobProtoVector blob_proto_vec;
-  if (net_input_blobs_.size()) {
-    blob_proto_vec.ParseFromString(input_blob_protos);
-    CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size())
-        << "Incorrect input size.";
-    for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) {
-      net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i));
-    }
-  }
-  ForwardPrefilled(loss);
-  blob_proto_vec.Clear();
-  for (int i = 0; i < net_output_blobs_.size(); ++i) {
-    net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs());
-  }
-  string output;
-  blob_proto_vec.SerializeToString(&output);
-  return output;
-}
-
-template <typename Dtype>
-void Net<Dtype>::BackwardFromTo(int start, int end) {
-  CHECK_GE(end, 0);
-  CHECK_LT(start, layers_.size());
-  for (int i = start; i >= end; --i) {
-    if (layer_need_backward_[i]) {
-      layers_[i]->Backward(
-          top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
-      if (debug_info_) { BackwardDebugInfo(i); }
-    }
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::InputDebugInfo(const int input_id) {
-  const Blob<Dtype>& blob = *net_input_blobs_[input_id];
-  const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
-  const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-  LOG(INFO) << "    [Forward] "
-     << "Input " << blob_name << " data: " << data_abs_val_mean;
-}
-
-template <typename Dtype>
-void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
-  for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-    const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
-    const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
-    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Forward] "
-       << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
-       << " data: " << data_abs_val_mean;
-  }
-  for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-       ++param_id) {
-    const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
-    const int net_param_id = param_id_vecs_[layer_id][param_id];
-    const string& blob_name = param_display_names_[net_param_id];
-    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Forward] "
-       << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
-       << " data: " << data_abs_val_mean;
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
-  const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
-  for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
-    if (!bottom_need_backward_[layer_id][bottom_id]) { continue; }
-    const Blob<Dtype>& blob = *bottom_vec[bottom_id];
-    const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-    const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    LOG(INFO) << "    [Backward] "
-        << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
-        << " diff: " << diff_abs_val_mean;
-  }
-  for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-       ++param_id) {
-    if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
-    const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
-    const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    LOG(INFO) << "    [Backward] "
-        << "Layer " << layer_names_[layer_id] << ", param blob " << param_id
-        << " diff: " << diff_abs_val_mean;
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::UpdateDebugInfo(const int param_id) {
-  const Blob<Dtype>& blob = *params_[param_id];
-  const int param_owner = param_owners_[param_id];
-  const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
-  const string& param_display_name = param_display_names_[param_id];
-  const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-  if (param_owner < 0) {
-    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Update] Layer " << layer_name
-        << ", param " << param_display_name
-        << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
-  } else {
-    const string& owner_layer_name =
-        layer_names_[param_layer_indices_[param_owner].first];
-    LOG(INFO) << "    [Update] Layer " << layer_name
-        << ", param blob " << param_display_name
-        << " (owned by layer " << owner_layer_name << ", "
-        << "param " << param_display_names_[param_owners_[param_id]] << ")"
-        << " diff: " << diff_abs_val_mean;
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
-  int num_source_layers = other->layers().size();
-  for (int i = 0; i < num_source_layers; ++i) {
-    Layer<Dtype>* source_layer = other->layers()[i].get();
-    const string& source_layer_name = other->layer_names()[i];
-    int target_layer_id = 0;
-    while (target_layer_id != layer_names_.size() &&
-        layer_names_[target_layer_id] != source_layer_name) {
-      ++target_layer_id;
-    }
-    if (target_layer_id == layer_names_.size()) {
-      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
-      continue;
-    }
-    DLOG(INFO) << "Copying source layer " << source_layer_name;
-    vector<shared_ptr<Blob<Dtype> > >& target_blobs =
-        layers_[target_layer_id]->blobs();
-    CHECK_EQ(target_blobs.size(), source_layer->blobs().size())
-        << "Incompatible number of blobs for layer " << source_layer_name;
-    for (int j = 0; j < target_blobs.size(); ++j) {
-      Blob<Dtype>* source_blob = source_layer->blobs()[j].get();
-      CHECK(target_blobs[j]->shape() == source_blob->shape());
-      target_blobs[j]->ShareData(*source_blob);
-    }
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::BackwardFrom(int start) {
-  BackwardFromTo(start, 0);
-}
-
-template <typename Dtype>
-void Net<Dtype>::BackwardTo(int end) {
-  BackwardFromTo(layers_.size() - 1, end);
-}
-
-template <typename Dtype>
-void Net<Dtype>::Backward() {
-  BackwardFromTo(layers_.size() - 1, 0);
-  if (debug_info_) {
-    Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0;
-    for (int i = 0; i < params_.size(); ++i) {
-      if (param_owners_[i] >= 0) { continue; }
-      asum_data += params_[i]->asum_data();
-      asum_diff += params_[i]->asum_diff();
-      sumsq_data += params_[i]->sumsq_data();
-      sumsq_diff += params_[i]->sumsq_diff();
-    }
-    const Dtype l2norm_data = std::sqrt(sumsq_data);
-    const Dtype l2norm_diff = std::sqrt(sumsq_diff);
-    LOG(ERROR) << "    [Backward] All net params (data, diff): "
-        << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
-        << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::Reshape() {
-  for (int i = 0; i < layers_.size(); ++i) {
-    layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
-  int num_source_layers = param.layer_size();
-  for (int i = 0; i < num_source_layers; ++i) {
-    const LayerParameter& source_layer = param.layer(i);
-    const string& source_layer_name = source_layer.name();
-    int target_layer_id = 0;
-    while (target_layer_id != layer_names_.size() &&
-        layer_names_[target_layer_id] != source_layer_name) {
-      ++target_layer_id;
-    }
-    if (target_layer_id == layer_names_.size()) {
-      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
-      continue;
-    }
-    DLOG(INFO) << "Copying source layer " << source_layer_name;
-    vector<shared_ptr<Blob<Dtype> > >& target_blobs =
-        layers_[target_layer_id]->blobs();
-    CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
-        << "Incompatible number of blobs for layer " << source_layer_name;
-    for (int j = 0; j < target_blobs.size(); ++j) {
-      const bool kReshape = false;
-      target_blobs[j]->FromProto(source_layer.blobs(j), kReshape);
-    }
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
-  NetParameter param;
-  ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
-  CopyTrainedLayersFrom(param);
-}
-
-template <typename Dtype>
-void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
-  param->Clear();
-  param->set_name(name_);
-  // Add bottom and top
-  for (int i = 0; i < net_input_blob_indices_.size(); ++i) {
-    param->add_input(blob_names_[net_input_blob_indices_[i]]);
-  }
-  DLOG(INFO) << "Serializing " << layers_.size() << " layers";
-  for (int i = 0; i < layers_.size(); ++i) {
-    LayerParameter* layer_param = param->add_layer();
-    for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) {
-      layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]);
-    }
-    for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
-      layer_param->add_top(blob_names_[top_id_vecs_[i][j]]);
-    }
-    layers_[i]->ToProto(layer_param, write_diff);
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::Update() {
-  // First, accumulate the diffs of any shared parameters into their owner's
-  // diff. (Assumes that the learning rate, weight decay, etc. have already been
-  // accounted for in the current diff.)
-  for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] < 0) { continue; }
-    if (debug_info_) { UpdateDebugInfo(i); }
-    const int count = params_[i]->count();
-    const Dtype* this_diff;
-    Dtype* owner_diff;
-    switch (Caffe::mode()) {
-    case Caffe::CPU:
-      this_diff = params_[i]->cpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
-      caffe_add(count, this_diff, owner_diff, owner_diff);
-      break;
-    case Caffe::GPU:
-#ifndef CPU_ONLY
-      this_diff = params_[i]->gpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
-      caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
-#else
-      NO_GPU;
-#endif
-      break;
-    default:
-      LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-    }
-  }
-  // Now, update the owned parameters.
-  for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] >= 0) { continue; }
-    if (debug_info_) { UpdateDebugInfo(i); }
-    params_[i]->Update();
-  }
-}
-
-template <typename Dtype>
-bool Net<Dtype>::has_blob(const string& blob_name) const {
-  return blob_names_index_.find(blob_name) != blob_names_index_.end();
-}
-
-template <typename Dtype>
-const shared_ptr<Blob<Dtype> > Net<Dtype>::blob_by_name(
-    const string& blob_name) const {
-  shared_ptr<Blob<Dtype> > blob_ptr;
-  if (has_blob(blob_name)) {
-    blob_ptr = blobs_[blob_names_index_.find(blob_name)->second];
-  } else {
-    blob_ptr.reset((Blob<Dtype>*)(NULL));
-    LOG(WARNING) << "Unknown blob name " << blob_name;
-  }
-  return blob_ptr;
-}
-
-template <typename Dtype>
-bool Net<Dtype>::has_layer(const string& layer_name) const {
-  return layer_names_index_.find(layer_name) != layer_names_index_.end();
-}
-
-template <typename Dtype>
-const shared_ptr<Layer<Dtype> > Net<Dtype>::layer_by_name(
-    const string& layer_name) const {
-  shared_ptr<Layer<Dtype> > layer_ptr;
-  if (has_layer(layer_name)) {
-    layer_ptr = layers_[layer_names_index_.find(layer_name)->second];
-  } else {
-    layer_ptr.reset((Layer<Dtype>*)(NULL));
-    LOG(WARNING) << "Unknown layer name " << layer_name;
-  }
-  return layer_ptr;
-}
-
-INSTANTIATE_CLASS(Net);
-
-}  // namespace caffe
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index bbac8fb5..87f746d8 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -37,6 +37,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
 //#ifndef CPU_ONLY
   //AMD device related initialization
   amdDevice.Init();
+//  cl_int err =  clblasSetup();
 //#else
 //  NO_GPU;
 //#endif
@@ -519,6 +520,7 @@ void SGDSolver<Dtype>::Normalize(int param_id) {
 #ifndef CPU_ONLY
     caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
         net_params[param_id]->mutable_gpu_diff());
+    CHECK_BLOB_DATA(net_params[param_id], 10, "NORM");
 #else
     NO_GPU;
 #endif
@@ -537,6 +539,15 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
   Dtype weight_decay = this->param_.weight_decay();
   string regularization_type = this->param_.regularization_type();
   Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+ 
+ Dtype *cpu_diff =  net_params[param_id]->mutable_cpu_diff();
+  printf("cpu diff before reg\n");
+  for(int i=0; i<10; i++)
+       printf("%f,",cpu_diff[i]);
+  printf("\n");
+
+ 
+
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     if (local_decay) {
@@ -589,6 +600,18 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
   default:
     LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
   }
+  CHECK_BLOB_DATA(net_params[param_id], 10, "REGU");
+ cpu_diff =  net_params[param_id]->mutable_cpu_diff();
+  printf("cpu diff\n");
+  for(int i=0; i<10; i++)
+       printf("%f,",cpu_diff[i]);
+  printf("\n");
+
+ cpu_diff =  temp_[param_id]->mutable_cpu_diff();
+  printf("tmp\n");
+  for(int i=0; i<10; i++)
+       printf("%f,",cpu_diff[i]);
+  printf("\n");
 }
 
 template <typename Dtype>
@@ -613,9 +636,11 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
               net_params[param_id]->gpu_diff(), momentum,
               history_[param_id]->mutable_gpu_data());
-    caffe_copy(net_params[param_id]->count(),
+    caffe_gpu_copy(net_params[param_id]->count(),
         history_[param_id]->gpu_data(),
         net_params[param_id]->mutable_gpu_diff());
+
+CHECK_BLOB_DATA(net_params[param_id], 10, "COMPUTATE");
 #else
     NO_GPU;
 #endif
@@ -693,7 +718,7 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
         this->update_[param_id]->mutable_gpu_data());
 
     // copy
-    caffe_copy(net_params[param_id]->count(),
+    caffe_gpu_copy(net_params[param_id]->count(),
         this->update_[param_id]->gpu_data(),
         net_params[param_id]->mutable_gpu_diff());
 #else
diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp
index 1d269c35..a8c5a83f 100644
--- a/src/caffe/util/benchmark.cpp
+++ b/src/caffe/util/benchmark.cpp
@@ -15,8 +15,8 @@ Timer::Timer()
 Timer::~Timer() {
   if (Caffe::mode() == Caffe::GPU) {
 #ifndef CPU_ONLY
-    CUDA_CHECK(cudaEventDestroy(start_gpu_));
-    CUDA_CHECK(cudaEventDestroy(stop_gpu_));
+   // CUDA_CHECK(cudaEventDestroy(start_gpu_));
+   // CUDA_CHECK(cudaEventDestroy(stop_gpu_));
 #else
     NO_GPU;
 #endif
@@ -108,8 +108,8 @@ void Timer::Init() {
   if (!initted()) {
     if (Caffe::mode() == Caffe::GPU) {
 #ifndef CPU_ONLY
-      CUDA_CHECK(cudaEventCreate(&start_gpu_));
-      CUDA_CHECK(cudaEventCreate(&stop_gpu_));
+     // CUDA_CHECK(cudaEventCreate(&start_gpu_));
+     // CUDA_CHECK(cudaEventCreate(&stop_gpu_));
 #else
       NO_GPU;
 #endif
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 6545d98c..ac44f425 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -2,6 +2,7 @@
 #include <cstdlib>
 #include <cstring>
 
+#include "caffe/common.hpp"
 #include "caffe/util/im2col.hpp"
 #include "caffe/util/math_functions.hpp"
 
@@ -81,13 +82,14 @@ template void col2im_cpu<double>(const double* data_col, const int channels,
     const int stride_w, double* data_im);
 
 
-
+/*
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
     Dtype* data_col) {
+   
 }
 
 
@@ -100,8 +102,8 @@ template void im2col_gpu<double>(const double* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     double* data_col);
-
-
+*/
+/*
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
     const int height, const int width, const int patch_h, const int patch_w,
@@ -118,5 +120,243 @@ template void col2im_gpu<double>(const double* data_col, const int channels,
     const int height, const int width, const int patch_h, const int patch_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im);
+*/
+template <typename Dtype>
+void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col, const int col_offset) {
+
+    int height_col = (height + 2 * pad - ksize) / stride + 1;
+    int width_col = (width + 2 * pad - ksize) / stride + 1;
+    int num_kernels = channels * height_col * width_col;
+    
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&img_offset);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&ksize);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&pad);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&stride);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&width_col);
+    ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_col);
+    ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&col_offset);
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+    clFinish(amdDevice.CommandQueue);
+}
+
+template void im2col_gpu<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, float* data_col, const int col_offset);
+template void im2col_gpu<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, double* data_col, const int col_offset);
+
+template <typename Dtype>
+void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col, const int col_offset) {
+
+    int height_col = (height + 2 * pad - ksize) / stride + 1;
+    int width_col = (width + 2 * pad - ksize) / stride + 1;
+    int num_kernels = 16 * channels * height_col * width_col;
+
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&img_offset);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride);
+    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col);
+    ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_col);
+    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&col_offset);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256 - 256 % width_col};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
+
+template void im2col_16_gpu<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, float* data_col, const int col_offset);
+template void im2col_16_gpu<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, double* data_col, const int col_offset);
+
+template <typename Dtype>
+void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col, const int col_offset, const int optnum) {
+
+    int height_col = (height + 2 * pad - ksize) / stride + 1;
+    int width_col = (width + 2 * pad - ksize) / stride + 1;
+    int num_kernels = optnum * channels * height_col * width_col;
+
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&img_offset);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride);
+    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col);
+    ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_col);
+    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&col_offset);
+    ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256 - 256 % width_col};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
+
+template void im2col_opt_gpu<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, float* data_col, const int col_offset, const int optnum);
+template void im2col_opt_gpu<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, double* data_col, const int col_offset, const int optnum);
+
+template <typename Dtype>
+void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_im, const int img_offset) {
+
+    int height_col = (height + 2 * pad - ksize) / stride + 1;
+    int width_col = (width + 2 * pad - ksize) / stride + 1;
+    int num_kernels = channels * height * width;
+  // To avoid involving atomic operations, we will launch one kernel per
+  // bottom dimension, and then in the kernel add up the top dimensions.
+  // NOLINT_NEXT_LINE(whitespace/operatiors)
+
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride);
+    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col);
+    ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
+
+
+template void col2im_gpu<float>(cl_kernel Kernel, const float* data_col, const int col_offset, const int channels,
+    const int height, const int width, const int psize, const int pad,
+    const int stride, float* data_im, const int img_offset);
+template void col2im_gpu<double>(cl_kernel Kernel, const double* data_col, const int col_offset, const int channels,
+    const int height, const int width, const int psize, const int pad,
+    const int stride, double* data_im, const int img_offset);
+
+template <typename Dtype>
+void im2col_gpu_ocl(cl_mem data_im, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_col, cl_kernel Kernel) {
+
+    int height_col = (height + 2 * pad - ksize) / stride + 1;
+    int width_col = (width + 2 * pad - ksize) / stride + 1;
+    int num_kernels = channels * height_col * width_col;
+
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&ksize);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&pad);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&stride);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&width_col);
+    OCL_CHECK( clSetKernelArg(Kernel,9,sizeof(cl_mem),(void*)&data_col) );
+
+    //std::cout<<"num_kernels"<<num_kernels<<" data_im"<<data_im<<" height"<<height<<" width"<<width<<" ksize"<<ksize<<" pad"<<pad<<" stride"<<stride<<" height_col"<<height_col<<" width_col"<<width_col<<" data_col"<<data_col<<std::endl;
+    if(ret!=CL_SUCCESS){
+        fprintf(stderr,"Failed to Set Args\n");
+    }
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {64};
+    cl_int iStatus = clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL);
+    if(CL_SUCCESS!=iStatus){
+        fprintf(stderr,"Failed to enqueue kernel\n");
+    }
+}
+
+template void im2col_gpu_ocl<float>(cl_mem data_im, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, float* data_col, cl_kernel Kernel);
+template void im2col_gpu_ocl<double>(cl_mem data_im, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, double* data_col, cl_kernel Kernel);
+
+template <typename Dtype>
+void col2im_gpu_ocl(cl_mem data_col, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_im, cl_kernel Kernel) {
+
+    int height_col = (height + 2 * pad - ksize) / stride + 1;
+    int width_col = (width + 2 * pad - ksize) / stride + 1;
+    int num_kernels = channels * height * width;
+  // To avoid involving atomic operations, we will launch one kernel per
+  // bottom dimension, and then in the kernel add up the top dimensions.
+  // NOLINT_NEXT_LINE(whitespace/operatiors)
+
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_mem),(void*)&data_col);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&ksize);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&pad);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&stride);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&width_col);
+    ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_im);
+
+    if(ret!=CL_SUCCESS){
+        fprintf(stderr,"Failed to Set Args\n");
+    }
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {64};
+    cl_int iStatus = clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL);
+    if(CL_SUCCESS!=iStatus){
+        fprintf(stderr,"Failed to enqueue kernel\n");
+    }
+}
+
+
+template void col2im_gpu_ocl<float>(cl_mem data_col, const int channels,
+    const int height, const int width, const int psize, const int pad,
+    const int stride, float* data_im, cl_kernel Kernel);
+template void col2im_gpu_ocl<double>(cl_mem data_col, const int channels,
+    const int height, const int width, const int psize, const int pad,
+    const int stride, double* data_im, cl_kernel Kernel);
 
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 17c2b414..cf9b1ca5 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -542,9 +542,9 @@ template <>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
 }
 
-DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
-                                      - (x[index] < Dtype(0)));
-DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
+//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
+                                    //  - (x[index] < Dtype(0)));
+//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
 
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign);
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit);
@@ -609,6 +609,18 @@ void mul_kernel(const int n, const Dtype* a,
     const Dtype* b, Dtype* y) {
 }
 
+template<>
+void caffe_gpu_sign<float>(const int N, const float *X, float *Y){
+   cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL);
+   caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y);
+}
+
+template<>
+void caffe_gpu_sign<double>(const int N, const double *X, double *Y){
+   cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL);
+   caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y);
+}
+
 template <>
 void caffe_gpu_mul<float>(const int N, const float* a,
     const float* b, float* y) {
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
new file mode 100644
index 00000000..32a477fc
--- /dev/null
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -0,0 +1,447 @@
+// Copyright 2014 AMD DNN contributors.
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <stdlib.h>
+#include <stdio.h>
+#include "caffe/common.hpp"
+#include "caffe/util/ocl_util.hpp"
+namespace caffe {
+
+template <typename Dtype>
+void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){
+    cl_int ret;
+    ret= clSetKernelArg(Kernel,0,sizeof(cl_mem),(void*)&src);
+    OCL_CHECK(ret);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&dst);
+    OCL_CHECK(ret);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&top_offset);
+    OCL_CHECK(ret);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&N_);
+    OCL_CHECK(ret);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&M_);
+    OCL_CHECK(ret);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&packing_num);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size2[]={M_ * packing_num};
+    size_t uiLocal_Work_Size2[]={256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL) );
+}
+
+template void transform_gpu<float>(cl_kernel Kernel, float* src, float* dst, const int top_offset, const int N_, const int M_, const int packing_num);
+template void transform_gpu<double>(cl_kernel Kernel, double* src, double* dst, const int top_offset, const int N_, const int M_, const int packing_num);
+
+template <typename Dtype>
+void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data){
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&scale_data) );
+ 
+    size_t Global_Work_Size[1] = {num};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+// Explicit instantiation
+template void get_max_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* bottom_data, float* scale_data);
+template void get_max_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* bottom_data, double* scale_data);
+
+
+template <typename Dtype>
+void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out){
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {num};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+// Explicit instantiation
+template void exp_gpu<float>(cl_kernel Kernel, const int num, const float* data, float* out);
+template void exp_gpu<double>(cl_kernel Kernel, const int num, const double* data, double* out);
+
+template <typename Dtype>
+void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* scale, Dtype* data){
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&scale) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
+
+    size_t Global_Work_Size[1] = {num*dim};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+// Explicit instantiation
+template void softmax_div_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* scale, float* data);
+template void softmax_div_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* scale, double* data);
+
+template <typename Dtype>
+Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* prob_data, const Dtype* label, cl_mem d_loss){
+
+    OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem),     (void*)&prob_data));
+    OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem),  (void*)&d_loss));
+    OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem),   (void*)&label));
+    OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int),   (void*)&num));
+    OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int),   (void*)&dim));
+    OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype),    NULL));
+
+    size_t globalws[1] = {256};
+    size_t localws[1] = {256};
+    OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, localws, 0, NULL, NULL) );
+    void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL);
+    Dtype loss = *(Dtype*)h_loss;
+    clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, NULL);
+    
+    return loss;
+}
+
+// Explicit instantiation
+template float softmax_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss);
+template double softmax_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss);
+
+template <typename Dtype>
+void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data){
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) );
+
+    size_t Global_Work_Size[1] = {num};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+// Explicit instantiation
+template void scal_gpu<float>(cl_kernel Kernel, const int num, const float alpha, float* data);
+template void scal_gpu<double>(cl_kernel Kernel, const int num, const double alpha, double* data);
+
+template <typename Dtype>
+void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, const Dtype* label){
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&label) );
+
+    size_t Global_Work_Size[1] = {num};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+// Explicit instantiation
+template void diff_gpu<float>(cl_kernel Kernel, const int num, const int dim, float* data, const float* label);
+template void diff_gpu<double>(cl_kernel Kernel, const int num, const int dim, double* data, const double* label);
+
+template <typename Dtype>
+void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_);
+    ret |= clSetKernelArg(Kernel,10, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template  void max_pool_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* top_data);
+template  void max_pool_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* top_data);
+
+template <typename Dtype> 
+void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_);
+    ret |= clSetKernelArg(Kernel, 10,sizeof(cl_int), (void*)&pad_);
+    ret |= clSetKernelArg(Kernel, 11,sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {count * 1};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+
+template void ave_pool_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* top_data);
+template void ave_pool_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_,const int stride_,const int pad_, double* top_data);
+
+template <typename Dtype> 
+void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff ){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&clnum);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&channels_);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&height_);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&width_);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_height_);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&pooled_width_);
+    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_size_);
+    ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_);
+    ret |= clSetKernelArg(Kernel,12, sizeof(cl_mem), (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {count};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+
+template void max_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, const float* top_data, const float* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* bottom_diff);
+template void max_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const double* top_data, const double* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* bottom_diff );
+
+template <typename Dtype> 
+void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_);
+    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&pad_);
+    ret |= clSetKernelArg(Kernel,11, sizeof(cl_mem), (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[]={count};
+    size_t uiLocal_Work_Size[]={256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL));
+}
+
+template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff);
+template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff);
+
+template <typename Dtype> 
+void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void Relu_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data);
+template void Relu_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data);
+
+template <typename Dtype> 
+void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void Relu_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff);
+template void Relu_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff);
+
+template <typename Dtype>
+void caffe_gpu_sign(cl_kernel Kernel,const int N,  const Dtype* X, Dtype * Y ){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {N};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_sign<float>(cl_kernel Kernel,const int N,  const float* X, float* Y );
+template void caffe_gpu_sign<double>(cl_kernel Kernel,const int N,  const double* X, double* Y );
+
+template <typename Dtype>
+void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_div<float> (cl_kernel Kernel, const int n, const float* a, const float* b, float* y);
+template void caffe_gpu_div<double> (cl_kernel Kernel, const int n, const double* a, const double* b, double* y);
+
+template <typename Dtype>
+void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const Dtype alpha, Dtype* y){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_add_scalar<float> (cl_kernel Kernel, const int n, const float alpha, float* y);
+template void caffe_gpu_add_scalar<double> (cl_kernel Kernel, const int n, const double alpha, double* y);
+
+template <typename Dtype>
+void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_mul<float> (cl_kernel Kernel, const int n, const float* a, const float* b, float* y);
+template void caffe_gpu_mul<double> (cl_kernel Kernel, const int n, const double* a, const double* b, double* y);
+
+template <typename Dtype>
+void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_powx<float> (cl_kernel Kernel, const int n, const float* a, const float alpha, float* y);
+template void caffe_gpu_powx<double> (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y);
+
+template <typename Dtype>
+void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
+{
+    cl_int ret;
+    ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
+    ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data);
+    ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem);
+    ret|=clSetKernelArg(kernel,3,sizeof(cl_float),(void*)&scale_); 
+    ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); 
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void Dropout_fp_gpu<float>(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
+template void Dropout_fp_gpu<double>(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
+
+template <typename Dtype>
+void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
+{
+    cl_int ret;
+    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
+    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
+    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&MaskMem);
+    ret |= clSetKernelArg(kernel,3,sizeof(cl_int),  (void*)&threshold_); 
+    ret |= clSetKernelArg(kernel,4,sizeof(cl_float),(void*)&scale_); 
+    ret |= clSetKernelArg(kernel,5,sizeof(cl_mem),  (void*)&bottom_diff); 
+    OCL_CHECK(ret);
+   
+    size_t Global_Work_Size[] = {count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Dropout_bp_gpu<float>(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
+template void Dropout_bp_gpu<double>(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
+
+typedef unsigned int uint32_t;
+struct array4x32 {  uint32_t v[4]; };
+template <typename Dtype>
+void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold){
+        static unsigned c = 0;
+        unsigned nrounds = 20;
+        array4x32  rndctr4;
+        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+        
+        cl_int ret;
+        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
+        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_float),   (void*)&inf);
+        ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_float),   (void*)&sup);
+        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_float),   (void*)&threshold);
+        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&nrounds);
+        ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint),    (void*)&size);
+        OCL_CHECK(ret);
+
+        size_t globalws[1] = {size};
+        size_t localws[1] = {256};
+        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
+}
+template void caffe_gpu_bernoulli<float>(cl_kernel kernel, int* a, const unsigned int n, float inf, float sup, float threshold);
+template void caffe_gpu_bernoulli<double>(cl_kernel kernel, int* a, const unsigned int n, double inf, double sup, double threshold);
+
+
+template <typename Dtype>
+void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels,
+    const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {
+
+    int num_kernels = channels * height * width * optnum;
+  // To avoid involving atomic operations, we will launch one kernel per
+  // bottom dimension, and then in the kernel add up the top dimensions.
+  // NOLINT_NEXT_LINE(whitespace/operatiors)
+
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
+
+template void opttrans<float>(cl_kernel Kernel, const float* data_im, const int im_offset, const int channels,
+    const int height, const int width, float* data_opt, const int opt_offset, const int optnum);
+template void opttrans<double>(cl_kernel Kernel, const double* data_im, const int im_offset, const int channels,
+    const int height, const int width, double* data_opt, const int opt_offset, const int optnum);
+
+
+}  // namespace caffe
+

From c0ff752749500f0f3b992ec1d0bc6f3fb15c7fdf Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 16 Jul 2015 16:00:22 +0800
Subject: [PATCH 008/124] Debugging layer. Not much change for layers

---
 include/caffe/common.hpp        |  30 ++
 src/caffe/OCL_kernel.cl         |   2 +-
 src/caffe/layers/conv_layer.cpp |   9 +-
 src/caffe/layers/lrn_layer.cpp  |   2 +-
 src/caffe/net.cpp               | 864 ++++++++++++++++++++++++++++++++
 src/caffe/solver.cpp            |  23 -
 6 files changed, 903 insertions(+), 27 deletions(-)
 create mode 100644 src/caffe/net.cpp

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index e0703056..debc73a3 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -105,6 +105,36 @@ do{ \
   printf("\n\n"); \
 }while(0)
 
+#define CHECK_GLOBAL_MEM_DATA(global_mem, count, num, marker)\
+do{ \
+  Dtype *global_mem_cpu = new Dtype[count]; \
+  clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)global_mem, \
+              CL_TRUE, 0, sizeof(Dtype)*count, global_mem_cpu,0, NULL, NULL); \
+  size_t sample_interval = count/num; \
+  if(sample_interval == 0){ \
+     sample_interval=1; \
+  } \
+  printf("%s: ", marker); \
+  for(int i=0; i<count; i+=sample_interval){ \
+      printf("%f  ", global_mem_cpu[i]); \
+  } \
+  printf("\n\n"); \
+  delete []global_mem_cpu; \
+}while(0)
+
+#define CHECK_CPU_MEM_DATA(cpu_mem, count, num, marker)\
+do{ \
+  size_t sample_interval = count/num; \
+  if(sample_interval == 0){ \
+     sample_interval=1; \
+  } \
+  printf("%s: ", marker); \
+  for(int i=0; i<count; i+=sample_interval){ \
+      printf("%f  ", cpu_mem[i]); \
+  } \
+  printf("\n\n"); \
+}while(0)
+
 // See PR #1236
 namespace cv { class Mat; }
 
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index d132efe8..8d497ced 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -747,7 +747,7 @@ template <class T>
 __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
      int gdx = get_global_id(0);
      if(gdx < N){
-          Y[gdx] =((0.0<X[gdx])-(X[gdx]<0.0));
+          Y[gdx] =((0.0 < X[gdx])-(X[gdx] < 0.0));
      }
 }
 
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 0e0ba213..aa2debdf 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -67,8 +67,10 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       }
     }
   }
+  CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
+  CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff");
+  CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff");
 
-  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 template <typename Dtype>
@@ -137,7 +139,10 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
   }
   
-  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
+  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
+  CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
+  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
+ // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index e49e2963..2dc18595 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -256,7 +256,7 @@ void LRNLayer<Dtype>::CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom
 template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-     CrossChannelBackward_gpu(top,  propagate_down, bottom);
+     CrossChannelBackward_cpu(top,  propagate_down, bottom);
 }
 
 template <typename Dtype>
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
new file mode 100644
index 00000000..4de7a146
--- /dev/null
+++ b/src/caffe/net.cpp
@@ -0,0 +1,864 @@
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/insert_splits.hpp"
+#include "caffe/util/io.hpp"
+#include "caffe/util/math_functions.hpp"
+#include "caffe/util/upgrade_proto.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+Net<Dtype>::Net(const NetParameter& param) {
+  Init(param);
+}
+
+template <typename Dtype>
+Net<Dtype>::Net(const string& param_file, Phase phase) {
+  NetParameter param;
+  ReadNetParamsFromTextFileOrDie(param_file, &param);
+  param.mutable_state()->set_phase(phase);
+  Init(param);
+}
+
+template <typename Dtype>
+void Net<Dtype>::Init(const NetParameter& in_param) {
+  // Set phase from the state.
+  phase_ = in_param.state().phase();
+  // Filter layers based on their include/exclude rules and
+  // the current NetState.
+  NetParameter filtered_param;
+  FilterNet(in_param, &filtered_param);
+  LOG(INFO) << "Initializing net from parameters: " << std::endl
+            << filtered_param.DebugString();
+  // Create a copy of filtered_param with splits added where necessary.
+  NetParameter param;
+  InsertSplits(filtered_param, &param);
+  // Basically, build all the layers and set up their connections.
+  name_ = param.name();
+  map<string, int> blob_name_to_idx;
+  set<string> available_blobs;
+  CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0)
+      << "Must specify either input_shape OR deprecated input_dim, not both.";
+  if (param.input_dim_size() > 0) {
+    // Deprecated 4D dimensions.
+    CHECK_EQ(param.input_size() * 4, param.input_dim_size())
+        << "Incorrect input blob dimension specifications.";
+  } else {
+    CHECK_EQ(param.input_size(), param.input_shape_size())
+        << "Exactly one input_shape must be specified per input.";
+  }
+  memory_used_ = 0;
+  // set the input blobs
+  for (int input_id = 0; input_id < param.input_size(); ++input_id) {
+    const int layer_id = -1;  // inputs have fake layer ID -1
+    AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
+  }
+  DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+  // For each layer, set up its input and output
+  bottom_vecs_.resize(param.layer_size());
+  top_vecs_.resize(param.layer_size());
+  bottom_id_vecs_.resize(param.layer_size());
+  param_id_vecs_.resize(param.layer_size());
+  top_id_vecs_.resize(param.layer_size());
+  bottom_need_backward_.resize(param.layer_size());
+  for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
+    // Inherit phase from net if unset.
+    if (!param.layer(layer_id).has_phase()) {
+      param.mutable_layer(layer_id)->set_phase(phase_);
+    }
+    // Setup layer.
+    const LayerParameter& layer_param = param.layer(layer_id);
+    if (layer_param.propagate_down_size() > 0) {
+      CHECK_EQ(layer_param.propagate_down_size(),
+          layer_param.bottom_size())
+          << "propagate_down param must be specified "
+          << "either 0 or bottom_size times ";
+    }
+    layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
+    layer_names_.push_back(layer_param.name());
+    LOG(INFO) << "Creating Layer " << layer_param.name();
+    bool need_backward = false;
+
+    // Figure out this layer's input and output
+    for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
+         ++bottom_id) {
+      const int blob_id = AppendBottom(param, layer_id, bottom_id,
+                                       &available_blobs, &blob_name_to_idx);
+      // If a blob needs backward, this layer should provide it.
+      need_backward |= blob_need_backward_[blob_id];
+    }
+    int num_top = layer_param.top_size();
+    for (int top_id = 0; top_id < num_top; ++top_id) {
+      AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx);
+    }
+    // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter
+    // specified fewer than the required number (as specified by
+    // ExactNumTopBlobs() or MinTopBlobs()), allocate them here.
+    Layer<Dtype>* layer = layers_[layer_id].get();
+    if (layer->AutoTopBlobs()) {
+      const int needed_num_top =
+          std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
+      for (; num_top < needed_num_top; ++num_top) {
+        // Add "anonymous" top blobs -- do not modify available_blobs or
+        // blob_name_to_idx as we don't want these blobs to be usable as input
+        // to other layers.
+        AppendTop(param, layer_id, num_top, NULL, NULL);
+      }
+    }
+    // After this layer is connected, set it up.
+    LOG(INFO) << "Setting up " << layer_names_[layer_id];
+    layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
+    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+      if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
+        blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
+      }
+      blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
+      LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
+      if (layer->loss(top_id)) {
+        LOG(INFO) << "    with loss weight " << layer->loss(top_id);
+      }
+      memory_used_ += top_vecs_[layer_id][top_id]->count();
+    }
+    DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+    const int param_size = layer_param.param_size();
+    const int num_param_blobs = layers_[layer_id]->blobs().size();
+    CHECK_LE(param_size, num_param_blobs)
+        << "Too many params specified for layer " << layer_param.name();
+    ParamSpec default_param_spec;
+    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
+      const ParamSpec* param_spec = (param_id < param_size) ?
+          &layer_param.param(param_id) : &default_param_spec;
+      const bool param_need_backward = param_spec->lr_mult() > 0;
+      need_backward |= param_need_backward;
+      layers_[layer_id]->set_param_propagate_down(param_id,
+                                                  param_need_backward);
+    }
+    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
+      AppendParam(param, layer_id, param_id);
+    }
+    // Finally, set the backward flag
+    layer_need_backward_.push_back(need_backward);
+    if (need_backward) {
+      for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) {
+        blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true;
+      }
+    }
+  }
+  // Go through the net backwards to determine which blobs contribute to the
+  // loss.  We can skip backward computation for blobs that don't contribute
+  // to the loss.
+  // Also checks if all bottom blobs don't need backward computation (possible
+  // because the skip_propagate_down param) and so we can skip bacward
+  // computation for the entire layer
+  set<string> blobs_under_loss;
+  set<string> blobs_skip_backp;
+  for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) {
+    bool layer_contributes_loss = false;
+    bool layer_skip_propagate_down = true;
+    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+      const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
+      if (layers_[layer_id]->loss(top_id) ||
+          (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
+        layer_contributes_loss = true;
+      }
+      if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) {
+        layer_skip_propagate_down = false;
+      }
+      if (layer_contributes_loss && !layer_skip_propagate_down)
+        break;
+    }
+    // If this layer can skip backward computation, also all his bottom blobs
+    // don't need backpropagation
+    if (layer_need_backward_[layer_id] && layer_skip_propagate_down) {
+      layer_need_backward_[layer_id] = false;
+      for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
+               ++bottom_id) {
+        bottom_need_backward_[layer_id][bottom_id] = false;
+      }
+    }
+    if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; }
+    if (layer_need_backward_[layer_id]) {
+      LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
+    } else {
+      LOG(INFO) << layer_names_[layer_id]
+                << " does not need backward computation.";
+    }
+    for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
+         ++bottom_id) {
+      if (layer_contributes_loss) {
+        const string& blob_name =
+            blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+        blobs_under_loss.insert(blob_name);
+      } else {
+        bottom_need_backward_[layer_id][bottom_id] = false;
+      }
+      if (!bottom_need_backward_[layer_id][bottom_id]) {
+        const string& blob_name =
+                   blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+        blobs_skip_backp.insert(blob_name);
+      }
+    }
+  }
+  // Handle force_backward if needed.
+  if (param.force_backward()) {
+    for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
+      layer_need_backward_[layer_id] = true;
+      for (int bottom_id = 0;
+           bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
+        bottom_need_backward_[layer_id][bottom_id] =
+            bottom_need_backward_[layer_id][bottom_id] ||
+            layers_[layer_id]->AllowForceBackward(bottom_id);
+        blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] =
+            blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] ||
+            bottom_need_backward_[layer_id][bottom_id];
+      }
+      for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+           ++param_id) {
+        layers_[layer_id]->set_param_propagate_down(param_id, true);
+      }
+    }
+  }
+  // In the end, all remaining blobs are considered output blobs.
+  for (set<string>::iterator it = available_blobs.begin();
+      it != available_blobs.end(); ++it) {
+    LOG(INFO) << "This network produces output " << *it;
+    net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
+    net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
+  }
+  for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) {
+    blob_names_index_[blob_names_[blob_id]] = blob_id;
+  }
+  for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) {
+    layer_names_index_[layer_names_[layer_id]] = layer_id;
+  }
+  GetLearningRateAndWeightDecay();
+  debug_info_ = param.debug_info();
+  LOG(INFO) << "Network initialization done.";
+  LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+}
+
+template <typename Dtype>
+void Net<Dtype>::FilterNet(const NetParameter& param,
+    NetParameter* param_filtered) {
+  NetState net_state(param.state());
+  param_filtered->CopyFrom(param);
+  param_filtered->clear_layer();
+  for (int i = 0; i < param.layer_size(); ++i) {
+    const LayerParameter& layer_param = param.layer(i);
+    const string& layer_name = layer_param.name();
+    CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0)
+          << "Specify either include rules or exclude rules; not both.";
+    // If no include rules are specified, the layer is included by default and
+    // only excluded if it meets one of the exclude rules.
+    bool layer_included = (layer_param.include_size() == 0);
+    for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) {
+      if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) {
+        layer_included = false;
+      }
+    }
+    for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) {
+      if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) {
+        layer_included = true;
+      }
+    }
+    if (layer_included) {
+      param_filtered->add_layer()->CopyFrom(layer_param);
+    }
+  }
+}
+
+template <typename Dtype>
+bool Net<Dtype>::StateMeetsRule(const NetState& state,
+    const NetStateRule& rule, const string& layer_name) {
+  // Check whether the rule is broken due to phase.
+  if (rule.has_phase()) {
+      if (rule.phase() != state.phase()) {
+        LOG(INFO) << "The NetState phase (" << state.phase()
+          << ") differed from the phase (" << rule.phase()
+          << ") specified by a rule in layer " << layer_name;
+        return false;
+      }
+  }
+  // Check whether the rule is broken due to min level.
+  if (rule.has_min_level()) {
+    if (state.level() < rule.min_level()) {
+      LOG(INFO) << "The NetState level (" << state.level()
+          << ") is above the min_level (" << rule.min_level()
+          << ") specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  // Check whether the rule is broken due to max level.
+  if (rule.has_max_level()) {
+    if (state.level() > rule.max_level()) {
+      LOG(INFO) << "The NetState level (" << state.level()
+          << ") is above the max_level (" << rule.max_level()
+          << ") specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  // Check whether the rule is broken due to stage. The NetState must
+  // contain ALL of the rule's stages to meet it.
+  for (int i = 0; i < rule.stage_size(); ++i) {
+    // Check that the NetState contains the rule's ith stage.
+    bool has_stage = false;
+    for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
+      if (rule.stage(i) == state.stage(j)) { has_stage = true; }
+    }
+    if (!has_stage) {
+      LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
+                << "' specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  // Check whether the rule is broken due to not_stage. The NetState must
+  // contain NONE of the rule's not_stages to meet it.
+  for (int i = 0; i < rule.not_stage_size(); ++i) {
+    // Check that the NetState contains the rule's ith not_stage.
+    bool has_stage = false;
+    for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
+      if (rule.not_stage(i) == state.stage(j)) { has_stage = true; }
+    }
+    if (has_stage) {
+      LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
+                << "' specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Helper for Net::Init: add a new input or top blob to the net.  (Inputs have
+// layer_id == -1, tops have layer_id >= 0.)
+template <typename Dtype>
+void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
+                           const int top_id, set<string>* available_blobs,
+                           map<string, int>* blob_name_to_idx) {
+  shared_ptr<LayerParameter> layer_param((layer_id >= 0) ?
+    (new LayerParameter(param.layer(layer_id))) : NULL);
+  const string& blob_name = layer_param ?
+      (layer_param->top_size() > top_id ?
+          layer_param->top(top_id) : "(automatic)") : param.input(top_id);
+  // Check if we are doing in-place computation
+  if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
+      blob_name == layer_param->bottom(top_id)) {
+    // In-place computation
+    LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
+    top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
+    top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
+  } else if (blob_name_to_idx &&
+             blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
+    // If we are not doing in-place computation but have duplicated blobs,
+    // raise an error.
+    LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
+  } else {
+    // Normal output.
+    if (layer_param) {
+      LOG(INFO) << layer_param->name() << " -> " << blob_name;
+    } else {
+      LOG(INFO) << "Input " << top_id << " -> " << blob_name;
+    }
+    shared_ptr<Blob<Dtype> > blob_pointer(new Blob<Dtype>());
+    const int blob_id = blobs_.size();
+    blobs_.push_back(blob_pointer);
+    blob_names_.push_back(blob_name);
+    blob_need_backward_.push_back(false);
+    if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; }
+    if (layer_id == -1) {
+      // Set the (explicitly specified) dimensions of the input blob.
+      if (param.input_dim_size() > 0) {
+        blob_pointer->Reshape(param.input_dim(top_id * 4),
+                              param.input_dim(top_id * 4 + 1),
+                              param.input_dim(top_id * 4 + 2),
+                              param.input_dim(top_id * 4 + 3));
+      } else {
+        blob_pointer->Reshape(param.input_shape(top_id));
+      }
+      net_input_blob_indices_.push_back(blob_id);
+      net_input_blobs_.push_back(blob_pointer.get());
+    } else {
+      top_id_vecs_[layer_id].push_back(blob_id);
+      top_vecs_[layer_id].push_back(blob_pointer.get());
+    }
+  }
+  if (available_blobs) { available_blobs->insert(blob_name); }
+}
+
+// Helper for Net::Init: add a new bottom blob to the net.
+template <typename Dtype>
+int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
+    const int bottom_id, set<string>* available_blobs,
+    map<string, int>* blob_name_to_idx) {
+  const LayerParameter& layer_param = param.layer(layer_id);
+  const string& blob_name = layer_param.bottom(bottom_id);
+  if (available_blobs->find(blob_name) == available_blobs->end()) {
+    LOG(FATAL) << "Unknown blob input " << blob_name
+               << " (at index " << bottom_id << ") to layer " << layer_id;
+  }
+  const int blob_id = (*blob_name_to_idx)[blob_name];
+  LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
+  bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
+  bottom_id_vecs_[layer_id].push_back(blob_id);
+  available_blobs->erase(blob_name);
+  bool propagate_down = true;
+  // Check if the backpropagation on bottom_id should be skipped
+  if (layer_param.propagate_down_size() > 0)
+    propagate_down = layer_param.propagate_down(bottom_id);
+  const bool need_backward = blob_need_backward_[blob_id] &&
+                          propagate_down;
+  bottom_need_backward_[layer_id].push_back(need_backward);
+  return blob_id;
+}
+
+template <typename Dtype>
+void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
+                             const int param_id) {
+  const LayerParameter& layer_param = layers_[layer_id]->layer_param();
+  const int param_size = layer_param.param_size();
+  string param_name =
+      (param_size > param_id) ? layer_param.param(param_id).name() : "";
+  if (param_name.size()) {
+    param_display_names_.push_back(param_name);
+  } else {
+    ostringstream param_display_name;
+    param_display_name << param_id;
+    param_display_names_.push_back(param_display_name.str());
+  }
+  const int net_param_id = params_.size();
+  params_.push_back(layers_[layer_id]->blobs()[param_id]);
+  param_id_vecs_[layer_id].push_back(net_param_id);
+  param_layer_indices_.push_back(make_pair(layer_id, param_id));
+  if (!param_size || !param_name.size() || (param_name.size() &&
+      param_names_index_.find(param_name) == param_names_index_.end())) {
+    // This layer "owns" this parameter blob -- it is either anonymous
+    // (i.e., not given a param_name) or explicitly given a name that we
+    // haven't already seen.
+    param_owners_.push_back(-1);
+    if (param_name.size()) {
+      param_names_index_[param_name] = net_param_id;
+    }
+  } else {
+    // Named param blob with name we've seen before: share params
+    const int owner_net_param_id = param_names_index_[param_name];
+    param_owners_.push_back(owner_net_param_id);
+    const pair<int, int>& owner_index =
+        param_layer_indices_[owner_net_param_id];
+    const int owner_layer_id = owner_index.first;
+    const int owner_param_id = owner_index.second;
+    LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
+              << "layer '" << layer_names_[owner_layer_id] << "', param "
+              << "index " << owner_param_id;
+    Blob<Dtype>* this_blob = layers_[layer_id]->blobs()[param_id].get();
+    Blob<Dtype>* owner_blob =
+        layers_[owner_layer_id]->blobs()[owner_param_id].get();
+    const int param_size = layer_param.param_size();
+    if (param_size > param_id && (layer_param.param(param_id).share_mode() ==
+                                  ParamSpec_DimCheckMode_PERMISSIVE)) {
+      // Permissive dimension checking -- only check counts are the same.
+      CHECK_EQ(this_blob->count(), owner_blob->count())
+          << "Shared parameter blobs must have the same count.";
+    } else {
+      // Strict dimension checking -- all dims must be the same.
+      CHECK(this_blob->shape() == owner_blob->shape());
+    }
+    layers_[layer_id]->blobs()[param_id]->ShareData(
+        *layers_[owner_layer_id]->blobs()[owner_param_id]);
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::GetLearningRateAndWeightDecay() {
+  LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
+  ParamSpec default_param_spec;
+  for (int i = 0; i < layers_.size(); ++i) {
+    vector<shared_ptr<Blob<Dtype> > >& layer_blobs = layers_[i]->blobs();
+    for (int j = 0; j < layer_blobs.size(); ++j) {
+      const ParamSpec* param_spec =
+          (layers_[i]->layer_param().param_size() > j) ?
+          &layers_[i]->layer_param().param(j) : &default_param_spec;
+      params_lr_.push_back(param_spec->lr_mult());
+      params_weight_decay_.push_back(param_spec->decay_mult());
+    }
+  }
+}
+
+template <typename Dtype>
+Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
+  CHECK_GE(start, 0);
+  CHECK_LT(end, layers_.size());
+  Dtype loss = 0;
+  if (debug_info_) {
+    for (int i = 0; i < net_input_blobs_.size(); ++i) {
+      InputDebugInfo(i);
+    }
+  }
+  for (int i = start; i <= end; ++i) {
+    // LOG(ERROR) << "Forwarding " << layer_names_[i];
+//Yibing add for porting
+   printf("Forwarding %s\n",layer_names_[i].c_str());
+   Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
+    loss += layer_loss;
+    if (debug_info_) { ForwardDebugInfo(i); }
+//Yibing add for porting
+    clFinish(amdDevice.CommandQueue);
+  }
+  return loss;
+}
+
+template <typename Dtype>
+Dtype Net<Dtype>::ForwardFrom(int start) {
+  return ForwardFromTo(start, layers_.size() - 1);
+}
+
+template <typename Dtype>
+Dtype Net<Dtype>::ForwardTo(int end) {
+  return ForwardFromTo(0, end);
+}
+
+template <typename Dtype>
+const vector<Blob<Dtype>*>& Net<Dtype>::ForwardPrefilled(Dtype* loss) {
+  if (loss != NULL) {
+    *loss = ForwardFromTo(0, layers_.size() - 1);
+  } else {
+    ForwardFromTo(0, layers_.size() - 1);
+  }
+  return net_output_blobs_;
+}
+
+template <typename Dtype>
+const vector<Blob<Dtype>*>& Net<Dtype>::Forward(
+    const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
+  // Copy bottom to internal bottom
+  for (int i = 0; i < bottom.size(); ++i) {
+    net_input_blobs_[i]->CopyFrom(*bottom[i]);
+  }
+  return ForwardPrefilled(loss);
+}
+
+template <typename Dtype>
+string Net<Dtype>::Forward(const string& input_blob_protos, Dtype* loss) {
+  BlobProtoVector blob_proto_vec;
+  if (net_input_blobs_.size()) {
+    blob_proto_vec.ParseFromString(input_blob_protos);
+    CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size())
+        << "Incorrect input size.";
+    for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) {
+      net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i));
+    }
+  }
+  ForwardPrefilled(loss);
+  blob_proto_vec.Clear();
+  for (int i = 0; i < net_output_blobs_.size(); ++i) {
+    net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs());
+  }
+  string output;
+  blob_proto_vec.SerializeToString(&output);
+  return output;
+}
+
+template <typename Dtype>
+void Net<Dtype>::BackwardFromTo(int start, int end) {
+  CHECK_GE(end, 0);
+  CHECK_LT(start, layers_.size());
+  for (int i = start; i >= end; --i) {
+    if (layer_need_backward_[i]) {
+//Yibing add for porting
+      printf("Backwarding %s\n",layer_names_[i].c_str());
+      layers_[i]->Backward(
+          top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
+      if (debug_info_) { BackwardDebugInfo(i); }
+//Yibing add for porting
+    clFinish(amdDevice.CommandQueue);
+    }
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::InputDebugInfo(const int input_id) {
+  const Blob<Dtype>& blob = *net_input_blobs_[input_id];
+  const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
+  const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+  LOG(INFO) << "    [Forward] "
+     << "Input " << blob_name << " data: " << data_abs_val_mean;
+}
+
+template <typename Dtype>
+void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
+  for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+    const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
+    const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
+    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+    LOG(INFO) << "    [Forward] "
+       << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
+       << " data: " << data_abs_val_mean;
+  }
+  for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+       ++param_id) {
+    const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
+    const int net_param_id = param_id_vecs_[layer_id][param_id];
+    const string& blob_name = param_display_names_[net_param_id];
+    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+    LOG(INFO) << "    [Forward] "
+       << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
+       << " data: " << data_abs_val_mean;
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
+  const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
+  for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
+    if (!bottom_need_backward_[layer_id][bottom_id]) { continue; }
+    const Blob<Dtype>& blob = *bottom_vec[bottom_id];
+    const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+    const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+    LOG(INFO) << "    [Backward] "
+        << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
+        << " diff: " << diff_abs_val_mean;
+  }
+  for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+       ++param_id) {
+    if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
+    const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
+    const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+    LOG(INFO) << "    [Backward] "
+        << "Layer " << layer_names_[layer_id] << ", param blob " << param_id
+        << " diff: " << diff_abs_val_mean;
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::UpdateDebugInfo(const int param_id) {
+  const Blob<Dtype>& blob = *params_[param_id];
+  const int param_owner = param_owners_[param_id];
+  const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
+  const string& param_display_name = param_display_names_[param_id];
+  const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+  if (param_owner < 0) {
+    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+    LOG(INFO) << "    [Update] Layer " << layer_name
+        << ", param " << param_display_name
+        << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
+  } else {
+    const string& owner_layer_name =
+        layer_names_[param_layer_indices_[param_owner].first];
+    LOG(INFO) << "    [Update] Layer " << layer_name
+        << ", param blob " << param_display_name
+        << " (owned by layer " << owner_layer_name << ", "
+        << "param " << param_display_names_[param_owners_[param_id]] << ")"
+        << " diff: " << diff_abs_val_mean;
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
+  int num_source_layers = other->layers().size();
+  for (int i = 0; i < num_source_layers; ++i) {
+    Layer<Dtype>* source_layer = other->layers()[i].get();
+    const string& source_layer_name = other->layer_names()[i];
+    int target_layer_id = 0;
+    while (target_layer_id != layer_names_.size() &&
+        layer_names_[target_layer_id] != source_layer_name) {
+      ++target_layer_id;
+    }
+    if (target_layer_id == layer_names_.size()) {
+      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
+      continue;
+    }
+    DLOG(INFO) << "Copying source layer " << source_layer_name;
+    vector<shared_ptr<Blob<Dtype> > >& target_blobs =
+        layers_[target_layer_id]->blobs();
+    CHECK_EQ(target_blobs.size(), source_layer->blobs().size())
+        << "Incompatible number of blobs for layer " << source_layer_name;
+    for (int j = 0; j < target_blobs.size(); ++j) {
+      Blob<Dtype>* source_blob = source_layer->blobs()[j].get();
+      CHECK(target_blobs[j]->shape() == source_blob->shape());
+      target_blobs[j]->ShareData(*source_blob);
+    }
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::BackwardFrom(int start) {
+  BackwardFromTo(start, 0);
+}
+
+template <typename Dtype>
+void Net<Dtype>::BackwardTo(int end) {
+  BackwardFromTo(layers_.size() - 1, end);
+}
+
+template <typename Dtype>
+void Net<Dtype>::Backward() {
+  BackwardFromTo(layers_.size() - 1, 0);
+  if (debug_info_) {
+    Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0;
+    for (int i = 0; i < params_.size(); ++i) {
+      if (param_owners_[i] >= 0) { continue; }
+      asum_data += params_[i]->asum_data();
+      asum_diff += params_[i]->asum_diff();
+      sumsq_data += params_[i]->sumsq_data();
+      sumsq_diff += params_[i]->sumsq_diff();
+    }
+    const Dtype l2norm_data = std::sqrt(sumsq_data);
+    const Dtype l2norm_diff = std::sqrt(sumsq_diff);
+    LOG(ERROR) << "    [Backward] All net params (data, diff): "
+        << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
+        << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::Reshape() {
+  for (int i = 0; i < layers_.size(); ++i) {
+    layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
+  int num_source_layers = param.layer_size();
+  for (int i = 0; i < num_source_layers; ++i) {
+    const LayerParameter& source_layer = param.layer(i);
+    const string& source_layer_name = source_layer.name();
+    int target_layer_id = 0;
+    while (target_layer_id != layer_names_.size() &&
+        layer_names_[target_layer_id] != source_layer_name) {
+      ++target_layer_id;
+    }
+    if (target_layer_id == layer_names_.size()) {
+      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
+      continue;
+    }
+    DLOG(INFO) << "Copying source layer " << source_layer_name;
+    vector<shared_ptr<Blob<Dtype> > >& target_blobs =
+        layers_[target_layer_id]->blobs();
+    CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
+        << "Incompatible number of blobs for layer " << source_layer_name;
+    for (int j = 0; j < target_blobs.size(); ++j) {
+      const bool kReshape = false;
+      target_blobs[j]->FromProto(source_layer.blobs(j), kReshape);
+    }
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
+  NetParameter param;
+  ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
+  CopyTrainedLayersFrom(param);
+}
+
+template <typename Dtype>
+void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
+  param->Clear();
+  param->set_name(name_);
+  // Add bottom and top
+  for (int i = 0; i < net_input_blob_indices_.size(); ++i) {
+    param->add_input(blob_names_[net_input_blob_indices_[i]]);
+  }
+  DLOG(INFO) << "Serializing " << layers_.size() << " layers";
+  for (int i = 0; i < layers_.size(); ++i) {
+    LayerParameter* layer_param = param->add_layer();
+    for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) {
+      layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]);
+    }
+    for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
+      layer_param->add_top(blob_names_[top_id_vecs_[i][j]]);
+    }
+    layers_[i]->ToProto(layer_param, write_diff);
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::Update() {
+  // First, accumulate the diffs of any shared parameters into their owner's
+  // diff. (Assumes that the learning rate, weight decay, etc. have already been
+  // accounted for in the current diff.)
+  for (int i = 0; i < params_.size(); ++i) {
+    if (param_owners_[i] < 0) { continue; }
+    if (debug_info_) { UpdateDebugInfo(i); }
+    const int count = params_[i]->count();
+    const Dtype* this_diff;
+    Dtype* owner_diff;
+      this_diff = params_[i]->cpu_diff();
+      owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();    
+
+    switch (Caffe::mode()) {
+    case Caffe::CPU:
+      this_diff = params_[i]->cpu_diff();
+      owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
+      caffe_add(count, this_diff, owner_diff, owner_diff);
+      break;
+    case Caffe::GPU:
+#ifndef CPU_ONLY
+      this_diff = params_[i]->gpu_diff();
+      owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
+     // caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
+      caffe_gpu_axpy<Dtype>(count, 1.0, this_diff, owner_diff);
+#else
+      NO_GPU;
+#endif
+      break;
+    default:
+      LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+    }
+  }
+  // Now, update the owned parameters.
+  for (int i = 0; i < params_.size(); ++i) {
+    if (param_owners_[i] >= 0) { continue; }
+    if (debug_info_) { UpdateDebugInfo(i); }
+    params_[i]->Update();
+  }
+}
+
+template <typename Dtype>
+ bool Net<Dtype>::has_blob(const string& blob_name) const {
+  return blob_names_index_.find(blob_name) != blob_names_index_.end();
+}
+
+template <typename Dtype>
+const shared_ptr<Blob<Dtype> > Net<Dtype>::blob_by_name(
+    const string& blob_name) const {
+  shared_ptr<Blob<Dtype> > blob_ptr;
+  if (has_blob(blob_name)) {
+    blob_ptr = blobs_[blob_names_index_.find(blob_name)->second];
+  } else {
+    blob_ptr.reset((Blob<Dtype>*)(NULL));
+    LOG(WARNING) << "Unknown blob name " << blob_name;
+  }
+  return blob_ptr;
+}
+
+template <typename Dtype>
+bool Net<Dtype>::has_layer(const string& layer_name) const {
+  return layer_names_index_.find(layer_name) != layer_names_index_.end();
+}
+
+template <typename Dtype>
+const shared_ptr<Layer<Dtype> > Net<Dtype>::layer_by_name(
+    const string& layer_name) const {
+  shared_ptr<Layer<Dtype> > layer_ptr;
+  if (has_layer(layer_name)) {
+    layer_ptr = layers_[layer_names_index_.find(layer_name)->second];
+  } else {
+    layer_ptr.reset((Layer<Dtype>*)(NULL));
+    LOG(WARNING) << "Unknown layer name " << layer_name;
+  }
+  return layer_ptr;
+}
+
+INSTANTIATE_CLASS(Net);
+
+}  // namespace caffe
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 87f746d8..33bb5ed5 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -520,7 +520,6 @@ void SGDSolver<Dtype>::Normalize(int param_id) {
 #ifndef CPU_ONLY
     caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
         net_params[param_id]->mutable_gpu_diff());
-    CHECK_BLOB_DATA(net_params[param_id], 10, "NORM");
 #else
     NO_GPU;
 #endif
@@ -540,14 +539,6 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
   string regularization_type = this->param_.regularization_type();
   Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
  
- Dtype *cpu_diff =  net_params[param_id]->mutable_cpu_diff();
-  printf("cpu diff before reg\n");
-  for(int i=0; i<10; i++)
-       printf("%f,",cpu_diff[i]);
-  printf("\n");
-
- 
-
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     if (local_decay) {
@@ -600,18 +591,6 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
   default:
     LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
   }
-  CHECK_BLOB_DATA(net_params[param_id], 10, "REGU");
- cpu_diff =  net_params[param_id]->mutable_cpu_diff();
-  printf("cpu diff\n");
-  for(int i=0; i<10; i++)
-       printf("%f,",cpu_diff[i]);
-  printf("\n");
-
- cpu_diff =  temp_[param_id]->mutable_cpu_diff();
-  printf("tmp\n");
-  for(int i=0; i<10; i++)
-       printf("%f,",cpu_diff[i]);
-  printf("\n");
 }
 
 template <typename Dtype>
@@ -639,8 +618,6 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     caffe_gpu_copy(net_params[param_id]->count(),
         history_[param_id]->gpu_data(),
         net_params[param_id]->mutable_gpu_diff());
-
-CHECK_BLOB_DATA(net_params[param_id], 10, "COMPUTATE");
 #else
     NO_GPU;
 #endif

From 10f731bf09d3bf2aac96279c5322dd164dfb2d5c Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Thu, 23 Jul 2015 14:48:37 +0800
Subject: [PATCH 009/124] OpenCL porting for relu, sofmax layer

---
 include/caffe/loss_layers.hpp           | 11 +++-
 include/caffe/neuron_layers.hpp         | 21 +++++-
 include/caffe/util/ocl_wrapper.hpp      | 18 ++++-
 src/caffe/OCL_kernel.cl                 | 86 +++++++++++++++++++++---
 src/caffe/layers/dropout_layer.cpp      | 66 +++++++++++++++++--
 src/caffe/layers/relu_layer.cpp         | 49 ++++++++++++--
 src/caffe/layers/softmax_loss_layer.cpp | 88 +++++++++++++++++++++++--
 src/caffe/util/math_functions.cpp       |  6 ++
 src/caffe/util/ocl_wrapper.cpp          | 70 ++++++++++++++++++--
 9 files changed, 379 insertions(+), 36 deletions(-)

diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index 86c34241..5aa02be1 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -695,6 +695,7 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
     */
   explicit SoftmaxWithLossLayer(const LayerParameter& param)
       : LossLayer<Dtype>(param) {}
+  ~SoftmaxWithLossLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -742,8 +743,8 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-
+  void ocl_setup();
+ 
   /// The internal SoftmaxLayer used to map predictions to a distribution.
   shared_ptr<Layer<Dtype> > softmax_layer_;
   /// prob stores the output probability predictions from the SoftmaxLayer.
@@ -761,6 +762,12 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
   bool normalize_;
 
   int softmax_axis_, outer_num_, inner_num_;
+  
+ protected:
+   cl_kernel diff_kernel, scal_kernel, softmax_kernel;
+   cl_mem d_loss;
+   cl_kernel softmax_loss_fp_kernel;
+   cl_kernel softmax_loss_bp_kernel;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index c2e0774a..65a7e9f2 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -167,6 +167,13 @@ class DropoutLayer : public NeuronLayer<Dtype> {
       const vector<Blob<Dtype>*>& top);
 
   virtual inline const char* type() const { return "Dropout"; }
+  virtual ~DropoutLayer();
+  void ocl_setup(int bottom_count);
+  cl_mem MaskMem;
+  cl_kernel ocl_Kernel_Fwd;
+  cl_kernel ocl_Kernel_Bwd;
+  cl_kernel rng_kernel;
+
 
  protected:
   /**
@@ -420,8 +427,10 @@ class ReLULayer : public NeuronLayer<Dtype> {
    *     the value @f$ \nu @f$ by which negative values are multiplied.
    */
   explicit ReLULayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-
+      : NeuronLayer<Dtype>(param) {
+        ocl_setup();
+    }
+  ~ReLULayer();
   virtual inline const char* type() const { return "ReLU"; }
 
  protected:
@@ -473,6 +482,14 @@ class ReLULayer : public NeuronLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+//OpenCL related setiup
+  void ocl_setup();
+
+ protected:
+   cl_kernel ReLUForward_kernel;
+   cl_kernel ReLUBackward_kernel;
+
 };
 
 #ifdef USE_CUDNN
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index df9e855e..519f15d4 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -43,10 +43,10 @@ template <typename Dtype>
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff);
 
 template <typename Dtype>
-void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data);
+void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope);
 
 template <typename Dtype>
-void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff);
+void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
 
 template <typename Dtype>
 void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y);
@@ -62,6 +62,20 @@ void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype
 
 template <typename Dtype>
 void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y );
+
+template <typename Dtype>
+void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads,
+          const Dtype* prob_data, const Dtype* label, Dtype* loss,
+          const int num, const int dim, const int spatial_dim,
+          const bool has_ignore_label_, const int ignore_label_,
+          Dtype* counts);
+
+template <typename Dtype>
+void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* top,
+          const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+          const int spatial_dim, const bool has_ignore_label_,
+          const int ignore_label_, Dtype* counts);
+
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 8d497ced..8a5d1138 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -1098,25 +1098,25 @@ template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePo
 template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_size, const int stride, const int pad, __global double* bottom_diff);
 
 template <class T>
-__kernel void ReLUForward(const int count, __global T* in, __global T* out){
+__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
 	int index = get_global_id(0);
 	if(index < count)
-		out[index] = in[index] > 0? in[index]:0;
+		out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
 }
 
-//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out);
-template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out);
-template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out);
+//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
 
 template <class T>
-__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff){
+__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
 	int index = get_global_id(0);
         if(index < count)
-		out_diff[index] = in_diff[index] * (in_data[index] > 0);
+		out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope;
 }
 
-template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff);
-template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff);
+template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
+template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
 
 template <class T>
 __kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
@@ -1193,6 +1193,74 @@ __kernel void softmax(__global T* prob_data, __global T* loss, __global T* label
 template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
 template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
 
+template <class T>
+__kernel void SoftmaxLossForwardGPU(const int nthreads,
+          __global T* prob_data, __global T* label,__global T* loss,
+          int num, int dim, int spatial_dim,
+          bool has_ignore_label_, int ignore_label_,
+          __global T* counts) {
+    int index = get_global_id(0);
+    if(index < nthreads) {
+        const int n = index / spatial_dim;
+        const int s = index % spatial_dim;
+        const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+        if (has_ignore_label_ && label_value == ignore_label_) {
+           loss[index] = 0;
+           counts[index] = 0;
+        } else {
+           loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
+                      T(FLT_MIN)));
+        counts[index] = 1;
+    }
+  }
+}
+
+template __attribute__ ((mangled_name(softmax_loss_fp_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
+          __global float* prob_data, __global float* label,__global float* loss,
+          int num, int dim, int spatial_dim,
+          bool has_ignore_label_, int ignore_label_,
+          __global float* counts);
+template __attribute__ ((mangled_name(softmax_loss_fp_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
+          __global double* prob_data, __global double* label,__global double* loss,
+          int num, int dim, int spatial_dim,
+          bool has_ignore_label_, int ignore_label_,
+          __global double* counts);
+
+template <class T>
+__kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
+          __global T* label,__global T* bottom_diff, int num, int dim,
+          int spatial_dim, bool has_ignore_label_,
+          int ignore_label_, T* counts) {
+    const int channels = dim / spatial_dim;
+   int index  = get_global_id(0);
+   if(index <  nthreads) {
+       const int n = index / spatial_dim;
+       const int s = index % spatial_dim;
+       const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+
+      if (has_ignore_label_ && label_value == ignore_label_) {
+          for (int c = 0; c < channels; ++c) {
+              bottom_diff[n * dim + c * spatial_dim + s] = 0;
+          }
+          counts[index] = 0;
+    } else {
+      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
+      counts[index] = 1;
+    }
+  }
+}
+
+
+template __attribute__ ((mangled_name(softmax_loss_bp_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
+          __global float* label,__global float* bottom_diff, int num, int dim,
+          int spatial_dim, bool has_ignore_label_,
+          int ignore_label_, float* counts);
+
+template __attribute__ ((mangled_name(softmax_loss_bp_double)))  __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
+          __global double* label,__global double* bottom_diff, int num, int dim,
+          int spatial_dim, bool has_ignore_label_,
+          int ignore_label_, double* counts);
+
 
 template <class T>
 __kernel void diff (const int num, const int dim, __global T* data, __global T* label){
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 4239443d..7799950e 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -10,6 +10,27 @@
 
 namespace caffe {
 
+template <typename Dtype>
+void DropoutLayer<Dtype>::ocl_setup(int bottom_count){
+    //create OpenCL related cl_mem objects and kernels
+    //if(Caffe::mode() == Caffe::GPU){
+    cl_int _err;
+    ocl_Kernel_Fwd = clCreateKernel(amdDevice.Program,"DropoutForwardfloat",&_err);
+    ocl_Kernel_Bwd = clCreateKernel(amdDevice.Program,"DropoutBackwardfloat",&_err);
+    rng_kernel = clCreateKernel(amdDevice.Program,"RNGBernoulliFloat",&_err);
+    OCL_CHECK(_err);
+    MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, bottom_count*sizeof(int), NULL, NULL);
+}
+
+template <typename Dtype>
+DropoutLayer<Dtype>::~DropoutLayer(){
+   OCL_CHECK( clReleaseMemObject(MaskMem) );
+   OCL_CHECK( clReleaseKernel(ocl_Kernel_Fwd) );
+   OCL_CHECK( clReleaseKernel(ocl_Kernel_Bwd) );
+   OCL_CHECK( clReleaseKernel(rng_kernel) );
+}
+
+
 template <typename Dtype>
 void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -19,6 +40,7 @@ void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   DCHECK(threshold_ < 1.);
   scale_ = 1. / (1. - threshold_);
   uint_thres_ = static_cast<unsigned int>(UINT_MAX * threshold_);
+  ocl_setup(bottom[0]->count());
 }
 
 template <typename Dtype>
@@ -69,14 +91,50 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-     Forward_cpu(bottom, top);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  if (this->phase_ == TRAIN) {
+    unsigned int* mask =
+        static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
+//    caffe_gpu_rng_uniform(count, mask);
+ 
+     caffe_gpu_bernoulli(rng_kernel, (int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_);
+    Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+
+    // set thresholds
+    // NOLINT_NEXT_LINE(whitespace/operators)
+//    DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+  //      count, bottom_data, mask, uint_thres_, scale_, top_data);
+   // CUDA_POST_KERNEL_CHECK;
+  } else {
+    caffe_gpu_copy(count, bottom_data, top_data);
+  }
 }
 
+
 template <typename Dtype>
 void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-    Backward_cpu(top, propagate_down, bottom);
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    if (this->phase_ == TRAIN) {
+      const unsigned int* mask =
+          static_cast<const unsigned int*>(rand_vec_.gpu_data());
+      const int count = bottom[0]->count();
+      // NOLINT_NEXT_LINE(whitespace/operators)
+     // DropoutBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
+       // CAFFE_CUDA_NUM_THREADS>>>(
+         // count, top_diff, mask, uint_thres_, scale_, bottom_diff);
+    //  CUDA_POST_KERNEL_CHECK;
+       Dropout_bp_gpu(ocl_Kernel_Bwd, count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff);
+    } else {
+      caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
+    }
+  }
 }
 
 
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index ce85b1cc..d7b0a838 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -5,6 +5,20 @@
 #include "caffe/vision_layers.hpp"
 
 namespace caffe {
+template <typename Dtype>
+void ReLULayer<Dtype>::ocl_setup(){
+    cl_int _err=0;
+    ReLUForward_kernel = clCreateKernel(amdDevice.Program,"ReLUForwardfloat",&_err);
+    ReLUBackward_kernel = clCreateKernel(amdDevice.Program,"ReLUBackwardfloat",&_err);
+}
+
+template <typename Dtype>
+ReLULayer<Dtype>::~ReLULayer(){
+  OCL_CHECK( clReleaseKernel(ReLUForward_kernel) );
+  OCL_CHECK( clReleaseKernel(ReLUBackward_kernel) );
+}
+
+
 
 template <typename Dtype>
 void ReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
@@ -36,16 +50,43 @@ void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+
 template <typename Dtype>
 void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-    Forward_cpu(bottom, top);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+ // ReLUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+   //   count, bottom_data, top_data, negative_slope);
+  //CUDA_POST_KERNEL_CHECK;
+  // << " count: " << count << " bottom_data: "
+  //     << (unsigned long)bottom_data
+  //     << " top_data: " << (unsigned long)top_data
+  //     << " blocks: " << CAFFE_GET_BLOCKS(count)
+  //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
+ Relu_fp_gpu(ReLUForward_kernel,count,bottom_data,top_data,negative_slope);
 }
 
+
 template <typename Dtype>
 void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-    Backward_cpu(top, propagate_down, bottom);
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+//    ReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+  //      count, top_diff, bottom_data, bottom_diff, negative_slope);
+   // CUDA_POST_KERNEL_CHECK;
+   Relu_bp_gpu(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope);
+  }
 }
 
 
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 072f9f71..4b091d3a 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -28,6 +28,28 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
     ignore_label_ = this->layer_param_.loss_param().ignore_label();
   }
   normalize_ = this->layer_param_.loss_param().normalize();
+
+  ocl_setup();
+}
+
+template <typename Dtype>
+void SoftmaxWithLossLayer<Dtype>::ocl_setup(){
+   cl_int err=0;
+   scal_kernel = clCreateKernel(amdDevice.Program, "scal_float", &err);
+   diff_kernel = clCreateKernel(amdDevice.Program, "diff_float", &err);
+   softmax_kernel = clCreateKernel(amdDevice.Program, "softmax_float", &err);
+   d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, sizeof(Dtype), NULL, NULL);
+
+   softmax_loss_fp_kernel = clCreateKernel(amdDevice.Program, "softmax_loss_fp_float", &err);
+   softmax_loss_bp_kernel = clCreateKernel(amdDevice.Program, "softmax_loss_bp_float", &err);
+}
+
+template <typename Dtype>
+SoftmaxWithLossLayer<Dtype>::~SoftmaxWithLossLayer(){
+  clReleaseKernel(diff_kernel);
+  clReleaseKernel(scal_kernel);
+  clReleaseKernel(softmax_loss_fp_kernel);
+  clReleaseKernel(softmax_loss_bp_kernel);
 }
 
 template <typename Dtype>
@@ -121,19 +143,71 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-     Forward_cpu(bottom, top);
+void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+  const Dtype* prob_data = prob_.gpu_data();
+  const Dtype* label = bottom[1]->gpu_data();
+  const int dim = prob_.count() / outer_num_;
+  const int nthreads = outer_num_ * inner_num_;
+  // Since this memory is not used for anything until it is overwritten
+  // on the backward pass, we use it here to avoid having to allocate new GPU
+  // memory to accumulate intermediate results in the kernel.
+  Dtype* loss_data = bottom[0]->mutable_gpu_diff();
+  // Similarly, this memory is never used elsewhere, and thus we can use it
+  // to avoid having to allocate additional GPU memory.
+  Dtype* counts = prob_.mutable_gpu_diff();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  SoftmaxLossForwardGPU<Dtype>(softmax_loss_fp_kernel, nthreads, prob_data, label, loss_data,
+       outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
+  Dtype loss;
+  caffe_gpu_asum(nthreads, loss_data, &loss);
+  if (normalize_) {
+    Dtype count;
+    caffe_gpu_asum(nthreads, counts, &count);
+    loss /= count;
+  } else {
+    loss /= outer_num_;
+  }
+  top[0]->mutable_cpu_data()[0] = loss;
+  if (top.size() == 2) {
+    top[1]->ShareData(prob_);
+  }
 }
 
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-    Backward_cpu(top, propagate_down, bottom);
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+               << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const Dtype* prob_data = prob_.gpu_data();
+    const Dtype* top_data = top[0]->gpu_data();
+   // caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
+    caffe_gpu_copy(prob_.count(), prob_data, bottom_diff);
+    const Dtype* label = bottom[1]->gpu_data();
+    const int dim = prob_.count() / outer_num_;
+    const int nthreads = outer_num_ * inner_num_;
+    // Since this memory is never used for anything else,
+    // we use to to avoid allocating new GPU memory.
+    Dtype* counts = prob_.mutable_gpu_diff();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    SoftmaxLossBackwardGPU<Dtype>(softmax_loss_bp_kernel, nthreads, top_data, label, bottom_diff,
+           outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
+    const Dtype loss_weight = top[0]->cpu_diff()[0];
+    if (normalize_) {
+      Dtype count;
+      caffe_gpu_asum(nthreads, counts, &count);
+      caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
+    } else {
+      caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
+    }
+  }
 }
 
-
-
 #ifdef CPU_ONLY
 STUB_GPU(SoftmaxWithLossLayer);
 #endif
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index cf9b1ca5..11ccbcc2 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -536,6 +536,12 @@ double caffe_cpu_asum<double>(const int n, const double* x) {
 
 template <>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
+    cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(cl_float)), NULL, NULL);
+    cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(cl_float)), NULL, NULL);
+    clblasSasum(n,d_y,0,(cl_mem)x,0,1,scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
+    clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,0, NULL, NULL);
+    clReleaseMemObject(scratchBuff);
+    clReleaseMemObject(d_y);
 }
 
 template <>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 32a477fc..1fd48aa7 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -105,6 +105,62 @@ Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* p
 template float softmax_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss);
 template double softmax_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss);
 
+
+template <typename Dtype>
+void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads,
+          const Dtype* prob_data, const Dtype* label, Dtype* loss,
+          const int num, const int dim, const int spatial_dim,
+          const bool has_ignore_label_, const int ignore_label_,
+          Dtype* counts)
+{
+    OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int),  (void*)&nthreads));
+    OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem),  (void*)&prob_data));
+    OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem),  (void*)&label));
+    OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem),  (void*)&loss));
+    OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int),  (void*)&num));
+    OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int),  (void*)&dim));
+    OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int),  (void*)&spatial_dim));
+    OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_bool),  (void*)&has_ignore_label_));
+    OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int),  (void*)&ignore_label_));
+    OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem),  (void*)&counts));
+    
+   size_t Global_Work_Size[1] = {nthreads};
+   size_t Local_Work_Size[1] = {256};
+   OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void SoftmaxLossForwardGPU<float>(cl_kernel Kernel, const int nthreads, const float* prob_data, const float* label, float* loss,
+          const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,float* counts);
+template void SoftmaxLossForwardGPU<double>(cl_kernel Kernel, const int nthreads, const double* prob_data, const double* label, double* loss,
+          const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,double* counts);
+
+template <typename Dtype>
+void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* top,
+          const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+          const int spatial_dim, const bool has_ignore_label_,
+          const int ignore_label_, Dtype* counts)
+{
+    OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int),  (void*)&nthreads));
+    OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem),  (void*)&top));
+    OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem),  (void*)&label));
+    OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem),  (void*)&bottom_diff));
+    OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int),  (void*)&num));
+    OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int),  (void*)&dim));
+    OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int),  (void*)&spatial_dim));
+    OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_bool),  (void*)&has_ignore_label_));
+    OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int),  (void*)&ignore_label_));
+    OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem),  (void*)&counts));
+
+   size_t Global_Work_Size[1] = {nthreads};
+   size_t Local_Work_Size[1] = {256};
+   OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void SoftmaxLossBackwardGPU<float>(cl_kernel Kernel, const int nthreads, const float* top, const float* label, float* bottom_diff, 
+                       const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, float* counts);
+template void SoftmaxLossBackwardGPU<double>(cl_kernel Kernel, const int nthreads, const double* top, const double* label, double* bottom_diff, 
+                       const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, double* counts);
+
 template <typename Dtype>
 void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data){
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
@@ -237,35 +293,37 @@ template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const fl
 template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff);
 
 template <typename Dtype> 
-void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data){
+void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*)&negative_slope);
     OCL_CHECK(ret);
     size_t Global_Work_Size[] = {count * 1};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void Relu_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data);
-template void Relu_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data);
+template void Relu_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data, float negative_slope);
+template void Relu_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data, double negative_slope);
 
 template <typename Dtype> 
-void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff){
+void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data);
     ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope);
     OCL_CHECK(ret);
     size_t Global_Work_Size[] = {count * 1};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void Relu_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff);
-template void Relu_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff);
+template void Relu_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
+template void Relu_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
 
 template <typename Dtype>
 void caffe_gpu_sign(cl_kernel Kernel,const int N,  const Dtype* X, Dtype * Y ){

From fc4fa9bdc5a8fa9210a4a1261c750a2e44f629e3 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Thu, 16 Jul 2015 16:37:40 +0800
Subject: [PATCH 010/124] OpenCL porting of pooling layer

---
 include/caffe/util/ocl_wrapper.hpp |  21 ++
 include/caffe/vision_layers.hpp    |  13 ++
 src/caffe/OCL_kernel.cl            | 312 +++++++++++++++++++++--------
 src/caffe/layers/pooling_layer.cpp | 134 ++++++++++++-
 src/caffe/util/ocl_wrapper.cpp     | 193 ++++++++++++++++++
 5 files changed, 585 insertions(+), 88 deletions(-)

diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 519f15d4..49afbffe 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -33,9 +33,30 @@ void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, const
 template <typename Dtype>
 void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data);
 
+template <typename Dtype>
+void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask);
+
+template <typename Dtype>
+void MaxPoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff);
+
+template <typename Dtype>
+void AvePoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff);
+
+template <typename Dtype>
+ void StoPoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff);
+
 template <typename Dtype>
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data);
 
+template <typename Dtype>
+void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data);
+
+template <typename Dtype>
+void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data);
+
+template <typename Dtype>
+void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data);
+
 template <typename Dtype>
 void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff );
 
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 21c72bba..75701710 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -405,10 +405,12 @@ class PoolingLayer : public Layer<Dtype> {
  public:
   explicit PoolingLayer(const LayerParameter& param)
       : Layer<Dtype>(param) {}
+  ~PoolingLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
+  void ocl_setup();
 
   virtual inline const char* type() const { return "Pooling"; }
   virtual inline int ExactNumBottomBlobs() const { return 1; }
@@ -439,6 +441,17 @@ class PoolingLayer : public Layer<Dtype> {
   bool global_pooling_;
   Blob<Dtype> rand_idx_;
   Blob<int> max_idx_;
+
+//opencl related data structures
+protected:
+  cl_kernel MaxPoolForward_kernel,
+            AvePoolForward_kernel,
+            StoPoolForwardTrain_kernel,
+            StoPoolForwardTest_kernel,
+            MaxPoolBackward_kernel,
+            AvePoolBackward_kernel,
+            StoPoolBackward_kernel;
+
 };
 
 #ifdef USE_CUDNN
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 8a5d1138..00278db7 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -747,7 +747,7 @@ template <class T>
 __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
      int gdx = get_global_id(0);
      if(gdx < N){
-          Y[gdx] =((0.0 < X[gdx])-(X[gdx] < 0.0));
+          Y[gdx] =((0.0<X[gdx])-(X[gdx]<0.0));
      }
 }
 
@@ -966,7 +966,7 @@ template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(co
 
 
 template <class T>
-__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global T* top_data){
+__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){
      int index = get_global_id(0);
      int tmp = get_global_size(0);
      for(index; index < nthreads; index += tmp){
@@ -974,97 +974,191 @@ __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const
          int ph = (index / pooled_width) % pooled_height;
          int c = (index / pooled_width / pooled_height) % channels;
          int n = index / pooled_width / pooled_height / channels;
-         int hstart = ph * stride;
-         int hend = min(hstart + kernel_size, height);
-         int wstart = pw * stride;
-         int wend = min(wstart + kernel_size, width);
-         T maxval = -99999999;
-         bottom_data += (n * channels + c) * height * width;
-         for (int h = hstart; h < hend; ++h) {
-           for (int w = wstart; w < wend; ++w) {
-             maxval = max(maxval, bottom_data[h * width + w]);
-           }   
-         }
-         top_data[index] = maxval;
-     }
+         int hstart = ph * stride_h - pad_h;
+         int wstart = pw * stride_w - pad_w;
+         const int hend = min(hstart + kernel_h, height);
+         const int wend = min(wstart + kernel_w, width);
+ 	 hstart = max(hstart, 0);
+    	 wstart = max(wstart, 0);
+    	T maxval = -FLT_MAX;
+    	int maxidx = -1;
+    	bottom_slice =
+        bottom_data + (n * channels + c) * height * width;
+        for (int h = hstart; h < hend; ++h) {
+          for (int w = wstart; w < wend; ++w) {
+           if (bottom_slice[h * width + w] > maxval) {
+             maxidx = h * width + w;
+             maxval = bottom_slice[maxidx];
+        }
+      }
+    }
+    top_data[index] = maxval;
+    if (mask) {
+      mask[index] = maxidx;
+    } else {
+      top_mask[index] = maxidx;
+    }
+  }
 
 }
-template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global float* top_data);
-template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_size, const int stride, __global double* top_data);
+template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
+template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
 
 
 template <class T>
-__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global T* top_data){
-    int index=get_global_id(0);
-    int tmp=get_global_size(0);
-    for(index;index<nthreads;index+=tmp){
+__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index; index < nthreads; index+=tmp){
         int pw = index % pooled_width;
         int ph = (index / pooled_width) % pooled_height;
         int c = (index / pooled_width / pooled_height) % channels;
         int n = index / pooled_width / pooled_height / channels;
-        int hstart = ph * stride - pad;
-        int wstart = pw * stride - pad;
-        int hend = min(hstart + kernel_size, height + pad);
-        int wend = min(wstart + kernel_size, width + pad);
-        int pool_size = (hend - hstart) * (wend - wstart);
-        hstart = max(hstart, 0);
-        wstart = max(wstart, 0);
-        hend = min(hend, height);
-        wend = min(wend, width);
-        T aveval = 0;
-        bottom_data += (n * channels + c) * height * width;
-        for (int h = hstart; h < hend; ++h) {
-          for (int w = wstart; w < wend; ++w) {
-            aveval += bottom_data[h * width + w];
-          }
-        }
-        top_data[index] = aveval / pool_size;
-    }
-
+	    int hstart = ph * stride_h - pad_h;
+	    int wstart = pw * stride_w - pad_w;
+	    int hend = min(hstart + kernel_h, height + pad_h);
+	    int wend = min(wstart + kernel_w, width + pad_w);
+	    const int pool_size = (hend - hstart) * (wend - wstart);
+	    hstart = max(hstart, 0);
+	    wstart = max(wstart, 0);
+	    hend = min(hend, height);
+	    wend = min(wend, width);
+	    T aveval = 0;
+	    bottom_slice =
+		bottom_data + (n * channels + c) * height * width;
+	    for (int h = hstart; h < hend; ++h) {
+	      for (int w = wstart; w < wend; ++w) {
+		aveval += bottom_slice[h * width + w];
+	      }
+	    }
+	    top_data[index] = aveval / pool_size;
+	  }
 }
-template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global float* top_data);
-template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_size, const int stride, const int pad, __global double* top_data);
+template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
+template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data);
 
 template <class T>
-__kernel void MaxPoolBackward(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* top_diff,
-const int num, const int channels, const int height,
-const int width, const int pooled_height, const int pooled_width,
-const int kernel_size, const int stride, __global T* bottom_diff){
+__kernel void void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* idx_data, __global T* top_data){
     int index = get_global_id(0);
-    int total = get_global_size(0);
-    for(index; index < nthreads; index += total){
-        // find out the local index
-        // find out the local offset
-        int w = index % width;
-        int h = (index / width) % height;
-        int c = (index / width / height) % channels;
-        int n = index / width / height / channels;
-        int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1;
-        int phend = min(h / stride + 1, pooled_height);
-        int pwstart = (w < kernel_size) ? 0 : (w - kernel_size) / stride + 1;
-        int pwend = min(w / stride + 1, pooled_width);
-        T gradient = 0;
-        T bottom_datum =
-            bottom_data[((n * channels + c) * height + h) * width + w];
-        top_data += (n * channels + c) * pooled_height * pooled_width;
-        top_diff += (n * channels + c) * pooled_height * pooled_width;
-        for (int ph = phstart; ph < phend; ++ph) {
-            for (int pw = pwstart; pw < pwend; ++pw) {
-                gradient += top_diff[ph * pooled_width + pw] *
-                    (bottom_datum == top_data[ph * pooled_width + pw]);
-            }
+    int tmp = get_global_size(0);
+    for(index; index < nthreads; index+=tmp){
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * stride_h;
+    const int hend = min(hstart + kernel_h, height);
+    const int wstart = pw * stride_w;
+    const int wend = min(wstart + kernel_w, width);
+    T cumsum = 0.;
+    bottom_slice = bottom_data + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_slice[h * width + w];
+      }
+    }
+    const float thres = rand_idx[index] * cumsum;
+    // Second pass: get value, and set index.
+    cumsum = 0;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_slice[h * width + w];
+        if (cumsum >= thres) {
+          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
+          top_data[index] = bottom_slice[h * width + w];
+          return;
         }
-        bottom_diff[index] = gradient;
-
+      }
+    }
     }
+}
+template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
 
+template <class T>
+__kernel void void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index; index < nthreads; index+=tmp){
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * stride_h;
+    const int hend = min(hstart + kernel_h, height);
+    const int wstart = pw * stride_w;
+    const int wend = min(wstart + kernel_w, width);
+    // We set cumsum to be 0 to avoid divide-by-zero problems
+    T cumsum = FLT_MIN;
+    T cumvalues = 0.;
+    bottom_slice =
+        bottom_data + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_slice[h * width + w];
+        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+      }
+    }
+    top_data[index] = cumvalues / cumsum;
+  }
 }
-template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global float* bottom_diff);
-template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, __global double* bottom_diff);
+template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTestDouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
 
+template <class T>
+void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
+    const int* const mask, const Dtype* const top_mask, const int num,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, Dtype* const bottom_diff) {
+     int index = get_global_id(0);
+     int total = get_global_size(0);
+     for(index; index < nthreads; index += total){
+    // find out the local index
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int c = (index / width / height) % channels;
+    const int n = index / width / height / channels;
+    const int phstart =
+         (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
+    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
+    const int pwstart =
+         (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
+    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
+    T gradient = 0;
+    const int offset = (n * channels + c) * pooled_height * pooled_width;
+    top_diff_slice = top_diff + offset;
+    if (mask) {
+      const int* const mask_slice = mask + offset;
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (mask_slice[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff_slice[ph * pooled_width + pw];
+          }
+        }
+      }
+    } else {
+      top_mask_slice = top_mask + offset;
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff_slice[ph * pooled_width + pw];
+          }
+        }
+      }
+    }
+    bottom_diff[index] = gradient;
+  }
+}
+template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* float bottom_diff);
+template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* float bottom_diff);
 
 template <class T>
-__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global T* bottom_diff){
+__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, T* const bottom_diff){
      int index = get_global_id(0);
      int total = get_global_size(0);
      for(index; index < nthreads; index += total){
@@ -1072,30 +1166,76 @@ __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const in
 	    int h = (index / width) % height + pad;
 	    int c = (index / width / height) % channels;
 	    int n = index / width / height / channels;
-	    int phstart = (h < kernel_size) ? 0 : (h - kernel_size) / stride + 1;
-	    int phend = min(h / stride + 1, pooled_height);
-	    int pwstart = (w < kernel_size) ? 0 : (w - kernel_size) / stride + 1;
-	    int pwend = min(w / stride + 1, pooled_width);
+	    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+    	    const int phend = min(h / stride_h + 1, pooled_height);
+	    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            const int pwend = min(w / stride_w + 1, pooled_width);
 	    T gradient = 0;
 	    top_diff += (n * channels + c) * pooled_height * pooled_width;
 	    for (int ph = phstart; ph < phend; ++ph) {
 	      for (int pw = pwstart; pw < pwend; ++pw) {
 		// figure out the pooling size
-		int hstart = ph * stride - pad;
-		int wstart = pw * stride - pad;
-		int hend = min(hstart + kernel_size, height + pad);
-		int wend = min(wstart + kernel_size, width + pad);
+		int hstart = ph * stride_h - pad_h;
+		int wstart = pw * stride_w - pad_w;
+		int hend = min(hstart + kernel_h, height + pad_h);
+		int wend = min(wstart + kernel_w, width + pad_w);
 		int pool_size = (hend - hstart) * (wend - wstart);
-           gradient += top_diff[ph * pooled_width + pw] / pool_size;
-      }
+		gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
+	      }
     }
     bottom_diff[index] = gradient;
-
    }
 }
 
-template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_size, const int stride, const int pad, __global float* bottom_diff);
-template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_size, const int stride, const int pad, __global double* bottom_diff);
+template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
+template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
+
+template <class T>
+void StoPoolBackward(const int nthreads,
+    const Dtype* const rand_idx, const Dtype* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, Dtype* const bottom_diff) {
+      int index = get_global_id(0);
+      int total = get_global_size(0);
+      for(index; index < nthreads; index += total){
+	    // find out the local index
+	    // find out the local offset
+	    const int w = index % width;
+	    const int h = (index / width) % height;
+	    const int c = (index / width / height) % channels;
+	    const int n = index / width / height / channels;
+	    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+	    const int phend = min(h / stride_h + 1, pooled_height);
+	    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+	    const int pwend = min(w / stride_w + 1, pooled_width);
+	    Dtype gradient = 0;
+	    const Dtype* const rand_idx_slice =
+		rand_idx + (n * channels + c) * pooled_height * pooled_width;
+	    const Dtype* const top_diff_slice =
+		top_diff + (n * channels + c) * pooled_height * pooled_width;
+	    for (int ph = phstart; ph < phend; ++ph) {
+	      for (int pw = pwstart; pw < pwend; ++pw) {
+		gradient += top_diff_slice[ph * pooled_width + pw] *
+		    (index == static_cast<int>(rand_idx_slice[ph * pooled_width + pw]));
+	      }
+	    }
+	    bottom_diff[index] = gradient;
+	  }
+}
+template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel StoPoolBackward<float>(const int nthreads,
+    const float* const rand_idx, const float* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, float* const bottom_diff);
+template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel StoPoolBackward<float>(const int nthreads,
+    const double* const rand_idx, const double* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, double* const bottom_diff);
 
 template <class T>
 __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 97a5c150..a53002dd 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -13,6 +13,17 @@ namespace caffe {
 using std::min;
 using std::max;
 
+template <typename Dtype>
+PoolingLayer<Dtype>::~PoolingLayer(){
+  OCL_CHECK( clReleaseKernel(MaxPoolForward_kernel) );
+  OCL_CHECK( clReleaseKernel(AvePoolForward_kernel) );
+  OCL_CHECK( clReleaseKernel(StoPoolForwardTrain_kernel) );
+  OCL_CHECK( clReleaseKernel(StoPoolForwardTest_kernel) );
+  OCL_CHECK( clReleaseKernel(MaxPoolBackward_kernel) );
+  OCL_CHECK( clReleaseKernel(AvePoolBackward_kernel) );
+  OCL_CHECK( clReleaseKernel(StoPoolBackward_kernel) );
+}
+
 template <typename Dtype>
 void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -76,6 +87,19 @@ void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     CHECK_LT(pad_h_, kernel_h_);
     CHECK_LT(pad_w_, kernel_w_);
   }
+  //Intialize OpenCL related
+  ocl_setup();
+}
+
+template <typename Dtype>
+ void PoolingLayer<Dtype>::ocl_setup(){
+  MaxPoolForward_kernel = clCreateKernel(amdDevice.Program, "MaxPoolForwardfloat", NULL);
+  AvePoolForward_kernel = clCreateKernel(amdDevice.Program, "AvePoolForwardfloat", NULL);
+  StoPoolForwardTrain_kernel = clCreateKernel(amdDevice.Program, "StoPoolForwardTrainfloat", NULL);
+  StoPoolForwardTest_kernel = clCreateKernel(amdDevice.Program, "StoPoolForwardTestfloat", NULL);
+  MaxPoolBackward_kernel = clCreateKernel(amdDevice.Program, "MaxPoolBackwardfloat", NULL);
+  AvePoolBackward_kernel = clCreateKernel(amdDevice.Program, "AvePoolBackwardfloat", NULL);
+  StoPoolBackward_kernel = clCreateKernel(amdDevice.Program, "StoPoolBackwardfloat", NULL);
 }
 
 template <typename Dtype>
@@ -312,13 +336,119 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
-    Forward_cpu(bottom, top);
+    //Forward_cpu(bottom, top);
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  int count = top[0]->count();
+  // We'll output the mask to top[1] if it's of size >1.
+  const bool use_top_mask = top.size() > 1;
+  int* mask = NULL;
+  Dtype* top_mask = NULL;
+  switch (this->layer_param_.pooling_param().pool()) {
+  case PoolingParameter_PoolMethod_MAX:
+    if (use_top_mask) {
+      top_mask = top[1]->mutable_gpu_data();
+    } else {
+      mask = max_idx_.mutable_gpu_data();
+    }
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    MaxPoolForward(MaxPoolForward_kernel,
+        count, bottom_data, bottom[0]->num(), channels_,
+        height_, width_, pooled_height_, pooled_width_, kernel_h_,
+        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
+        mask, top_mask);
+   /* 
+   // NOLINT_NEXT_LINE(whitespace/operators)
+    MaxPoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, bottom_data, bottom[0]->num(), channels_,
+        height_, width_, pooled_height_, pooled_width_, kernel_h_,
+        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
+        mask, top_mask);*/
+    break;
+ case PoolingParameter_PoolMethod_AVE:
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    AvePoolForward(AvePoolForward_kernel,
+        count, bottom_data, bottom[0]->num(), channels_,
+        height_, width_, pooled_height_, pooled_width_, kernel_h_,
+        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
+ /*
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    AvePoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, bottom_data, bottom[0]->num(), channels_,
+        height_, width_, pooled_height_, pooled_width_, kernel_h_,
+        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);*/
+    break;
+  case PoolingParameter_PoolMethod_STOCHASTIC:
+    if (this->phase_ == TRAIN) {
+      // We need to create the random index as well.
+      caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
+                            rand_idx_.mutable_gpu_data());
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      StoPoolForwardTrain(StoPoolForwardTrain_kernel,
+          count, bottom_data, bottom[0]->num(), channels_,
+          height_, width_, pooled_height_, pooled_width_, kernel_h_,
+          kernel_w_, stride_h_, stride_w_,
+          rand_idx_.mutable_gpu_data(), top_data);
+    } else {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      StoPoolForwardTest(StoPoolForwardTest_kernel,
+          count, bottom_data, bottom[0]->num(), channels_,
+          height_, width_, pooled_height_, pooled_width_, kernel_h_,
+          kernel_w_, stride_h_, stride_w_, top_data);
+    }
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
 }
 
 template <typename Dtype>
 void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-    Backward_cpu(top, propagate_down, bottom);
+    //Backward_cpu(top, propagate_down, bottom);
+  if (!propagate_down[0]) {
+    return;
+  }
+  const Dtype* top_diff = top[0]->gpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  const int count = bottom[0]->count();
+  caffe_gpu_set(count, Dtype(0.), bottom_diff);
+  // We'll output the mask to top[1] if it's of size >1.
+  const bool use_top_mask = top.size() > 1;
+  const int* mask = NULL;
+  const Dtype* top_mask = NULL;
+  switch (this->layer_param_.pooling_param().pool()) {
+  case PoolingParameter_PoolMethod_MAX:
+    if (use_top_mask) {
+      top_mask = top[1]->gpu_data();
+    } else {
+      mask = max_idx_.gpu_data();
+    }
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    MaxPoolBackward(MaxPoolBackward_kernel,
+        count, top_diff, mask, top_mask, top[0]->num(), channels_,
+        height_, width_, pooled_height_, pooled_width_,
+        kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
+        bottom_diff);
+    break;
+  case PoolingParameter_PoolMethod_AVE:
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    AvePoolBackward(AvePoolBackward_kernel,
+        count, top_diff, top[0]->num(), channels_,
+        height_, width_, pooled_height_, pooled_width_, kernel_h_,
+        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
+    break;
+  case PoolingParameter_PoolMethod_STOCHASTIC:
+    // NOLINT_NEXT_LINE(whitespace/operators)
+     StoPoolBackward(StoPoolBackward_kernel,
+        count, rand_idx_.gpu_data(), top_diff,
+        top[0]->num(), channels_, height_, width_, pooled_height_,
+        pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
+        bottom_diff);
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 1fd48aa7..b47a0a91 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -216,6 +216,115 @@ void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data
 template  void max_pool_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* top_data);
 template  void max_pool_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* top_data);
 
+template <typename Dtype>
+void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_);
+    ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_);
+    ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_);
+    ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*)&pad_h_);
+    ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*)&pad_w_);
+    ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data);
+    ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*)&mask);
+    ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*)&top_mask);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void MaxPoolForward<float>(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data, int* mask, float* top_mask);
+template void MaxPoolForward<double>(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data, int* mask, double* top_mask);
+
+template <typename Dtype>
+void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_);
+    ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_);
+    ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_);
+    ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&idx_data);
+    ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void StoPoolForwardTrain<float>(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
+template void StoPoolForwardTrain<double>(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* idx_data, double* top_data);
+
+template <typename Dtype>
+void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_);
+    ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_);
+    ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_);
+    ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+
+}
+template void StoPoolForwardTest<float>(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* top_data);
+template void StoPoolForwardTest<double>(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* top_data);
+
+template <typename Dtype>
+void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_);
+    ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_);
+    ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_);
+    ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*)&pad_h_);
+    ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*)&pad_w_);
+    ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {count * 1};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void AvePoolForward<float>(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data);
+template void AvePoolForward<double>(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data);
+
 template <typename Dtype> 
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data){
     cl_int ret;
@@ -267,6 +376,90 @@ void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data
 template void max_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, const float* top_data, const float* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* bottom_diff);
 template void max_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const double* top_data, const double* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* bottom_diff );
 
+template <typename Dtype>
+void MaxPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&mask);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_mask);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&num);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&channels);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&height);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&width);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_height);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&pooled_width);
+    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_h);
+    ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&kernel_w);
+    ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&stride_h);
+    ret |= clSetKernelArg(Kernel,13, sizeof(cl_int), (void*)&stride_w);
+    ret |= clSetKernelArg(Kernel,14, sizeof(cl_int), (void*)&pad_h);
+    ret |= clSetKernelArg(Kernel,15, sizeof(cl_int), (void*)&pad_w);
+    ret |= clSetKernelArg(Kernel,16, sizeof(cl_mem), (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {nthreads};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+
+template void MaxPoolBackward<float>(cl_kernel kernel, const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff);
+template void MaxPoolBackward<double>(cl_kernel kernel, const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff);
+
+template <typename Dtype>
+void AvePoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff)
+{
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&num);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w);
+    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&stride_h);
+    ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_w);
+    ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&pad_h);
+    ret |= clSetKernelArg(Kernel,13, sizeof(cl_int), (void*)&pad_w);
+    ret |= clSetKernelArg(Kernel,14, sizeof(cl_mem), (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {nthreads};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void AvePoolBackward<float>(cl_kernel kernel, const int nthreads, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff);
+template void AvePoolBackward<double>(cl_kernel kernel, const int nthreads, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff);
+
+template <typename Dtype>
+void StoPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&rand_idx);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&num);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&channels);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&height);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&width);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_height);
+    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_width);
+    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_h);
+    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_w);
+    ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_h);
+    ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&stride_w);
+    ret |= clSetKernelArg(Kernel,13, sizeof(cl_mem), (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {nthreads};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void StoPoolBackward<float>(cl_kernel kernel, const int nthreads, const float* const rand_idx, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, float* const bottom_diff);
+template void StoPoolBackward<double>(cl_kernel kernel, const int nthreads, const double* const rand_idx, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, double* const bottom_diff);
+
 template <typename Dtype> 
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff){
     cl_int ret;

From e5dc1d75e1df5f35310c6e6b226ef194899d0753 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Sat, 18 Jul 2015 11:58:59 +0800
Subject: [PATCH 011/124] OpenCL porting of LRN layers and inner-product layer;
 fixed some bugs in solver

---
 include/caffe/common_layers.hpp               |   1 +
 include/caffe/neuron_layers.hpp               |   4 +
 include/caffe/solver.hpp                      |  15 +
 include/caffe/util/math_functions.hpp         |   6 -
 include/caffe/util/ocl_wrapper.hpp            |  25 +-
 include/caffe/vision_layers.hpp               |   4 +-
 src/caffe/OCL_kernel.cl                       | 343 +++++++++---------
 .../layers/cufiles/inner_product_layer.cu     |  27 +-
 src/caffe/layers/inner_product_layer.cpp      |  39 +-
 src/caffe/layers/lrn_layer.cpp                |  65 +++-
 src/caffe/layers/power_layer.cpp              |  77 ++++
 src/caffe/layers/split_layer.cpp              |  20 +-
 src/caffe/solver.cpp                          |  24 +-
 src/caffe/util/math_functions.cpp             |   2 +
 src/caffe/util/ocl_wrapper.cpp                | 268 ++++++--------
 15 files changed, 560 insertions(+), 360 deletions(-)

diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index d2c0ce6d..4e884f21 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -567,6 +567,7 @@ class SplitLayer : public Layer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
   int count_;
+  cl_kernel gpu_add_kernel;
 };
 
 /**
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index 65a7e9f2..67d5e0b2 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -411,6 +411,10 @@ class PowerLayer : public NeuronLayer<Dtype> {
   Dtype shift_;
   /// @brief Result of @f$ \alpha \gamma @f$
   Dtype diff_scale_;
+
+ protected:
+ void ocl_setup();
+ cl_kernel memset_kernel, scalar_kernel, div_kernel, mul_kernel, powx_kernel;
 };
 
 /**
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index c2ced487..8f2767f6 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -58,6 +58,10 @@ class Solver {
   int current_step_;
   shared_ptr<Net<Dtype> > net_;
   vector<shared_ptr<Net<Dtype> > > test_nets_;
+ 
+ void ocl_setup();
+ protected:
+ cl_kernel scalar_kernel, div_kernel, powx_kernel;
 
   DISABLE_COPY_AND_ASSIGN(Solver);
 };
@@ -93,6 +97,10 @@ class SGDSolver : public Solver<Dtype> {
   //   of gradients/updates and is not needed in snapshots
   vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
 
+ void ocl_setup();
+ protected:
+ cl_kernel scalar_kernel, div_kernel, powx_kernel;
+
   DISABLE_COPY_AND_ASSIGN(SGDSolver);
 };
 
@@ -107,6 +115,10 @@ class NesterovSolver : public SGDSolver<Dtype> {
  protected:
   virtual void ComputeUpdateValue(int param_id, Dtype rate);
 
+ void ocl_setup();
+ protected:
+ cl_kernel scalar_kernel, div_kernel, powx_kernel;
+
   DISABLE_COPY_AND_ASSIGN(NesterovSolver);
 };
 
@@ -125,6 +137,9 @@ class AdaGradSolver : public SGDSolver<Dtype> {
         << "Momentum cannot be used with AdaGrad.";
   }
 
+ void ocl_setup();
+ protected:
+ cl_kernel scalar_kernel, div_kernel, powx_kernel;
   DISABLE_COPY_AND_ASSIGN(AdaGradSolver);
 };
 
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 2cbbf1f0..be1dd09f 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -97,15 +97,9 @@ void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y);
 template <typename Dtype>
 void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
-template <typename Dtype>
-void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
-
 template <typename Dtype>
 void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X);
 
-template <typename Dtype>
-void caffe_gpu_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X);
-
 template <typename Dtype>
 void caffe_scal(const int N, const Dtype alpha, Dtype *X);
 
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 49afbffe..0390ee3f 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -96,7 +96,30 @@ void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* t
           const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
           const int spatial_dim, const bool has_ignore_label_,
           const int ignore_label_, Dtype* counts);
+}
 
-}  // namespace caffe
+template <typename Dtype>
+void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data);
+
+template <typename Dtype>
+void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype alpha_over_size,
+    const Dtype k, Dtype* const scale);
+
+template <typename Dtype>
+void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* const in,
+    const Dtype* const scale, const Dtype negative_beta, Dtype* const out);
 
+template <typename Dtype>
+void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
+    const Dtype* const bottom_data, const Dtype* const top_data,
+    const Dtype* const scale, const Dtype* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype negative_beta,
+    const Dtype cache_ratio, Dtype* const bottom_diff);
+  // namespace caffe
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 75701710..b46130e8 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -392,10 +392,12 @@ class LRNLayer : public Layer<Dtype> {
   shared_ptr<EltwiseLayer<Dtype> > product_layer_;
   Blob<Dtype> product_input_;
   vector<Blob<Dtype>*> product_bottom_vec_;
+
+  cl_kernel LFSkernel, LCDkernel, LCOkernel;
 };
 
 
-/**
+/*n
  * @brief Pools the input image by taking the max, average, etc. within regions.
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 00278db7..0d8328c8 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -966,7 +966,7 @@ template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(co
 
 
 template <class T>
-__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){
+__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){
      int index = get_global_id(0);
      int tmp = get_global_size(0);
      for(index; index < nthreads; index += tmp){
@@ -978,17 +978,17 @@ __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const
          int wstart = pw * stride_w - pad_w;
          const int hend = min(hstart + kernel_h, height);
          const int wend = min(wstart + kernel_w, width);
- 	 hstart = max(hstart, 0);
-    	 wstart = max(wstart, 0);
-    	T maxval = -FLT_MAX;
-    	int maxidx = -1;
-    	bottom_slice =
+         hstart = max(hstart, 0);
+         wstart = max(wstart, 0);
+        T maxval = -FLT_MAX;
+        int maxidx = -1;
+        bottom_data =
         bottom_data + (n * channels + c) * height * width;
         for (int h = hstart; h < hend; ++h) {
           for (int w = wstart; w < wend; ++w) {
-           if (bottom_slice[h * width + w] > maxval) {
+           if (bottom_data[h * width + w] > maxval) {
              maxidx = h * width + w;
-             maxval = bottom_slice[maxidx];
+             maxval = bottom_data[maxidx];
         }
       }
     }
@@ -999,11 +999,9 @@ __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const
       top_mask[index] = maxidx;
     }
   }
-
 }
 template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
-template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
-
+template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,  const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
 
 template <class T>
 __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){
@@ -1013,32 +1011,31 @@ __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const
         int pw = index % pooled_width;
         int ph = (index / pooled_width) % pooled_height;
         int c = (index / pooled_width / pooled_height) % channels;
-        int n = index / pooled_width / pooled_height / channels;
-	    int hstart = ph * stride_h - pad_h;
-	    int wstart = pw * stride_w - pad_w;
-	    int hend = min(hstart + kernel_h, height + pad_h);
-	    int wend = min(wstart + kernel_w, width + pad_w);
-	    const int pool_size = (hend - hstart) * (wend - wstart);
-	    hstart = max(hstart, 0);
-	    wstart = max(wstart, 0);
-	    hend = min(hend, height);
-	    wend = min(wend, width);
-	    T aveval = 0;
-	    bottom_slice =
-		bottom_data + (n * channels + c) * height * width;
-	    for (int h = hstart; h < hend; ++h) {
-	      for (int w = wstart; w < wend; ++w) {
-		aveval += bottom_slice[h * width + w];
-	      }
-	    }
-	    top_data[index] = aveval / pool_size;
-	  }
+        int n = index / pooled_width / pooled_height / channels;            int hstart = ph * stride_h - pad_h;            int wstart = pw * stride_w - pad_w;
+            int hend = min(hstart + kernel_h, height + pad_h);
+            int wend = min(wstart + kernel_w, width + pad_w);
+            const int pool_size = (hend - hstart) * (wend - wstart);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            hend = min(hend, height);
+            wend = min(wend, width);
+            T aveval = 0;
+            bottom_data =
+                bottom_data + (n * channels + c) * height * width;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                aveval += bottom_data[h * width + w];
+              }
+            }
+            top_data[index] = aveval / pool_size;
+          }
+
 }
 template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
 template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data);
 
 template <class T>
-__kernel void void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* idx_data, __global T* top_data){
+__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){
     int index = get_global_id(0);
     int tmp = get_global_size(0);
     for(index; index < nthreads; index+=tmp){
@@ -1051,11 +1048,11 @@ __kernel void void StoPoolForwardTrain(cl_kernel Kernel,const int count, const D
     const int wstart = pw * stride_w;
     const int wend = min(wstart + kernel_w, width);
     T cumsum = 0.;
-    bottom_slice = bottom_data + (n * channels + c) * height * width;
+    bottom_data = bottom_data + (n * channels + c) * height * width;
     // First pass: get sum
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
+        cumsum += bottom_data[h * width + w];
       }
     }
     const float thres = rand_idx[index] * cumsum;
@@ -1063,25 +1060,25 @@ __kernel void void StoPoolForwardTrain(cl_kernel Kernel,const int count, const D
     cumsum = 0;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
+        cumsum += bottom_data[h * width + w];
         if (cumsum >= thres) {
           rand_idx[index] = ((n * channels + c) * height + h) * width + w;
-          top_data[index] = bottom_slice[h * width + w];
+          top_data[index] = bottom_data[h * width + w];
           return;
         }
       }
     }
     }
 }
-template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
-template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
+template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
 
 template <class T>
-__kernel void void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){
+__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){
     int index = get_global_id(0);
     int tmp = get_global_size(0);
-    for(index; index < nthreads; index+=tmp){
-    const int pw = index % pooled_width;
+    for(index; index < count; index+=tmp){
+    const int pw = index % pooled_width; 
     const int ph = (index / pooled_width) % pooled_height;
     const int c = (index / pooled_width / pooled_height) % channels;
     const int n = index / pooled_width / pooled_height / channels;
@@ -1089,31 +1086,29 @@ __kernel void void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dt
     const int hend = min(hstart + kernel_h, height);
     const int wstart = pw * stride_w;
     const int wend = min(wstart + kernel_w, width);
-    // We set cumsum to be 0 to avoid divide-by-zero problems
+    // We set cumsum to be 0 to avoid divide-by-zero problems    T cumsum = FLT_MIN;
     T cumsum = FLT_MIN;
     T cumvalues = 0.;
-    bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
+    bottom_data =        bottom_data + (n * channels + c) * height * width;
     // First pass: get sum
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+        cumsum += bottom_data[h * width + w];
+        cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w];
       }
     }
-    top_data[index] = cumvalues / cumsum;
-  }
+    top_data[index] = cumvalues / cumsum;  }
 }
-template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
-template __attribute__((mangled_name(StoPoolForwardTestDouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
+template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
 
 template <class T>
-void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
-    const int* const mask, const Dtype* const top_mask, const int num,
+void MaxPoolBackward(const int nthreads, __global T* top_diff,
+    __global int* mask, __global T* top_mask, const int num,
     const int channels, const int height, const int width,
     const int pooled_height, const int pooled_width, const int kernel_h,
     const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w, Dtype* const bottom_diff) {
+    const int pad_w, __global T* const bottom_diff) {
      int index = get_global_id(0);
      int total = get_global_size(0);
      for(index; index < nthreads; index += total){
@@ -1131,22 +1126,22 @@ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
     const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
     T gradient = 0;
     const int offset = (n * channels + c) * pooled_height * pooled_width;
-    top_diff_slice = top_diff + offset;
+    top_diff += offset;
     if (mask) {
-      const int* const mask_slice = mask + offset;
+      mask = mask + offset;
       for (int ph = phstart; ph < phend; ++ph) {
         for (int pw = pwstart; pw < pwend; ++pw) {
-          if (mask_slice[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff_slice[ph * pooled_width + pw];
+          if (mask[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff[ph * pooled_width + pw];
           }
         }
       }
     } else {
-      top_mask_slice = top_mask + offset;
+      top_mask = top_mask + offset;
       for (int ph = phstart; ph < phend; ++ph) {
         for (int pw = pwstart; pw < pwend; ++pw) {
-          if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff_slice[ph * pooled_width + pw];
+          if (top_mask[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff[ph * pooled_width + pw];
           }
         }
       }
@@ -1154,34 +1149,34 @@ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
     bottom_diff[index] = gradient;
   }
 }
-template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* float bottom_diff);
-template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* float bottom_diff);
+template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
+template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
 
 template <class T>
-__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, T* const bottom_diff){
+__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){
      int index = get_global_id(0);
      int total = get_global_size(0);
      for(index; index < nthreads; index += total){
-	    int w = index % width + pad;
-	    int h = (index / width) % height + pad;
-	    int c = (index / width / height) % channels;
-	    int n = index / width / height / channels;
-	    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-    	    const int phend = min(h / stride_h + 1, pooled_height);
-	    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            int w = index % width + pad_w;
+            int h = (index / width) % height + pad_h;
+            int c = (index / width / height) % channels;
+            int n = index / width / height / channels;
+            const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            const int phend = min(h / stride_h + 1, pooled_height);
+            const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
             const int pwend = min(w / stride_w + 1, pooled_width);
-	    T gradient = 0;
-	    top_diff += (n * channels + c) * pooled_height * pooled_width;
-	    for (int ph = phstart; ph < phend; ++ph) {
-	      for (int pw = pwstart; pw < pwend; ++pw) {
-		// figure out the pooling size
-		int hstart = ph * stride_h - pad_h;
-		int wstart = pw * stride_w - pad_w;
-		int hend = min(hstart + kernel_h, height + pad_h);
-		int wend = min(wstart + kernel_w, width + pad_w);
-		int pool_size = (hend - hstart) * (wend - wstart);
-		gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
-	      }
+            T gradient = 0;
+            top_diff += (n * channels + c) * pooled_height * pooled_width;
+            for (int ph = phstart; ph < phend; ++ph) {
+              for (int pw = pwstart; pw < pwend; ++pw) {
+                // figure out the pooling size
+                int hstart = ph * stride_h - pad_h;
+                int wstart = pw * stride_w - pad_w;
+                int hend = min(hstart + kernel_h, height + pad_h);
+                int wend = min(wstart + kernel_w, width + pad_w);
+                int pool_size = (hend - hstart) * (wend - wstart);
+                gradient += top_diff[ph * pooled_width + pw] / pool_size;
+              }
     }
     bottom_diff[index] = gradient;
    }
@@ -1190,52 +1185,53 @@ __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const in
 template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
 template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
 
-template <class T>
+template <class Dtype>
 void StoPoolBackward(const int nthreads,
-    const Dtype* const rand_idx, const Dtype* const top_diff,
+    __global Dtype* rand_idx, __global Dtype* top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, Dtype* const bottom_diff) {
+    const int stride_w, __global Dtype* bottom_diff) {
       int index = get_global_id(0);
       int total = get_global_size(0);
       for(index; index < nthreads; index += total){
-	    // find out the local index
-	    // find out the local offset
-	    const int w = index % width;
-	    const int h = (index / width) % height;
-	    const int c = (index / width / height) % channels;
-	    const int n = index / width / height / channels;
-	    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-	    const int phend = min(h / stride_h + 1, pooled_height);
-	    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-	    const int pwend = min(w / stride_w + 1, pooled_width);
-	    Dtype gradient = 0;
-	    const Dtype* const rand_idx_slice =
-		rand_idx + (n * channels + c) * pooled_height * pooled_width;
-	    const Dtype* const top_diff_slice =
-		top_diff + (n * channels + c) * pooled_height * pooled_width;
-	    for (int ph = phstart; ph < phend; ++ph) {
-	      for (int pw = pwstart; pw < pwend; ++pw) {
-		gradient += top_diff_slice[ph * pooled_width + pw] *
-		    (index == static_cast<int>(rand_idx_slice[ph * pooled_width + pw]));
-	      }
-	    }
-	    bottom_diff[index] = gradient;
+            // find out the local index
+            // find out the local offset
+            const int w = index % width;
+            const int h = (index / width) % height;
+            const int c = (index / width / height) % channels;
+            const int n = index / width / height / channels;
+            const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            const int phend = min(h / stride_h + 1, pooled_height);
+            const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            const int pwend = min(w / stride_w + 1, pooled_width);
+            Dtype gradient = 0;
+            rand_idx =
+                rand_idx + (n * channels + c) * pooled_height * pooled_width;
+            top_diff =
+                top_diff + (n * channels + c) * pooled_height * pooled_width;
+            for (int ph = phstart; ph < phend; ++ph) {
+              for (int pw = pwstart; pw < pwend; ++pw) {
+                gradient += top_diff[ph * pooled_width + pw] *
+                    (index == static_cast<int>(rand_idx[ph * pooled_width + pw]));
+              }
+            }
+            bottom_diff[index] = gradient;
+
 	  }
 }
-template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel StoPoolBackward<float>(const int nthreads,
-    const float* const rand_idx, const float* const top_diff,
+template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel  void StoPoolBackward<float>(const int nthreads,
+    __global float* rand_idx, __global float* const top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, float* const bottom_diff);
-template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel StoPoolBackward<float>(const int nthreads,
-    const double* const rand_idx, const double* const top_diff,
+    const int stride_w, __global float* const bottom_diff);
+template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward<double>(const int nthreads,
+    __global double* rand_idx, __global double* const top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, double* const bottom_diff);
+    const int stride_w, __global double* const bottom_diff);
 
 template <class T>
 __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
@@ -1448,6 +1444,15 @@ __kernel void add_scalar (const int n, const T alpha, __global T* y){
 template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y);
 template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y);
 
+template <typename Dtype>
+__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
+        int index = get_global_id(0);
+        if (index < n)
+        y[index] = in1[index] + in2[index] ;
+}
+template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y);
+template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y);
+
 template <class T>
 __kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){
         int index = get_global_id(0);
@@ -1490,121 +1495,119 @@ template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void Dropo
 template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
 
 template <class T>
-__kernel void LRNFillScale(const int nthreads, __global const T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, __global T* scale) {
+__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k,  __global T* scale) {
   int index = get_global_id(0);
   int tmp = get_global_size(0);
   for(index; index < nthreads; index += tmp) {
     // find out the local offset
-    int w = index % width;
-    int h = (index / width) % height;
-    int n = index / width / height;
-    int offset = (n * channels * height + h) * width + w;
-    int step = height * width;
-    in += offset;
-    scale += offset;
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
+    in = in + offset;
+    scale = scale + offset;
     int head = 0;
-    int pre_pad = (size - 1) / 2;
-    int post_pad = size - pre_pad - 1;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
     T accum_scale = 0;
     // fill the scale at [n, :, h, w]
     // accumulate values
-    while (head < post_pad) {
-      accum_scale += in[head * step] * in[head * step];
-      ++head;
-    }
-    // until we reach size, nothing needs to be subtracted
-    while (head < size) {
+    while (head < post_pad && head < channels) {
       accum_scale += in[head * step] * in[head * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size;
       ++head;
     }
     // both add and subtract
     while (head < channels) {
       accum_scale += in[head * step] * in[head * step];
-      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size;
+      if (head - size >= 0) {
+        accum_scale -= in[(head - size) * step]
+                       * in[(head - size) * step];
+      }
+      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
       ++head;
     }
     // subtract only
     while (head < channels + post_pad) {
-      accum_scale -= in[(head - size) * step] * in[(head - size) * step];
-      scale[(head - post_pad) * step] = 1. + accum_scale * alpha_over_size;
+      if (head - size >= 0) {
+        accum_scale -= in[(head - size) * step]
+                       * in[(head - size) * step];
+      }
+      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
       ++head;
     }
   }
 }
-template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global const float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, __global float* scale);
-template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global const double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, __global double* scale);
+
+template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k,  __global float* scale);
+template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale);
 
 template <class T>
-__kernel void LRNComputeOutput(const int nthreads, __global const T* in, __global const T* scale, const T negative_beta, __global T* out) {
+__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
   int index = get_global_id(0);
   int tmp = get_global_size(0);
   for(index; index < nthreads; index += tmp) 
     out[index] = in[index] * pow(scale[index], negative_beta);
 }
-template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global const float* in, __global const float* scale, const float negative_beta, __global float* out);
-template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global const double* in, __global const double* scale, const double negative_beta, __global double* out);
+template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out);
+template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out);
 
 template <class T>
-__kernel void LRNComputeDiff(const int nthreads, __global const T* bottom_data, __global const T* top_data, __global const T* scale, __global const T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) {
+__kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) {
   int index = get_global_id(0);
   int tmp = get_global_size(0);
   for(index; index < nthreads; index += tmp) {
-    int w = index % width;
-    int h = (index / width) % height;
-    int n = index / width / height;
-    int offset = (n * channels * height + h) * width + w;
-    int step = height * width;
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
     bottom_data += offset;
     top_data += offset;
     scale += offset;
     top_diff += offset;
     bottom_diff += offset;
     int head = 0;
-    int pre_pad = size - (size + 1) / 2;
-    int post_pad = size - pre_pad - 1;
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
     T accum_ratio = 0;
     // accumulate values
-    while (head < post_pad) {
-      accum_ratio += top_diff[head * step] * top_data[head * step] /
-          scale[head * step];
-      ++head;
-    }
-    // until we reach size, nothing needs to be subtracted
-    while (head < size) {
+    while (head < post_pad && head < channels) {
       accum_ratio += top_diff[head * step] * top_data[head * step] /
           scale[head * step];
-      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
-          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-          bottom_data[(head - post_pad) * step] * accum_ratio;
       ++head;
     }
     // both add and subtract
     while (head < channels) {
       accum_ratio += top_diff[head * step] * top_data[head * step] /
           scale[head * step];
-      accum_ratio -= top_diff[(head - size) * step] *
-          top_data[(head - size) * step] / scale[(head - size) * step];
-      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
-          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-          bottom_data[(head - post_pad) * step] * accum_ratio;
+      if (head - size >= 0) {
+        accum_ratio -= top_diff[(head - size) * step] *
+            top_data[(head - size) * step] / scale[(head - size) * step];
+      }
+      bottom_diff[(head - post_pad) * step] =
+          top_diff[(head - post_pad) * step]
+            * pow(scale[(head - post_pad) * step], negative_beta)
+          - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
       ++head;
     }
     // subtract only
     while (head < channels + post_pad) {
-      accum_ratio -= top_diff[(head - size) * step] *
-          top_data[(head - size) * step] / scale[(head - size) * step];
-      bottom_diff[(head - post_pad) * step] = top_diff[(head - post_pad) * step]
-          * pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio *
-          bottom_data[(head - post_pad) * step] * accum_ratio;
+      if (head - size >= 0) {
+        accum_ratio -= top_diff[(head - size) * step] *
+            top_data[(head - size) * step] / scale[(head - size) * step];
+      }
+      bottom_diff[(head - post_pad) * step] =
+          top_diff[(head - post_pad) * step]
+            * pow(scale[(head - post_pad) * step], negative_beta)
+          - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
       ++head;
     }
-  }
+}
 }
 
-template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global const float* bottom_data, __global const float* top_data, __global const float* scale, __global const float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
-template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global const double* bottom_data, __global const double* top_data, __global const double* scale, __global const double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff);
+template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
+template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff);
 
 template <class T>
 __kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){
diff --git a/src/caffe/layers/cufiles/inner_product_layer.cu b/src/caffe/layers/cufiles/inner_product_layer.cu
index dd90cac1..d93560a0 100644
--- a/src/caffe/layers/cufiles/inner_product_layer.cu
+++ b/src/caffe/layers/cufiles/inner_product_layer.cu
@@ -15,12 +15,12 @@ void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
   const Dtype* weight = this->blobs_[0]->gpu_data();
-  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
-      bottom_data, weight, (Dtype)0., top_data);
+  caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1.,
+      bottom_data, 0, weight, 0, (Dtype)0., top_data, 0);
   if (bias_term_) {
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
-        bias_multiplier_.gpu_data(),
-        this->blobs_[1]->gpu_data(), (Dtype)1., top_data);
+    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1.,
+        bias_multiplier_.gpu_data(),0,
+        this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0);
   }
 }
 
@@ -32,22 +32,23 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* top_diff = top[0]->gpu_diff();
     const Dtype* bottom_data = bottom[0]->gpu_data();
     // Gradient with respect to weight
-    caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
+    caffe_gpu_gemm_ex<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
+        top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0);
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bias
-    caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.gpu_data(), (Dtype)1.,
-        this->blobs_[1]->mutable_gpu_diff());
+    caffe_gpu_gemvv<Dtype>(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff,
+        (size_t)0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_->gpu_data()),
+         (size_t)0, (Dtype)0., 1,
+        this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1);
   }
   if (propagate_down[0]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bottom data
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
-        top_diff, this->blobs_[0]->gpu_data(), (Dtype)0.,
-        bottom[0]->mutable_gpu_diff());
+    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
+        top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0.,
+        bottom[0]->mutable_gpu_diff(), 0);
   }
 }
 
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 8edd6148..03dbbeb5 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -121,14 +121,45 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-     Forward_cpu(bottom, top);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1.,
+      bottom_data, 0, weight, 0, (Dtype)0., top_data, 0);
+  if (bias_term_) {
+    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1.,
+        bias_multiplier_.gpu_data(),0,
+        this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0);
+  }
 }
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-     Backward_cpu(top, propagate_down, bottom);
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (this->param_propagate_down_[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    // Gradient with respect to weight
+    caffe_gpu_gemm_ex<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
+        top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0);
+  }
+  if (bias_term_ && this->param_propagate_down_[1]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    // Gradient with respect to bias
+    caffe_gpu_gemvv<Dtype>(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff,
+        (size_t)0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()),
+         (size_t)0, (Dtype)0., 1,
+        this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1);
+  }
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    // Gradient with respect to bottom data
+    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
+        top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0.,
+        bottom[0]->mutable_gpu_diff(), 0);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 2dc18595..d2f1c247 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -3,6 +3,8 @@
 #include "caffe/layer.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/vision_layers.hpp"
+#include "caffe/util/ocl_wrapper.hpp"
+#include "caffe/util/math_functions.hpp"
 
 namespace caffe {
 
@@ -88,6 +90,9 @@ void LRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     product_layer_->Reshape(product_bottom_vec_, top);
     break;
   }
+    LFSkernel = clCreateKernel(amdDevice.Program,"LRNFillScalefloat",NULL);
+    LCDkernel = clCreateKernel(amdDevice.Program,"LRNComputeDifffloat",NULL);
+    LCOkernel = clCreateKernel(amdDevice.Program,"LRNComputeOutputfloat",NULL);
 }
 
 template <typename Dtype>
@@ -248,29 +253,67 @@ void LRNLayer<Dtype>::WithinChannelBackward(
 }
 
 template <typename Dtype>
-void LRNLayer<Dtype>::CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-      CrossChannelForward_cpu(bottom, top);
+void LRNLayer<Dtype>::CrossChannelForward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  // First, compute scale
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  Dtype* scale_data = scale_.mutable_gpu_data();
+  // We will launch one kernel for each pixel location, and have the kernel
+  // go through all the channels.
+  int n_threads = num_ * height_ * width_;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNFillScale(LFSkernel,
+      n_threads, bottom_data, num_, channels_, height_, width_, size_,
+      alpha_ / size_, k_, scale_data);
+  n_threads = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNComputeOutput(LCOkernel,
+      n_threads, bottom_data, scale_data, -beta_, top_data);
 }
 
 template <typename Dtype>
-void LRNLayer<Dtype>::CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-     CrossChannelBackward_cpu(top,  propagate_down, bottom);
+void LRNLayer<Dtype>::CrossChannelBackward_gpu(
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  int n_threads = num_ * height_ * width_;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNComputeDiff(LCDkernel,
+      n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
+      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
+      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+      bottom[0]->mutable_gpu_diff());
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-      Forward_cpu(bottom, top);
+    const vector<Blob<Dtype>*>& top) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelForward_gpu(bottom, top);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelForward(bottom, top);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-    Backward_cpu(top, propagate_down, bottom);
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelBackward_gpu(top, propagate_down, bottom);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelBackward(top, propagate_down, bottom);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
 }
-
 #ifdef CPU_ONLY
 STUB_GPU(LRNLayer);
 STUB_GPU_FORWARD(LRNLayer, CrossChannelForward);
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index bc14fffb..98be1278 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -4,6 +4,9 @@
 #include "caffe/layer.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/vision_layers.hpp"
+#include "caffe/util/ocl_util.hpp"
+#include "caffe/util/ocl_wrapper.hpp"
+
 
 namespace caffe {
 
@@ -15,6 +18,17 @@ void PowerLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   scale_ = this->layer_param_.power_param().scale();
   shift_ = this->layer_param_.power_param().shift();
   diff_scale_ = power_  * scale_;
+ //OpenCL related set up
+  ocl_setup();
+}
+
+template <typename Dtype>
+void PowerLayer<Dtype>::ocl_setup(){
+   memset_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL);
+   scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
+   div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
+   powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
+   mul_kernel = clCreateKernel(amdDevice.Program, "element_mul_float", NULL);
 }
 
 // Compute y = (shift + scale * x)^power
@@ -97,11 +111,74 @@ void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // Special case where we can ignore the input: scale or power is 0.
+  if (diff_scale_ == Dtype(0)) {
+    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
+    ocl_memset(memset_kernel, top_data, value, count);
+    return;
+  }
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  caffe_gpu_copy(count, bottom_data, top_data);
+  if (scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, scale_, top_data);
+  }
+  if (shift_ != Dtype(0)) {
+    caffe_gpu_add_scalar(scalar_kernel, count, shift_, top_data);
+  }
+  if (power_ != Dtype(1)) {
+    caffe_gpu_powx(powx_kernel, count, top_data, power_, top_data);
+  }
 }
 
 template <typename Dtype>
 void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
+      ocl_memset(memset_kernel, bottom_diff, diff_scale_,count);
+    } else {
+      const Dtype* bottom_data = bottom[0]->gpu_data();
+      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
+      //               = diff_scale * y / (shift + scale * x)
+      if (power_ == Dtype(2)) {
+        // Special case for y = (shift + scale * x)^2
+        //     -> dy/dx = 2 * scale * (shift + scale * x)
+        //              = diff_scale * shift + diff_scale * scale * x
+        caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
+            Dtype(0), bottom_diff);
+        if (shift_ != Dtype(0)) {
+          caffe_gpu_add_scalar(scalar_kernel, count, diff_scale_ * shift_, bottom_diff);
+        }
+      } else if (shift_ == Dtype(0)) {
+        // Special case for y = (scale * x)^power
+        //     -> dy/dx = scale * power * (scale * x)^(power - 1)
+        //              = scale * power * (scale * x)^power * (scale * x)^(-1)
+        //              = power * y / x
+        const Dtype* top_data = top[0]->gpu_data();
+        caffe_gpu_div(div_kernel, count, top_data, bottom_data, bottom_diff);
+        caffe_gpu_scal(count, power_, bottom_diff);
+      } else {
+        caffe_gpu_copy(count, bottom_data, bottom_diff);
+        if (scale_ != Dtype(1)) {
+          caffe_gpu_scal(count, scale_, bottom_diff);
+        }
+        if (shift_ != Dtype(0)) {
+          caffe_gpu_add_scalar(scalar_kernel, count, shift_, bottom_diff);
+        }
+        const Dtype* top_data = top[0]->gpu_data();
+        caffe_gpu_div(div_kernel, count, top_data, bottom_diff, bottom_diff);
+        if (diff_scale_ != Dtype(1)) {
+          caffe_gpu_scal(count, diff_scale_, bottom_diff);
+        }
+      }
+    }
+    caffe_gpu_mul(mul_kernel, count, top_diff, bottom_diff, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 1894d0f1..af8a9123 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -21,6 +21,7 @@ void SplitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     top[i]->ReshapeLike(*bottom[0]);
     CHECK_EQ(count_, top[i]->count());
   }
+  gpu_add_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_add_float",NULL);
 }
 
 template <typename Dtype>
@@ -52,13 +53,28 @@ void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
-    Forward_cpu(bottom, top);
+  for (int i = 0; i < top.size(); ++i) {
+    top[i]->ShareData(*bottom[0]);
+  }
 }
 
 template <typename Dtype>
 void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-     Backward_cpu(top, propagate_down, bottom);
+  if (!propagate_down[0]) { return; }
+  if (top.size() == 1) {
+    caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
+    return;
+  }
+  caffe_gpu_add(gpu_add_kernel, count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
+                bottom[0]->mutable_gpu_diff());
+  // Add remaining top blob diffs.
+  for (int i = 2; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
+  }
+
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 33bb5ed5..684c85cb 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -19,6 +19,21 @@ Solver<Dtype>::Solver(const SolverParameter& param)
   Init(param);
 }
 
+template <typename Dtype>
+void Solver<Dtype>::ocl_setup(){
+   scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
+   div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
+   powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
+}
+
+/*
+template <typename Dtype>
+Solver<Dtype>::~Solver(){
+    OCL_CHECK( clReleaseKernel(scalar_kernel) );
+    OCL_CHECK( clReleaseKernel(div_kernel) );
+    OCL_CHECK( clReleaseKernel(powx_kernel) );
+}*/
+
 template <typename Dtype>
 Solver<Dtype>::Solver(const string& param_file)
     : net_() {
@@ -51,7 +66,6 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
   LOG(INFO) << "Solver scaffolding done.";
   iter_ = 0;
   current_step_ = 0;
-
 }
 
 template <typename Dtype>
@@ -749,7 +763,7 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   case Caffe::GPU: {
 #ifndef CPU_ONLY
     // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
+    caffe_gpu_powx(powx_kernel, net_params[param_id]->count(),
         net_params[param_id]->gpu_diff(), Dtype(2),
         this->update_[param_id]->mutable_gpu_data());
 
@@ -760,14 +774,14 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
         this->history_[param_id]->mutable_gpu_data());
 
     // prepare update
-    caffe_gpu_powx(net_params[param_id]->count(),
+    caffe_gpu_powx(powx_kernel, net_params[param_id]->count(),
               this->history_[param_id]->gpu_data(), Dtype(0.5),
               this->update_[param_id]->mutable_gpu_data());
 
-    caffe_gpu_add_scalar(net_params[param_id]->count(),
+    caffe_gpu_add_scalar(scalar_kernel, net_params[param_id]->count(),
               delta, this->update_[param_id]->mutable_gpu_data());
 
-    caffe_gpu_div(net_params[param_id]->count(),
+    caffe_gpu_div(div_kernel, net_params[param_id]->count(),
               net_params[param_id]->gpu_diff(),
               this->update_[param_id]->gpu_data(),
               this->update_[param_id]->mutable_gpu_data());
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 11ccbcc2..7a0e57bf 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -602,6 +602,7 @@ template <typename Dtype>
 void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) {
 }
 
+/*
 template <>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
 }
@@ -609,6 +610,7 @@ void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
 template <>
 void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
 }
+*/
 
 template <typename Dtype>
 void mul_kernel(const int n, const Dtype* a,
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index b47a0a91..a1be91e2 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -485,6 +485,7 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, c
 template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff);
 template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff);
 
+<<<<<<< HEAD
 template <typename Dtype> 
 void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
     cl_int ret;
@@ -510,66 +511,138 @@ void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const
     ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
     ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {count * 1};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
 
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
 template void Relu_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
 template void Relu_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
-
 template <typename Dtype>
-void caffe_gpu_sign(cl_kernel Kernel,const int N,  const Dtype* X, Dtype * Y ){
+void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels,
+    const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {
+
+    int num_kernels = channels * height * width * optnum;
+  // To avoid involving atomic operations, we will launch one kernel per
+  // bottom dimension, and then in the kernel add up the top dimensions.
+  // NOLINT_NEXT_LINE(whitespace/operatiors)
+
     cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {N};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
 
-template void caffe_gpu_sign<float>(cl_kernel Kernel,const int N,  const float* X, float* Y );
-template void caffe_gpu_sign<double>(cl_kernel Kernel,const int N,  const double* X, double* Y );
+template void opttrans<float>(cl_kernel Kernel, const float* data_im, const int im_offset, const int channels,
+    const int height, const int width, float* data_opt, const int opt_offset, const int optnum);
+template void opttrans<double>(cl_kernel Kernel, const double* data_im, const int im_offset, const int channels,
+    const int height, const int width, double* data_opt, const int opt_offset, const int optnum);
 
 template <typename Dtype>
-void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype alpha_over_size,
+    const Dtype k, Dtype* const scale){
+  cl_int ret;
+  ret=clSetKernelArg(LFSkernel,0,sizeof(cl_int),(void*)&nthreads);
+  ret|=clSetKernelArg(LFSkernel,1,sizeof(cl_mem),(void*)&in);
+  ret|=clSetKernelArg(LFSkernel,2,sizeof(cl_int),(void*)&num);
+  ret|=clSetKernelArg(LFSkernel,3,sizeof(cl_int),(void*)&channels);
+  ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height);
+  ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width);
+  ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size);
+  ret|=clSetKernelArg(LFSkernel,7,sizeof(cl_float),(void*)&alpha_over_size);
+  ret|=clSetKernelArg(LFSkernel,8,sizeof(cl_float),(void*)&k);
+  ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size[]={nthreads};
+  size_t uiLocal_Work_Size[]={256};
+  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) );
 }
-
-template void caffe_gpu_div<float> (cl_kernel Kernel, const int n, const float* a, const float* b, float* y);
-template void caffe_gpu_div<double> (cl_kernel Kernel, const int n, const double* a, const double* b, double* y);
+template void LRNFillScale<float>(cl_kernel kernel, const int nthreads, const float* const in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const float alpha_over_size,
+    const float k, float* const scale);
+template void LRNFillScale<double>(cl_kernel kernel, const int nthreads, const double* const in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const double alpha_over_size,
+    const double k, double* const scale);
 
 template <typename Dtype>
-void caffe_gpu_add_scalar (cl_kernel Kernel, const int n, const Dtype alpha, Dtype* y){
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* const in,
+    const Dtype* const scale, const Dtype negative_beta, Dtype* const out){
+  cl_int ret;
+  ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads);
+  ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in);
+  ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale);
+  ret|=clSetKernelArg(LCOkernel,3,sizeof(cl_float),(void*)&negative_beta);
+  ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size2[]={nthreads};
+  size_t uiLocal_Work_Size2[]={256};
+  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) );
 }
+template void LRNComputeOutput<float>(cl_kernel kernel, const int nthreads, const float* const in,
+    const float* const scale, const float negative_beta, float* const out);
+template void LRNComputeOutput<double>(cl_kernel kernel, const int nthreads, const double* const in,
+    const double* const scale, const double negative_beta, double* const out);
 
-template void caffe_gpu_add_scalar<float> (cl_kernel Kernel, const int n, const float alpha, float* y);
-template void caffe_gpu_add_scalar<double> (cl_kernel Kernel, const int n, const double alpha, double* y);
+template <typename Dtype>
+void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
+    const Dtype* const bottom_data, const Dtype* const top_data,
+    const Dtype* const scale, const Dtype* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype negative_beta,
+    const Dtype cache_ratio, Dtype* const bottom_diff){
+  cl_int ret;
+  ret=clSetKernelArg(LCDkernel,0,sizeof(cl_int),(void*)&nthreads);
+  ret|=clSetKernelArg(LCDkernel,1,sizeof(cl_mem),(void*)&bottom_data);
+  ret|=clSetKernelArg(LCDkernel,2,sizeof(cl_mem),(void*)&top_data);
+  ret|=clSetKernelArg(LCDkernel,3,sizeof(cl_mem),(void*)&scale);
+  ret|=clSetKernelArg(LCDkernel,4,sizeof(cl_mem),(void*)&top_diff);
+  ret|=clSetKernelArg(LCDkernel,5,sizeof(cl_int),(void*)&num);
+  ret|=clSetKernelArg(LCDkernel,6,sizeof(cl_int),(void*)&channels);
+  ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height);
+  ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width);
+  ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size);
+  ret|=clSetKernelArg(LCDkernel,10,sizeof(cl_float),(void*)&negative_beta);
+  ret|=clSetKernelArg(LCDkernel,11,sizeof(cl_float),(void*)&cache_ratio);
+  ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size[]={nthreads};
+  size_t uiLocal_Work_Size[]={256};
+  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) );
+}
+template void LRNComputeDiff<float>(cl_kernel kernel, const int nthreads,
+    const float* const bottom_data, const float* const top_data,
+    const float* const scale, const float* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const float negative_beta,
+    const float cache_ratio, float* const bottom_diff);
+template void LRNComputeDiff<double>(cl_kernel kernel, const int nthreads,
+    const double* const bottom_data, const double* const top_data,
+    const double* const scale, const double* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const double negative_beta,
+    const double cache_ratio, double* const bottom_diff);
 
 template <typename Dtype>
-void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){
+void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&in2);
     ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
     OCL_CHECK(ret);
     size_t Global_Work_Size[] = {n};
@@ -577,122 +650,23 @@ void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype*
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void caffe_gpu_mul<float> (cl_kernel Kernel, const int n, const float* a, const float* b, float* y);
-template void caffe_gpu_mul<double> (cl_kernel Kernel, const int n, const double* a, const double* b, double* y);
+template void caffe_gpu_add<float> (cl_kernel Kernel, const int n, const float* in1, const float* in2, float* y);
+template void caffe_gpu_add<double> (cl_kernel Kernel, const int n, const double* in1, const double* in2, double* y);
 
 template <typename Dtype>
-void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y){
+void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data){
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_float), (void*)&alpha);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
     OCL_CHECK(ret);
     size_t Global_Work_Size[] = {n};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void caffe_gpu_powx<float> (cl_kernel Kernel, const int n, const float* a, const float alpha, float* y);
-template void caffe_gpu_powx<double> (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y);
-
-template <typename Dtype>
-void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
-{
-    cl_int ret;
-    ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
-    ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data);
-    ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem);
-    ret|=clSetKernelArg(kernel,3,sizeof(cl_float),(void*)&scale_); 
-    ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data); 
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void Dropout_fp_gpu<float>(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
-template void Dropout_fp_gpu<double>(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
-
-template <typename Dtype>
-void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
-{
-    cl_int ret;
-    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
-    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
-    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&MaskMem);
-    ret |= clSetKernelArg(kernel,3,sizeof(cl_int),  (void*)&threshold_); 
-    ret |= clSetKernelArg(kernel,4,sizeof(cl_float),(void*)&scale_); 
-    ret |= clSetKernelArg(kernel,5,sizeof(cl_mem),  (void*)&bottom_diff); 
-    OCL_CHECK(ret);
-   
-    size_t Global_Work_Size[] = {count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void Dropout_bp_gpu<float>(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
-template void Dropout_bp_gpu<double>(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
-
-typedef unsigned int uint32_t;
-struct array4x32 {  uint32_t v[4]; };
-template <typename Dtype>
-void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold){
-        static unsigned c = 0;
-        unsigned nrounds = 20;
-        array4x32  rndctr4;
-        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
-        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
-        
-        cl_int ret;
-        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
-        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_float),   (void*)&inf);
-        ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_float),   (void*)&sup);
-        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_float),   (void*)&threshold);
-        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&nrounds);
-        ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint),    (void*)&size);
-        OCL_CHECK(ret);
-
-        size_t globalws[1] = {size};
-        size_t localws[1] = {256};
-        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
-}
-template void caffe_gpu_bernoulli<float>(cl_kernel kernel, int* a, const unsigned int n, float inf, float sup, float threshold);
-template void caffe_gpu_bernoulli<double>(cl_kernel kernel, int* a, const unsigned int n, double inf, double sup, double threshold);
-
-
-template <typename Dtype>
-void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels,
-    const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {
-
-    int num_kernels = channels * height * width * optnum;
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  // NOLINT_NEXT_LINE(whitespace/operatiors)
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {num_kernels};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
-}
-
-template void opttrans<float>(cl_kernel Kernel, const float* data_im, const int im_offset, const int channels,
-    const int height, const int width, float* data_opt, const int opt_offset, const int optnum);
-template void opttrans<double>(cl_kernel Kernel, const double* data_im, const int im_offset, const int channels,
-    const int height, const int width, double* data_opt, const int opt_offset, const int optnum);
-
+template void caffe_gpu_add_scalar<float> (cl_kernel Kernel, const int n, const float alpha, float* top_data);
+template void caffe_gpu_add_scalar<double> (cl_kernel Kernel, const int n, const double alpha, double* top_data);
 
 }  // namespace caffe
 

From 5eeeb291ab9538763bfef9c2f5b067913c24ec69 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Sun, 26 Jul 2015 08:36:07 +0800
Subject: [PATCH 012/124] This patch has conv_org,relu, pooling,fc OpenCL
 porting and correct

---
 .../imagenet/train_alexnet_without_dropout.sh |   2 +-
 .../train_alexnet_without_dropout_cpu.sh      |   2 +-
 include/caffe/solver.hpp                      |   8 +-
 include/caffe/util/math_functions.hpp         |   8 +-
 include/caffe/util/ocl_wrapper.hpp            |  17 ++-
 src/caffe/OCL_kernel.cl                       |   2 +-
 src/caffe/layers/power_layer.cpp              |   2 +-
 src/caffe/solver.cpp                          |  10 +-
 src/caffe/util/math_functions.cpp             |  12 --
 src/caffe/util/ocl_wrapper.cpp                | 143 +++++++++++++++++-
 10 files changed, 165 insertions(+), 41 deletions(-)

diff --git a/examples/imagenet/train_alexnet_without_dropout.sh b/examples/imagenet/train_alexnet_without_dropout.sh
index 5f3d3326..667543bf 100755
--- a/examples/imagenet/train_alexnet_without_dropout.sh
+++ b/examples/imagenet/train_alexnet_without_dropout.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env sh
 
-GLOG_logtostderr=1 ./build/tools/caffe train \
+GLOG_logtostderr=0 ./build/tools/caffe train \
     --solver=models/bvlc_alexnet/solver_without_dropout.prototxt
diff --git a/examples/imagenet/train_alexnet_without_dropout_cpu.sh b/examples/imagenet/train_alexnet_without_dropout_cpu.sh
index 15625f8a..12d43fc3 100755
--- a/examples/imagenet/train_alexnet_without_dropout_cpu.sh
+++ b/examples/imagenet/train_alexnet_without_dropout_cpu.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env sh
 
-GLOG_logtostderr=1 ./build/tools/caffe train \
+GLOG_logtostderr=0 ./build/tools/caffe train \
     --solver=models/bvlc_alexnet/solver_without_dropout_cpu.prototxt
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 8f2767f6..a5384a15 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -61,7 +61,7 @@ class Solver {
  
  void ocl_setup();
  protected:
- cl_kernel scalar_kernel, div_kernel, powx_kernel;
+ cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
 
   DISABLE_COPY_AND_ASSIGN(Solver);
 };
@@ -99,7 +99,7 @@ class SGDSolver : public Solver<Dtype> {
 
  void ocl_setup();
  protected:
- cl_kernel scalar_kernel, div_kernel, powx_kernel;
+ cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
 
   DISABLE_COPY_AND_ASSIGN(SGDSolver);
 };
@@ -117,7 +117,7 @@ class NesterovSolver : public SGDSolver<Dtype> {
 
  void ocl_setup();
  protected:
- cl_kernel scalar_kernel, div_kernel, powx_kernel;
+ cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
 
   DISABLE_COPY_AND_ASSIGN(NesterovSolver);
 };
@@ -139,7 +139,7 @@ class AdaGradSolver : public SGDSolver<Dtype> {
 
  void ocl_setup();
  protected:
- cl_kernel scalar_kernel, div_kernel, powx_kernel;
+ cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
   DISABLE_COPY_AND_ASSIGN(AdaGradSolver);
 };
 
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index be1dd09f..c2720cf5 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -97,6 +97,9 @@ void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y);
 template <typename Dtype>
 void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
+template <typename Dtype>
+void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
+
 template <typename Dtype>
 void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X);
 
@@ -124,9 +127,6 @@ void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 template <typename Dtype>
 void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
-void caffe_gpu_mul(cl_kernel Kernel, const int N, const Dtype* a, const Dtype* b, Dtype* y);
-
 template <typename Dtype>
 void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
@@ -144,8 +144,6 @@ void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 template <typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
-template <typename Dtype>
-void caffe_gpu_powx(cl_kernel Kernel, const int n, const Dtype* a, const Dtype b, Dtype* y);
 
 unsigned int caffe_rng_rand();
 
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 0390ee3f..9f2cd851 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -5,6 +5,10 @@
 
 namespace caffe {
 
+typedef unsigned int uint32_t;
+template <typename Dtype>
+void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
+
 template <typename Dtype>
 void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num);
 
@@ -96,7 +100,6 @@ void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* t
           const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
           const int spatial_dim, const bool has_ignore_label_,
           const int ignore_label_, Dtype* counts);
-}
 
 template <typename Dtype>
 void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
@@ -111,8 +114,8 @@ void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in
     const Dtype k, Dtype* const scale);
 
 template <typename Dtype>
-void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* const in,
-    const Dtype* const scale, const Dtype negative_beta, Dtype* const out);
+void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in,
+     Dtype* scale, Dtype negative_beta, Dtype* out);
 
 template <typename Dtype>
 void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
@@ -121,5 +124,11 @@ void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
     const int num, const int channels, const int height,
     const int width, const int size, const Dtype negative_beta,
     const Dtype cache_ratio, Dtype* const bottom_diff);
-  // namespace caffe
+template <typename Dtype>
+void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y);
+}
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
+  // namespace caffe
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 0d8328c8..9a299ced 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -1103,7 +1103,7 @@ template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void St
 template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
 
 template <class T>
-void MaxPoolBackward(const int nthreads, __global T* top_diff,
+__kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
     __global int* mask, __global T* top_mask, const int num,
     const int channels, const int height, const int width,
     const int pooled_height, const int pooled_width, const int kernel_h,
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index 98be1278..94393f73 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -168,7 +168,7 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
           caffe_gpu_scal(count, scale_, bottom_diff);
         }
         if (shift_ != Dtype(0)) {
-          caffe_gpu_add_scalar(scalar_kernel, count, shift_, bottom_diff);
+            caffe_gpu_add_scalar(scalar_kernel, count, shift_, bottom_diff);
         }
         const Dtype* top_data = top[0]->gpu_data();
         caffe_gpu_div(div_kernel, count, top_data, bottom_diff, bottom_diff);
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 684c85cb..2d4b1da9 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -10,7 +10,7 @@
 #include "caffe/util/io.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/util/upgrade_proto.hpp"
-
+#include "caffe/util/ocl_wrapper.hpp"
 namespace caffe {
 
 template <typename Dtype>
@@ -22,6 +22,7 @@ Solver<Dtype>::Solver(const SolverParameter& param)
 template <typename Dtype>
 void Solver<Dtype>::ocl_setup(){
    scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
+   add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL);
    div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
    powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
 }
@@ -52,6 +53,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
 //#ifndef CPU_ONLY
   //AMD device related initialization
   amdDevice.Init();
+  ocl_setup();
 //  cl_int err =  clblasSetup();
 //#else
 //  NO_GPU;
@@ -768,7 +770,7 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
         this->update_[param_id]->mutable_gpu_data());
 
     // update history
-    caffe_gpu_add(net_params[param_id]->count(),
+    caffe_gpu_add(add_kernel, net_params[param_id]->count(),
         this->update_[param_id]->gpu_data(),
         this->history_[param_id]->gpu_data(),
         this->history_[param_id]->mutable_gpu_data());
@@ -778,8 +780,8 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
               this->history_[param_id]->gpu_data(), Dtype(0.5),
               this->update_[param_id]->mutable_gpu_data());
 
-    caffe_gpu_add_scalar(scalar_kernel, net_params[param_id]->count(),
-              delta, this->update_[param_id]->mutable_gpu_data());
+    caffe_gpu_add_scalar<Dtype>(scalar_kernel, net_params[param_id]->count(),
+             delta, this->update_[param_id]->mutable_gpu_data());
 
     caffe_gpu_div(div_kernel, net_params[param_id]->count(),
               net_params[param_id]->gpu_diff(),
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 7a0e57bf..85af49d1 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -602,7 +602,6 @@ template <typename Dtype>
 void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) {
 }
 
-/*
 template <>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
 }
@@ -610,7 +609,6 @@ void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
 template <>
 void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
 }
-*/
 
 template <typename Dtype>
 void mul_kernel(const int n, const Dtype* a,
@@ -659,16 +657,6 @@ void powx_kernel(const int n, const Dtype* a,
     const Dtype alpha, Dtype* y) {
 }
 
-template <>
-void caffe_gpu_powx<float>(const int N, const float* a,
-    const float alpha, float* y) {
-}
-
-template <>
-void caffe_gpu_powx<double>(const int N, const double* a,
-    const double alpha, double* y) {
-}
-
 
 void popc_kernel(const int n, const float* a,
     const float* b, uint8_t* y) {
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index a1be91e2..501794dc 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -7,7 +7,35 @@
 #include <stdio.h>
 #include "caffe/common.hpp"
 #include "caffe/util/ocl_util.hpp"
+#include "caffe/util/ocl_wrapper.hpp"
 namespace caffe {
+typedef unsigned int uint32_t;
+struct array4x32 {  uint32_t v[4]; };
+template <typename Dtype>
+void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold){
+        static unsigned c = 0;
+        unsigned nrounds = 20;
+        array4x32  rndctr4;
+        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+
+        cl_int ret;
+        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
+        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_float),   (void*)&inf);
+        ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_float),   (void*)&sup);
+        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_float),   (void*)&threshold);
+        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&nrounds);
+        ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint),    (void*)&size);
+        OCL_CHECK(ret);
+
+        size_t globalws[1] = {size};
+        size_t localws[1] = {256};
+        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
+}
+template void caffe_gpu_bernoulli<float>(cl_kernel kernel, int* a, const unsigned int n, float inf, float sup, float threshold);
+template void caffe_gpu_bernoulli<double>(cl_kernel kernel, int* a, const unsigned int n, double inf, double sup, double threshold);
+
 
 template <typename Dtype>
 void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){
@@ -485,7 +513,6 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, c
 template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff);
 template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff);
 
-<<<<<<< HEAD
 template <typename Dtype> 
 void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
     cl_int ret;
@@ -512,7 +539,7 @@ void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const
     ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiGlobal_Work_Size[] = {count};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
@@ -580,8 +607,8 @@ template void LRNFillScale<double>(cl_kernel kernel, const int nthreads, const d
     const double k, double* const scale);
 
 template <typename Dtype>
-void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* const in,
-    const Dtype* const scale, const Dtype negative_beta, Dtype* const out){
+void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in,
+     Dtype* scale, Dtype negative_beta, Dtype* out){
   cl_int ret;
   ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads);
   ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in);
@@ -593,10 +620,10 @@ void LRNComputeOutput(cl_kernel LCOkernel, const int nthreads, const Dtype* cons
   size_t uiLocal_Work_Size2[]={256};
   OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) );
 }
-template void LRNComputeOutput<float>(cl_kernel kernel, const int nthreads, const float* const in,
-    const float* const scale, const float negative_beta, float* const out);
-template void LRNComputeOutput<double>(cl_kernel kernel, const int nthreads, const double* const in,
-    const double* const scale, const double negative_beta, double* const out);
+template void LRNComputeOutput<float>(cl_kernel kernel, int nthreads, const float* in,
+    float* scale, float negative_beta, float* out);
+template void LRNComputeOutput<double>(cl_kernel kernel, int nthreads, const double* in,
+    double* scale, double negative_beta, double* out);
 
 template <typename Dtype>
 void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
@@ -653,6 +680,37 @@ void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype*
 template void caffe_gpu_add<float> (cl_kernel Kernel, const int n, const float* in1, const float* in2, float* y);
 template void caffe_gpu_add<double> (cl_kernel Kernel, const int n, const double* in1, const double* in2, double* y);
 
+template <typename Dtype>
+void caffe_gpu_sign(cl_kernel Kernel,const int N,  const Dtype* X, Dtype * Y ){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {N};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_sign<float>(cl_kernel Kernel,const int N,  const float* X, float* Y );
+template void caffe_gpu_sign<double>(cl_kernel Kernel,const int N,  const double* X, double* Y );
+
+template <typename Dtype>
+void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_div<float> (cl_kernel Kernel, const int n, const float* a, const float* b, float* y);
+template void caffe_gpu_div<double> (cl_kernel Kernel, const int n, const double* a, const double* b, double* y);
+
 template <typename Dtype>
 void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data){
     cl_int ret;
@@ -668,5 +726,74 @@ void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtyp
 template void caffe_gpu_add_scalar<float> (cl_kernel Kernel, const int n, const float alpha, float* top_data);
 template void caffe_gpu_add_scalar<double> (cl_kernel Kernel, const int n, const double alpha, double* top_data);
 
+template <typename Dtype>
+void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_mul<float> (cl_kernel Kernel, const int n, const float* a, const float* b, float* y);
+template void caffe_gpu_mul<double> (cl_kernel Kernel, const int n, const double* a, const double* b, double* y);
+
+template <typename Dtype>
+void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y){
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_powx<float> (cl_kernel Kernel, const int n, const float* a, const float alpha, float* y);
+template void caffe_gpu_powx<double> (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y);
+
+template <typename Dtype>
+void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
+{
+    cl_int ret;
+    ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
+    ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data);
+    ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem);
+    ret|=clSetKernelArg(kernel,3,sizeof(cl_float),(void*)&scale_);
+    ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void Dropout_fp_gpu<float>(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
+template void Dropout_fp_gpu<double>(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
+
+template <typename Dtype>
+void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
+{
+    cl_int ret;
+    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
+    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
+    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&MaskMem);
+    ret |= clSetKernelArg(kernel,3,sizeof(cl_int),  (void*)&threshold_);
+    ret |= clSetKernelArg(kernel,4,sizeof(cl_float),(void*)&scale_);
+    ret |= clSetKernelArg(kernel,5,sizeof(cl_mem),  (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Dropout_bp_gpu<float>(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
+template void Dropout_bp_gpu<double>(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
 }  // namespace caffe
 

From b72ad4d06b1d32ea053084be1e6fc590cf5404e0 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Fri, 31 Jul 2015 11:20:53 +0800
Subject: [PATCH 013/124] Port the softmax layer

---
 examples/imagenet/train_alexnet.sh   |   2 +-
 include/caffe/common.hpp             |  18 ++++
 include/caffe/common_layers.hpp      |  10 ++-
 include/caffe/loss_layers.hpp        |   1 +
 include/caffe/solver.hpp             |   1 +
 include/caffe/util/ocl_wrapper.hpp   |  24 ++++++
 src/caffe/.OCL_kernel.cl.swo         | Bin 0 -> 98304 bytes
 src/caffe/OCL_kernel.cl              | 110 +++++++++++++++++++++++++
 src/caffe/layers/base_data_layer.cpp |   2 +-
 src/caffe/layers/conv_layer.cpp      |   7 +-
 src/caffe/layers/dropout_layer.cpp   |  33 ++++----
 src/caffe/layers/pooling_layer.cpp   |   2 +-
 src/caffe/layers/softmax_layer.cpp   |  75 +++++++++++++++--
 src/caffe/solver.cpp                 |   9 +-
 src/caffe/util/ocl_wrapper.cpp       | 119 +++++++++++++++++++++++++++
 tools/caffe.cpp                      |   3 +-
 16 files changed, 384 insertions(+), 32 deletions(-)
 create mode 100644 src/caffe/.OCL_kernel.cl.swo

diff --git a/examples/imagenet/train_alexnet.sh b/examples/imagenet/train_alexnet.sh
index e62279e2..58e5229f 100755
--- a/examples/imagenet/train_alexnet.sh
+++ b/examples/imagenet/train_alexnet.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env sh
 
-GLOG_logtostderr=1 ./build/tools/caffe train \
+GLOG_logtostderr=0 ./build/tools/caffe train \
     --solver=models/bvlc_alexnet/solver.prototxt
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index debc73a3..07d26556 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -69,6 +69,24 @@ private:\
 // A simple macro to mark codes that are not implemented, so that when the code
 // is executed we will see a fatal log.
 #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet"
+//OpenCL:  various of defines to choose the design schemes
+/* ifdef: use CPU random generator in dropout layer
+   ifndef: use GPU randome generator*/
+//#define use_cpu_generator_dropout
+
+//#define print_memory_trace
+
+//the following are macro defines for optimization schmes in conv layer
+/*ifdef: use proposed img_packing scheme;
+ ifndef: use proposed packing im2col + sgemm scheme*/
+//#define use_packing_scheme 1
+/* global_packing_N defines packing number of the use_packing scheme
+  for intial design, we use the same packing number for all conv layers*/
+//#define global_packing_N 16
+/*ifdef: use multi-command queues for groups in conv layer;
+ ifndef: use single commane queue for groups*/
+//#define multiQ
+//#define check_gradient
 
 // OpenCL: various checks for different function calls.
 #define OCL_CHECK(condition) \
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index 4e884f21..a92bb4aa 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -482,7 +482,10 @@ template <typename Dtype>
 class SoftmaxLayer : public Layer<Dtype> {
  public:
   explicit SoftmaxLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
+      : Layer<Dtype>(param) {
+     ocl_setup(); 
+  }
+  ~SoftmaxLayer();
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
 
@@ -499,6 +502,7 @@ class SoftmaxLayer : public Layer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void ocl_setup();
 
   int outer_num_;
   int inner_num_;
@@ -507,6 +511,10 @@ class SoftmaxLayer : public Layer<Dtype> {
   Blob<Dtype> sum_multiplier_;
   /// scale is an intermediate Blob to hold temporary results.
   Blob<Dtype> scale_;
+  protected:
+      cl_kernel channel_max_kernel,channel_subtract_kernel,exp_kernel, channel_sum_kernel;
+      cl_kernel channel_div_kernel,channel_dot_kernel;
+  
 };
 
 #ifdef USE_CUDNN
diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index 5aa02be1..d1408fd7 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -668,6 +668,7 @@ template <typename Dtype> class SoftmaxLayer;
  *   -# @f$ (N \times C \times H \times W) @f$
  *      the predictions @f$ x @f$, a Blob with values in
  *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+ss
  *      the @f$ K = CHW @f$ classes. This layer maps these scores to a
  *      probability distribution over classes using the softmax function
  *      @f$ \hat{p}_{nk} = \exp(x_{nk}) /
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index a5384a15..79285a4a 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -51,6 +51,7 @@ class Solver {
   void Test(const int test_net_id = 0);
   virtual void SnapshotSolverState(SolverState* state) = 0;
   virtual void RestoreSolverState(const SolverState& state) = 0;
+
   void DisplayOutputBlobs(const int net_id);
 
   SolverParameter param_;
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 9f2cd851..5e86b1e2 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -88,6 +88,30 @@ void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype
 template <typename Dtype>
 void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y );
 
+template <typename Dtype>
+void kernel_channel_max(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_channel_subtract(cl_kernel Kernel, const int count,
+    const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_max, Dtype* data);
+
+template <typename Dtype>
+void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const Dtype* data, Dtype* channel_sum);
+
+template <typename Dtype>
+void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data);
+
+template <typename Dtype>
+void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+    Dtype* channel_dot);
+
 template <typename Dtype>
 void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads,
           const Dtype* prob_data, const Dtype* label, Dtype* loss,
diff --git a/src/caffe/.OCL_kernel.cl.swo b/src/caffe/.OCL_kernel.cl.swo
new file mode 100644
index 0000000000000000000000000000000000000000..62349bbdafdf55d217551523bd623664e7967190
GIT binary patch
literal 98304
zcmeI534B~tz4!yDSQHci;UU5W1(P-<&C&(h0Hw5umKLF`v5wPZk_=5UAv5Vh1)mBo
z$mWLno(pdHM8O4pATCc)d?<>jsJI}DiVKMPN7N@E|L^bY_s+eOnWSwHl25;#nS0MY
z_nhC^e!t&2_uO^!PC4%QXyKt<P5eE(sp-+%t~>eIXI;JH<*#aL+B`mxFUj!<{+1qp
z%-j=~pR|1GiOXJGE9ki@!nWRge}7@_aA9nqF!#*yk)fhFvU#2U(pDbpHQ!b*TbWy5
z7#k@J9ojooE9#kZGZL7Qz=RUm_2kZu_IVQQ1qU4%?eUzGC%j@Nz>EZDBrqd^841it
zU`7Hn5}1*|j09#R@WhcoW%RjCuO>TtnJisoK6glb?r45rXg)7aoPSE<{Jtgv8_!b{
z=dVbdf0{Xesrl?MH?-I9Y>vOoeBO{a|8#SFiTT`PC;i1<Kg%5NHlM2#=XWv34>O+^
zCeA-2asI}{`DdE*3(e<06X$m|$B#Cj^Nql4`gSwN+s)_3#QA5L<42j#|4N)Uw`=M#
z=ac1wYe)U<X+D>k0<rght~ox}e4d&(zqdJV(|3O2{O;zwwJ1N5IR6}T{s{B=kHq;s
z%<<LcbCIbKHhnhT3(Wa9B+lRWnErdSIiIY5zfYV`*1zAG^Ou<GllAYn=KKlfe6s%i
z#++Yl&L`{Nz2>~FpUL|7Yjb{)xjtF{ew8?%tbe~W=dFB__3s{YeyNE+S^w@f=aqAS
zFv<FNmpOl=xjtF{?lk95G3S4qsGq+u=WYEsGEx3_nDaJ$??{~gxjDbw#Q)>O`Jb8d
z)&cqaME&@wIdAJzf8xA}cIIP70y7erk-&@uW+X5pff)(RNMJ?+PiP6``<j|W6+Thy
zX^cc`-~S$i^G9$s{0F=qUI8zGPB;MmhSB;<_%3`D-UhFNK{yIpVITMt#_!MJM)(Z8
zA1;K=&=0F&G3)`4VMyNxUxQD=yWl){IlLIU;F<7u4D!3-JMa;B3!DQd!%~<H&xHpt
z?0*X1fcL=#P=;5+@o+f22=;_WaT5Fvz6w{tyWuQY3oD=lc7ea*Y`6!mhb!O`*a9cP
zkuV2#f<NN4xE;O@?}0Z$1y;f$I1qM&hbe&H!VlpF_&B^1O3(wx!U6CM_zQN=U2q+I
z7%qiDSOeYA0z1GTu)lr+pN99s>tPID0t?_E_y<mdd*QqAMfez82CsurI31S5^I;Y|
zL>>7Kd>%ds7eha&y5m~OtXZcWx}m3YZS;!nNWRTmyST|6>yAz=RaWLYJDPQ%*62ij
zZ0gFj=jOH8aP!tibM&>Ha_u=$u|H~FHC7rQ=_^M^L>=>6qAfm3bDNI3ZZj{}?rzf&
zzl}<EdrGo77EgBjT9@p3smX3nO?H=0c2_hvB3g2}z-AhO@pFB-r^{tTCy+x*3^si_
zl+Uh~7NOGb+Uf7=Gv}|amAQqf!cw=HR2eT!6;^r~2g&wT$tRm*@iOW3WwIbO*)bVb
zRn1yrSdg0RbZK`?UfT0hlRbH9&reNuy0qJUvKy5)O?RudsIJsxr%QXDPquGS^Q&5v
zFM8Lad`;5L&^8}cuU(5$U01UEX+O;0wb2&UnJTQwmvLvRuu{u7Om?GX+>x4WFIF!G
z-`Uq{Q5~ttPA%j1MU$6ydup;LFYN}#sP=_E*^Nqj8po*i1wPrnMRirRDBpZsi}GT1
zEy`C0*P_(mR@I_{0o_+6e~(&KX&T3<_8{4fmhm)>QSE^q(x|kjag1u8f<`cnV^n*f
z5%?C>S=FL^^KmW8Hy_ucyl&`PRC;G04CsxP@pO(+L9!cdQPVj_1$s!M(w@dKO6f6T
z|2Jd1eM%<Au>bA+@nhKc_ru%aEI1vO!G7=nHvRYDoAAHz4QPXRV86c|UISv!?*$KI
ztKSCShws4&@NMk!o8dY*9Tvc|;kVe}pMiHk1<r&)7=RnGyCXONo(UIWU%wFsVL2QH
z2f^R5r>}s^;au1Z1JD7_hR3j>Z-(pP^)L=gp&kB--TVN20X`0!U;w&ccX$Tu0#ApX
z;V5kA3U+ZBUJ3ibZ?T8J0b(D&23`rr!v63hWPB@J3)jG>LFC>{y7q+oNyF_RK7dcc
zd*Ds50eazhXga4Uii9XCjaPb#BYlNUVhauDH*LrdEmkM=>Bt<@C!727m3&Y0NHja@
z9n6pLL`6B86ZN+6yF8k&6!Sy5zTz;4${12QGSqsxO@C=DYA%jcqF!Q}x0sJ5E{es`
zoH@j?MFm{rZ#Z|w$~C#;m!76$kc-P)tP&d?9Ny3e7;ypGQUdgo0&>c&N5+TaX|}hQ
zyXZjO(LkY+8yG6BBUi<~X7MaYPI^ltWeG!Se3o0y;We`(5m9Tz7bGjO=FGDv(S}m7
zFVfGU<c#!k$f(&zNUCf^N>oO~txc7}@aRy!QizuH4&}?`XblS>+_79PU#W~0*Ns;S
zxm?uTJe(gH7%KGTM)Jdj=5ndO!cB60rSWw`g_f3<D4rg3K5EXHtVTNI>}YhXv@WMJ
zQ*~~rR4%s?la`LVWGKI`Fx0AKqp~8Ipgu^*p;B*t$ORlLl*flEC-siyE4_orwXwYP
z50&zjda|Mdrsu?*N@T<yohTnBTrwLpD^*2w)0ria<G7+q#sc;X6#MYbNL`v$eM0;<
zRYz%2GPKJj#r|flrwX^XY*7aqh(na&SR5TKjYc85LoAA}hND=NjABv4QM4~e#(~Yz
zXjVF_#L?NfJQh?5WPy>$y8PH!u`t%WY~@RHD_1XDx-xhC^5b(Wmakg2-1H<3lw5m9
zmDD(-v#jzx@!d?Ntg2heV>oxHG|=o?d{2?i(RAb$y7k^%Z@yevLUTT%S+((=;@XzA
zzGqh*dy#rh&0HK6b^j_~sLSQz*#%QmQ*O;zaOgZ;qja^BZj~z}F5mmg5!e4pry?yN
z79!}M6CJMU;C%l^&+IZ9^kE!MbOv%{qK<$tv0RZ;*OyJcuP$zwsJ~{I-4JDu9H!Gv
zbkAkF;6jjP+tW)Y-*6{P0&=Pw%<PC8N~7sflSwp(O$wFTBE64QUgVpcvl>hr^-VI_
zGh+G&2_O|F)C&-fu5oe_y+kVoR9CMX%$6|b0ChO3lq&flQ;Ths>R7xM#_FjJS6iEq
zP;G6aV<fV_`5-r17dO0o<RHo<b+B$ou+-3`P&uRoI+eOw*fg41M>$tjH>-37NkK|o
z6s4(NeL&KXQ3Y!zA*fVA3d~WRfS{(u6lCJnI<P8L+XbqkW_Bs4K`l{}9F;$*f7z8F
zUcxogZOSc3Ge;!7Qz=O4W@{;k=6s`l%~mL0o3e}|ZZKhu6uLEiGOVLn+OnbTi~1gP
zK+<iXsui8_6PYnms)4hBWT3<Xl3`K?S-#Dg)6z66)9_Ifm$9m~rAZ=esA4rJtp=-_
z*#DTZO($TVhV%bo`+pri1@DIQVJ#d7^I=!`KWzSc;1;+7-U{c!DR4B*0h#}QD*Qiu
z0e8Z;;dAf-koo@tybM}E{00BSKk$3F0j`GsfH8Ol91ETB4EP)Vf?vWH;G=LUyb9L9
zQP2ucfj{9p_zB4T|21$qTm)xBFRTWc|K9^1!>{mT_!@iy-UXxZa(FR_|3Q2Y58-FH
z8$J&ofww>rPKG03HtYxwVAKB;J_GNAH-OCl=V28r2C@BjhexsPW&Z!Ga21>nXTce;
z0``Yp;IH@{Zh`CJLvRU*kK!bdHPw5=PH;bkdOLgy-UDxhSHen=wcj%T|15X}pT+Os
zhwx>%5?%`<a0Yb1{;)GVNFBKuu7wZ578n3kcc?z{H!JDf?+P5C|3c2Ot|_=8r*Q(O
zD{w`6cXK<6x?6&QRuo%O;QY<82^Ss8d!?PfIdx97a|&~!(>Q|@Cc9C8^EA%jbWUMT
zbQ))HI;SuvI*l_povCx8hgG#G-+Ww)^3BK1;P^=$*P>GAL_34Y#YVf$X`I0clig^G
zn#LKN&MC}^PU8$t=M?5dr*V+NS?3n;`9W$?Rg3b?$F(Tmd|ZpFcTTi37||PTQPVj_
z1<7u-MNQ`z6(}N&N_!f|sE$C9X;j+NI7Z=6PH9mKt6G$AKCVUi=HptFU)bfwC^fhx
zRr`)$KyS2+r*Vu5lig?;PvaQX5$GX}N;`Xus>l9+54L^<UI|J2|JT^|UxJT;*#EDF
zAvhgYKnFYzb_TKkZ-#5(64(NLZ~`>Lp3nrhW8Z%ru7Y>NS+Ew4g9Bk#_$#*kJ#al-
z0dIvf;S@L$#Qy&`cp5y2ZGR`+1fPWqVH5Ph5@?3qLDv8O5bXN@^Z738|IYwf|Gz)j
z_5ZhE-^=>{4}q-zAAplU*8lGfJHh?f^*@1cz%?Li5-x;s5Ffx{a4?Ad|2u4ZvH!me
zSHf#y1jPRD0I~mfh6l0jZ-#3@?Eftw_Wud61ong`5c~h@Aol;eLDv75L3{wm!(kvk
zfZafz0r)<M{V(hP#r|Ipr$QS%55)ez6I);G|IfnZunGDgX8)@dw7PfEOZ;so<m;^O
zg*X;p#vLrJZ(xz={Gj{uxA6*jt=nWb4QDY|trn-(h$lN(ecr$#(fL9D*=R>Jjq2O{
zpd<3?Tl=<NB|3i!7SS~7Z(*_<E#zs`-{w!jBAQ11ZT=K2qG{CM=BHXj?b~{p=={L8
zXtdv)Mg=ZRcB4&d8Wp(tQ!t69ag^$sf=M)uqf}R*69l7FhgC<YINHbUwUMo16Lke+
zdZSHh8b_%x*^M@-X&j}x0#&3@X;0%Q)fK2RjY@kON2#trCkUEUdsUP2J_XmLyib90
z(hbk+=g!TbJbp<xLsu}SH(JQkI7)@dZnTiM-zX*a|ACl{=U~Ul{QsqV$lvYQ`ZE82
z70CR*JOg0o{}1H5%>Vxtn_iy(zZJv>a3x#@;saO@%RrtB5c~f@?0T{PZ-UQ)JpaE5
z#Qt9bV*l?Bk7C>Z5WWgx|DO+H|DORXV1L*J#Qwhp#Qy&f$o#)N|1Um(Q(+m14`3gV
zH2^;cvHxZMU+n*J$irc9Fo^vx&;E=3|7ExmUJD~2_P;#)FZTb=@F2GR&2TM<{V&h{
zi~WBBbc4+Qix1!_@F#40@d4Zj?}ayk*#F0a*#8HC>;Z5uw*B`(?ElL^?Em#}Dzw4#
zK<xiJvF*kF|14Y%@{B?si2dITV*fviP5*oNF^CVq?g1cvfQ#YHZ~@33Z>PXK5I?|U
z*!X{fpMmTzcQt$*WIq7$2@Jx?&<V0G;3;qq_2zE)FvyyKb3xT5U5zIAUoD2=zslfP
zq0m3JxocBrN3ORrma9b3Sbn6hG>lc*WJ8UwFKkvD3SfPk@AL=mJAIA&PJ?5WY2T*c
z>exNxa&96Yennv{gh_?mo3F$s2a1a$pB%L=DUwsodZ80*icV(9n2*pLm^;@W8kaR?
z<~Ua}G4HY`YY-*lBwG^i7txv}vcW1w6$=aXwTxU0`mo+8q17E-qr_VtEDiOA2g~E5
z;m_hozyFyc36*^x!mSFFdv-;$bxKImy24m%V)5e%C$2g+7e?8fy43cZsEKvemGQBW
zhzCbz$xr1>3Wl0(Jx8=~3I@?MYHwk(8y#w<QF~iB1%qfBwYLRRFo>p6ds~oh5N+#8
zq6Jehh^A3{3zOYwliGgT8=Yi)Z=3~FFo>qn$FN`u2GKP77<gbPW!klUTkq+!U<wA&
zG>%eXvKwtu(>O{k2z;}RnxE4-N(HJ+qtc$vQ7X_00>y7zZ|XBY7}Faq<Y^qG!elpE
z$a<6#`~QVlY3K1Ay4e4AfBn0$?|%$p`=0}wU~l+uZ26PnFWBmT1liN?3-Eck4rFfs
z7ue@_KodNM{rxTYCfo#{gF$#H91aJ;0q}g-58j3yzB4=xK8vk=DQtq(upE}b(a;9>
zVqf0~*T8vj4u~y&Hptw251awVf;>Av7iPgd*vQ`ndB$JX&##B0U^kF`%Wr_quow=3
zr-RIqe*><7ewYWl!OyXGZ-KAFm*I=>1yJ%t1oiy6=MH1fgC`ybFAjI~mWDE(sL?0G
zCw+oPje_TAa`H&Ge`<Ybcpz8m?=KfB;nBfDabPfZWMi?fG8hI~FScrUU^L$s{w(uU
zdtWSojx<LW!f+~Leb5<{VR93}`DxPIB8Iqr*vCKe<G*3FNAX{BM4oJto8f=-$%$sW
zM0hJ*Ldze!Bow0A_PTEKysJLv|Mvk@$l`F1JcX;G<(Z;0g9jSzH9QJF7#)H6A02W?
zw9$PV<iR4HGJSlBjzm8!Y_FA1aUyKQNw6_j9GP=wb-LyJAm<0=d@+7ro|`<Atem+-
zmEz(#bIuGr3+ky#&KBL-VsKXCEq2dqmFt^qf=s57YO~!Op(y)In4HW}sbHfCD-cO^
z;%Z$e?sA@ti)IH!BvHuIazR-k*o|_`N2Fvz)JiNW23-s1Qa9tFL@G)+HAC6F5UAUJ
zmwYLynU8q9L8xy1++dr}L_Da<Pq%tB?BMh;vfBj@#imqFol!}aezff1L>MZUd*;lP
zovp#-I%oy-1hzft)Y+2k)`9dRajuWhVN?~6=WV;IJyh{0B5QU^<}tNIhg5Vo3wsGI
zpzw&wad==`c1Sj<NGrUh7al@(UHF8LjwibHUU?aAZeVmgS1t~WWImW14;FOvtAdW+
zJ{5ji%CXZkpUthEpo{^huJo9mJWP%`B&o3v>W2OF={-0<aosjIv$?>M)6}n)xy|~^
z5{|Xl$1-Iz4g91_qE)tgl8Tw-I*nJ%x)M*RhY9EB=^5qhGvL43zH!|+#XDwl6|qCl
zcww=eYq9}cK4e6-S>a^;xsi7y4;O|R#5?>}30yuq(l-z5tV%$5z}(f>-&!h2sdtDQ
zW$j1f)8Uh0Yh*kn`lyyhRbsLpAx~6OotBK+nU)k&^WrH{2Qw2w4V}fmS!snht7%rU
z=w?l(%)G3qVe!PR$w*jwZ`0J)Rv77vO8t2I3v7s?|3(HP>07jSJ~vPpDX?)58#?LD
zjL@N6$5pG~@!ucKTewT(Lqo-5GPWbCcJe!oE^Ue+ZOTVaF_^7}XaW7L_Pfkg*zYot
zpeHk;*nEfiWhO;NhNX*XICm@3Hz?YAnbS2+<xPK8S<_z|$(ijv=_;ywop3V7_C5V$
zBd9^M_;Y=a?Stghp`uPUvj>%#%hFMFshzxAqPA!Z<tu6?vFuz;>RaSt`H}vph5fi)
zvX6|~qr+uN%SLQJ)eX+hx<SX<t#gCTSvP3kN;i;cxl|F&Yy1YO1)&>?E-5QJ`)LY<
zB!n|*xf7d5%zU3rP`0WWO*=+%>6RvYz4(dM_^7r})3HS@O=ADcJMWeo``^BwO!oA<
z5k3PKW7A&*>)}jT0TCPkk6`0}3*HE)!1G{VcnDkmR#*?ug}vZDZ2B+3f51jK7H-Fm
z7aRWwm<xZycK-wX27V2{f>&d|?~UF5T-XbCguh^ue;+;p@@)G!^ubct1Mb5%zaBQg
zQSb;h`Hdj^^1Tdpft$(mn?U8g9z#7VjFqw5C}N{BGsQj=EwNn5k5v}yA80K4Af2`B
zo~M3@7g@*QVi+_rs81V%K8TH|qBSw7A3g>dgnCPb{{EazL~}cBR@*|(DpRo=8&$r$
zD&sHLH>1YKE;rVXG49l**4C9Dw(Uza+Or^Srk5c0Lj8$F=Z;~?N$29tptK0v+M>c)
z<3$vhg4m#Js#h518(UnJMvUy_o=!5{cDOi#mCbH~${9h2)!rK1B$i`+nJOnquh`)#
zoh{Lk*yxALk-7Fr7-5)QSFGmN8xyy7R$A=N`YLzlja7HvSo_Y%rMEOZ$_|2R+}9DY
zav}<p0Zc^#1X7Zn9JPtq7B#DMIr+yDCoTkIOSTzT1lu?2f<m5wVJ)(k5yV)e&iN4p
zQR)mq%=nOHrDAl}GCu=L)_r9k%Y-2-$<awkTS$?x21-L6#o@rD8VSrNlrnppP*e5y
z3y;=ery3UlTQGVroT_QzfvxCFv+0lf{k)l-Vbav6IB5evR~oHk*#tgBhJ3AoU_1|@
z6w*ZH6CQ@FL+;yB#<4t5Gs2{$R}>`1N*VEC)}^3&DJng&dfPT1AVbXRI#ZS7RUu8v
z|GIABQ1o8NMpyQFvj3(>UGEg@(Sw#69!0~&&n4ewHCaNh71yxoQf(x4MPq8Zy^;2@
zsrwX%6P{wa$jvy&oO@XP2tCA`!n5i}s>j&%hALZh>Faah=(|3r#$PQjHxQ_>frdbT
zzgZlv;o!`%O>rlzh{E4a_2nwn-W@K<w8N$roP<|_y5(bpNJ(FH5S71fZfJsR^g)!<
znpnt*Dk$~E4B?CA2itUQX_CA7kTSNhmzRhl2sx-is-TiAQyWx1io@(BOzv%qagwqU
zA>rnnB+)7~_N-2A5GmD)B-ksoWG5~In~GZ6lH5dzNu;GCHagH;KV}=LizD2YYdnw>
zg1)64+l?Zisz+O(PoeEt$@r{>x$}8d($RXL(F?1uAWTyCsJdkf%1HG+b#BrwmhfI~
z{+?P$HiHL;K6fPqs{xQA_qAH*Fb&%V(hb}Bv&}kBk$Sdbl2|u6VYi+>x?7Qio#Eqh
z;IzqaWXw!=el;+R)>c(A#?ROgA0I#;my(U8H&eW#fQbEnD0b};#{Ta%HtwgH>;EXc
zA6^Hq23ZFn&-4EZ`~H`3EBq6?{^ua;@ZSk#koEVS@O1bQHvXqT-n+LHc7g}6^KXT#
z;bM?A_J_jXvGsohvZnqW@Mbt4UI(v*5&QtM9{v<q1_v--FKgXpo%=O#Ia~rma0EO9
zeuC})U(g4K!tb&BuYpTpAGnEp|2wGsXW93e#~D%hn=YfBp~WgWO)4R3=+(x5z1F*r
z!wyu^&Fa2U59M3MjCZ$Tkfdj{gJV+3u=ky@_QfUY9T$OI8n^bf_M)~(HiI{D>v6DZ
z(AGs>%Rf*x=ITqHM6Loccr(U{hDNkH%+|D_d|R5bHvRPsai;6pN&|9k^LTzF)3TM5
zF_F8urPxRpPj(&h-IlV9rHnqcL33Z3?30yBw!J^iLJ1RQWHNzuo^7kkW28DedKhsI
zR@+<ZF;V`XS;sYJ&dfTlf3bC3+E;-GYO%5ymyGDoIo*}vQRB`KbCO@!oWWu&S<?}F
z=SH@{8TW)6do#PUO&=bnxl6H$lI~;*gAtcA4WK*i>TyV)jSWFj`WQsW(D3Hx<d)cO
z@1Z|7T*U`cZ{pTHhG}`J-o8O$CPs*VNm+(@s3l4n+wAye{lcoTioe;t&drAjzeN&2
zhCu6*G9G0NEN%PYsZ`Uz`j;%1rBJZEg2`e0&0_x#Vc!lJ`(M@q$ln*S^@pJsdf-S{
z42QvO%=-^v`>%&H;Z!&aTYojo1=)9B_T0M_J_xec{&DbB_%3$-n_&&K!(G_*?}jsA
zAGi%0|9sd2WwH0+BzPILVDtYIo8CU(E^G4-hMnLJY<rpS|0rAl>p=FddlVaA=Kkf`
z^o?*9ltATIXLY;%Uze8(wYtLUjB-ZXv8OY-qFOa?s{N4b@4}mqp0J7XGvziMHS=A%
zw#Vu`Cwt^FSMlR5!76hy)VW|Z4_Q;7M;`B6@UsjuzoGBS7d^vm91r#f(+9LZ`BOew
znRV9V<n-9cVK%EK()G!@XUt6r=&O`85e4~kU*{-EYR8U=yAi)Mv8gJWQ3vgSob35h
zDn=cugC7T7{j)b99d*~et)v-b7-V|5Qzv6WaY>Giu7nQL-H;Hr&$5A7a*OrPR0qTU
zWF-}LE{}gfg{jqYdK`@Dx?2nU(sm_Su-Is3IT}_HV}qWCoLP=$#=)S5Fkm!4d6uJj
zW80_!ERzl1Be<<h?g<ADJJvn^yF!~%y213pw!XzqMk0*uwQcblUijGPl@J_1r>cic
z%MctY4Bj3Ok0P-~7L|E9t1&FgO0Oa2aP0e6m2#Meq`sQ=Y_u2h6J(qh`+sjN;Jp*}
z|38@Lm-qYq9b}LHv#|MB!5sJp_Wt+bU9bk81HZ=RzYflY<?sU70e*$ee=~dz#1F6%
z_Jez|{l5yA!G&-hoC~jn6QB!T1hU`W4)7?xfrsHJ`~a_J-v3;X=k^c6_HTk;VC#Ph
z-UKJWt}ud~-v)msZ=ZlSz&cn0D!<v5zg3~+bE7P!*2~G|YpYoaZ7nt(bkGNquXd%m
zTTZP*ZL|x$<@J;LM{8tXPp^YS(O?)z|ELb+JE75%JYpc)3<I`G##|GHdSpkXer}|1
zmPn3RzG3e4jah%k!(N4<+(wgCTuLlSW|d$(%W`#??Znk=7+~ykeYw0X-eyrQl?7hb
z;+8z{-g|AEw?*csxja}{X0Mk|b$ctSQ8D=Sd^rpH)G{~L5GI2=@sDow!DWHH%3eHq
zG6(L~zSl}$^(r)*Iuex;hg=iOWc=PF&&tMyQ?k*)(sUJhC@9}oHY&@un<H4dp>P6)
zwW2h(F+bK98oMS1@x}A&N|j1!I47QmaI9jW3V(VV-&gB8tm2pi!}!AD3Xhw59pB2r
z)pdVwTam(R%`w`94=2?o7MZF?t<LI4C90Ho+cLS&(244D2_B!{6v#YFEsBS#T{2cf
z`d2ERK<-m4pX!+#TS`IYrF)spMxEvL!7(c~2`;DT(I!!V8TwWu<!98E$3gM=#;~0l
zmoGdcggVPE2yKeaYjD;dZOhfZlwQ3ix+uHIN|%0)UPiD*{fJcENDnBFTB>`P$NFpv
zW1-#jj=j14j8F1#UvveT>gkSx;_l%f2}?XMAVE1wP#F#bPq!rB{7K@%h+-kv@;I-)
zoqwQEu4BZJ!O(0vqXu0ZNb)Qrx{i8`F%0iQ%lM}in8l<>o?OtO2gA@oVo)@iW#zmb
z5BS~d9}Tow@?&LqX?K&!pi&H+Am*|=xjm8no^A=@lmo6!*UctY%zSI8L#+z*p`dT8
zN23a_RHfq7pxSy(Jvx(<mI^!>B`qGq6!j>Tib*L_`pVjxlzARR)>sgn4=U+MT$Pe3
zR`2~$k=9;up@QhhV#_kjF3#~K7stnj_S%Cs+C)&{Lp|h)txowKKhURwQ8{g-HX&6n
zJQZ&g_P(0x%2K-Q`LN`Iq*;YQ&AcW33sdz&IW;qO7g7{f4SuHFhoi<<B=-N#*u#&3
z?EkmW*u?iT&;M<B6PyY!fIr{|xC6cm?}fL(rEoFqfL}n?0F1&~*c%)FA?)@az(?R!
zuoND_hW|Xg4PF6@K=#T1Gj{za;4Lr!i(yxIDm=!#|99a1a0wLPFn9*sj}PDjP=r(A
zcvucc!eOuwWKF;x_yL{+DxcYQ|C8;EQ(qh1x9X%yDX>Y_jOB|X$7JjblM*y+F4cCU
zt<O)y-ma?AqOTeAxaySE8|yVQQUsl0Yovs%T_lemgCkItwvNT&#BTDKQXqltpltcY
zu{Fz9akrmw1zx|=z$>A=?83MNyg3k5^q#mr9$qJVw(tn*CVyRXEp3@OP0^iGtN<|q
zddG*$<Lc33X@BKVAvdScdnQhoQ}sp9(i(JzVc<eeXcU<HyC<%B80e$|nspy;x^nq&
zS?X$c#*&RL*!WEAZigzxQ>);r#PrTYBrGX8EDV)>EnQn~_LCQWXyHto1jzA<;#j#7
zWSGKI*$aw<jD^1%u$|V}93taGwGbv_xdhjw8X(D-8!&Br)XwONxp<RNPufoLH=2T>
z;v;F;QVA}tZK}|Iv^zS}QrUJ+--d^aY*VFoXuBFK+itWVi?&+{6#l?MnTp3Z?q%2d
zQ1uLLJXGxg{E4hn1#M^ohrE@79fIrCtm2YZgUi&4!)O#i{~a1_+qH>!>1`c-%Evd+
z^hlao<()@vWqhRmY>|2GC%>T}eo;MLsAfR*sG!5gwJ<s6$9EgT#8B5bZ?AEB%!C?G
z)fE$JKK2Uv45~(>ZqelB-A!`D{;i2jwv~b<_WyoZy+;}Q|7c?Y--b;ud;f`jUx3BX
z1hUt^ytnT}cs~4%x&N=h2+00_N5NmP|9=9XgxA5zuroY{PvHOH+wd(o6@S28*!OqB
zx8XdHz51UCU&FQ^gC+29@E7d(uY%b6{|-6mgh!bB|6lkdtb_gFDR3<|{}{Xi_5^tb
z;C{Fj-VZ}C7ak<vzkxg8hamd_d>G20WHNzAz)ICs3pIB;YE@x=Er~|0mwc3scD6L>
zOZ){=Ek>=I90h_4tfPE1@RUmf5Pue--B%WJtL^)B#ZOF7`1<a@C0oJ3SiY|)JL-v-
zQ|yvre}9kCw8A-5SpcvxHrHUMII_Y<-HAB}pYdQeVVlXy$Vq*~Wn>d>(_=F&0tjX4
zdojQNLE6-W5tCV}Fx$@R2eb8SA}3^L()id_-JnQivqF|z+LadW>3?=KQX3bB#Syyy
z{Wa7Z(}7dV%(jItLfJ&nOmzF51$5j=Roc|uNcS2+N(1$s{SfV<4xdsv#uke)iNuoX
zF7ipGe!CqOO^iONoV1%Du&9H`tCDJs1J$N{QbTJ%+rOC`9{7u5`hWbY>LnijGHJbT
zort>3Z!?rkZ>Zr1H3W07p9zCVwrbLwH}^8G>3XMfmL5ROG3ef8V&gzJ@Je2uk&NU|
z=8Y;P4v_w=hes!v%8+QhAoSXYn6qA=v?Cc^i*0rODqAZ<^{27v{iZ}R^irbg<s)GW
z3h(B$o<u};5;6>19jhuv{aQ@tNSa_mB8*Tpy{6PDp`-J?>+4!#V|%bxzZPMN&fG#d
zbM0-y5+vl4*X(fRJz1-wT9_G@MRma1Qh(xF7f$ABwu5v8sx`3Q%9{x|^Dq;}3uOiq
z<My*!kkyOqr}F%nz%`8m{HnHBlY3?PZ2x8ah$1=uf;=(Y)^aWzzp@_SlsnOg&3hCB
zC*z8hYjVdQw@Qa*FgvxNytYO>l~S~>z&6W#<~NksW3jKefwxD^UAKAe*@dxEG&)vV
z$8OJMNnuu`en6>z;YD!GjQdi2==hss+DtfNW~^BCGR$7H8Z}ADrKl*qrE%Vtt)2JV
z&3PFgvQ!e{{=1+Ngd_D6$#H2rNpwN6|BuGTJqUYO=KtG`z56BR`(@w1jUf8~6d(sn
zU=b{Ue(e8uVb{yPf9F9bHvO~VKJ5DI;35zke}9nu{AJF6H9U-ce;vFB<hlNR;cjgF
z&%n9x5|DS@--vzxT37_~Zu^7aF>L#Pz~A9-Fo11;8)^Rmd=cITJ)rV4f$d)3X{o%v
zn%D1|Ymy7ycy&x=^JpQ}Wo*uPHBR+`X2`Fl1y<>MDyAdqeCvsEY1$i1;A6JIYU>>_
zMF@32Y(Mo2E|9j?u7d+H<1&ms?5o1#PP-2xnYzspM3yvrtPE^7B34hG%Y+j}lA#PH
zLe(yjP)&N07Dz%F*5#om@U`9-l$6&Rt<Og5EM7sBMs3wc{aycW!`I_TLlKqgn&PsS
zxYRgoPm6Ow`CaOMrpHE6haO~nILYUFYz%?F$M{*?lR>qf?UBpnPx9f`)S?}@=W|jX
zxY1YC^-NEb4QoG@b7WwAtl<17BHSWxzLJNc62ZOKReUjZxM}rW{AG1IYxQNePnSxO
zs!hkbGb9xDV!GzpT&8&u?KwX>h2BK}{*j4%h{O{{VU=P1^tbaGI+x-^uEdmNg;gox
z)2rbyjHk=``poNDbh<+Sp-WeW-_WK@DrN&7!tk<&e1zeJF}*tBB{a=2Wy||2f1z!$
zrf#I@)M=}i#ml!9-F|U*RYSv(ogROZ4n<^NH?&~LnaVN&vwGYrXIVO2B3NyW!Rjxz
z?+T0~8jj6?%&Y`w>_s|va;XmByOkOBZz#_@nrgjeF;Pqz>R%#M?ZPy=k1rOP-wVnu
zyF{z5e6kj3s`@DUV8hCzS0ep48;u<+8c`!d1wp1YZL@1=PFC!H%;=^Fn_2Atql{5}
zJ@)<CunZ0W*$Y742Po_QEAUd-8*ax3@Skuj>;#`={$KV6TnY2xH~0nQJ%F!;m%?-4
z2lxp79Y$dtEQAHn4sCEP{({YL6#N5!!S!%1JQZ%lS8yS$fTzJX@e|wxp96U>z(RN-
z+=H*+F1QmGz+1@VZ}AoUM*IbEDlCHi;ft&TxE?+ae?$&S7Df(kfqbmx@}A7${K$Yj
zvp$j^E;OH5Sb6e9y>J*NXthZXl<i83ybL<tSi>{wd1~ieuxFS+G#BY3(MVx{H{@+7
z<jO;((L$yjS}Qvl-B6vmF|DUMGr@^?Ud#!f6?LFGCs=T^X4z2QZZmhX2Y6?x90?`_
zj$prqImv@da9hh_>A1Xn0}Hm^Q{~md+kep_>wU_rPGO-QR13-+6d}3t=9tZS>zYKO
z8`3Az3Vwxtv|rbt>Q!!%7ZaqeS{#fvK{a)6^NmZqp&DvZq3XwE)9I>VkW60#TceE1
z*Va~lzp&Mk*(KE@nNAK;6*3cQTZwHCvaN*uPD?n_|2*8Cm>n+@UK+d=s{h1QMpg76
zd%BXhGUXFbPeuio5!9=jezykKXl(7%fkF*jw{w4S!#+c_f$-*&^jaojsb6DO7gpsn
zWp#2~1Wv>Ej^vtyT*4)t)j=eJEz#kQ2s#UEqv~Z7UhXM{X4>Ktz06EDQ;IAmfGL)k
ziYwMrMrm^mOH{b@0%On>l~vU^>b&|3nv(5WSW;vb7Wy$%!iI`(+HVz~T~5UkLR?Xm
zVsEZDU#=`+z&)bbN+?UmZ3!2g>DXLS>T=D_ZZcPykAA!JLA`4g^>zNTh8%`+u`T4R
z8Tk5rhAyV=CY>t(zm?<EZ{lK#-$cztdC|L-k2VRu)zA1WWv{IXe3o8Zlki;nizns3
z^q1H2V#XV~zoyEE84sAgv@uxwXe&&5z+eqz+6n_Z(%5IF)GFt|8;bf<$0|q_DqGi=
z6#D$83}R-6H|Qrb*Fna1ld3sS*%ZI+WOWX)|9^_jy#l*j?EgKD-7B`g?D>BkHvSek
z8wOw=%!2P=>wg^H1_Q7Hc86Wz8L$h?#{NH+cqQ(Qa2hOy{b66&2lfVeU%*#L!+FpL
zOCf^Cu>bFeufYW%dj$5wI*@$=bFdQjg$MBsTnqBvK=Bov2<`9)K7zl)U*RFR2i^yH
zcquf)&Tub2gCE1yP=-a&0eix;VK>+no(cblKS7>FconRMeL>bF+zwxYx55DY8{97b
z1$ZA^1Pfsg_&dIa{|66%JeP1EoDD7T2tJ7a1$hoZ{-mV!pHuRjTEGgLa%{|7$K+Qu
zm@nsw10$ueLT)I(j_##Z*WuWqCL61scT@VNw1+;~s6xJ!M;U$dL%0-Jr0N4F6Cvxf
zl8>;P^r@SByVV{omCM0u)tX9Yxzt}7&Tq;MaZRp&G^e77=`dP_(ZN;{%2*o5o^Uc&
zJ~KUbe*xZKNhY)gqW_mHDRaK;^_F0}AfIFTwx-#vV&mqqE=*G9ub(?q8o)QSso6=T
zM;;qMgbcO1EYwOljs@c7TzRdRapFM^m5^jY<FBKRCn7i}j=`$h5FK(zEG=kSvD#w0
zC%Lij*}~MPsAlT4%4{-YfxLWdW#(UXpL%d82`^e7G#v7RxOLFLHnl}<lP|?zyW3Wh
z^^0^cIklK=4Ep?bY*M7aRh+mjoM=h}lrm2)9zQ9y9R#9ut-(1kg2m!N3o)zTbA6?X
zuiX65Lig<_lFE&9D%a7II>Q~5m#XBhE)bWR7FQRy*LQJ~l!sHTMVa9Z-n~3cV%O@+
zR%GlLwnYV86UjZC=BawLT9r=q7A^o!p7`!h<vOfsY@HBm!0w*ZpuzlzXms4%9`*7M
z;<T%jC`m1Y<bjT4;TnA)xIt)CXKbnIjr*XWeNPy=M%(-=y4h|ZwJDo6Cxe5NR7(3z
ziqe*k%>8FeRl8PA=}p{tCz7g_6M4alq@`S4B3W*h#8Vkb*%vnivR4TYMzmB(2g&g*
zinPUB6bGbPsA8$BLFvouZWP)sle=ushB5WR6oIHx48$mf8DET$C?&EPlv1*!lhd(D
zXLwa)>(~mcx00rU_(oP!naPSV%axW9KFE`<dBGr^%%tv&%C>Kjf(TlmJ?tEVrtd<5
zu^xKUZnBmzPC;n0)PK9>&??Mf2F)?4S`16U;1_m^bA6|XRJK#67zJa^HtH2s5pU&Q
zu{^GuNL2;(?IY+KWiYSVhud1I+rIt9bO)v7)R#sY9C>C~9_MM2Y6~J!6A2pRBCAy?
zJ)>qGf;wg<Y1D+xRC17E0!=OhV*ksU{||_*iT!`F7@YiFjO~9o><YJI?|&Aqfy?3L
zuov8ejei|n2xr1-I2azr?!OwY0@)YfNZ0`$!p^@PZiWj$_6uAO15kiI=!KtS`(FfS
zz+7mB4`SoL4CGmYKVjGZ5gve_!Rufx%!dO(Z2cd>Rq!GB09+2TufRC=dj;~aAN&>D
z{T6r+oCO0=fIhef`+Y6E9FBuOVz=K4pNF@=C2%o3gw1{moD4@B`~64A^M8yD|3N;L
ze6?sB{NL-^o2}k}f{-5?9n9;opJqgVVZG*TVA|3ohvcvJxEmik^L>4}G7lc-+3MF<
zX(k-?54MevTgHu&%^8z%_0Es-q*rEx<Xj>l&hemAWIKc~P4P49g4Ej3o9k|RlDLev
zHeuRKv#(B?IT5DJ9Fl~oF-RAkD{RWzmU~;Y&V0=*`G(ddOqdPEhw3bFFibu_v#LRT
zowQWC%vvBPW7WW$Il4q`we(iJEon2ZH;{;o(yoyPbwza&Op%6~g)UX!m_r@I==;i7
zu@=KJ(YR6W^$r8pjZGWJZZN(LIs@j6LZ`NF_Y*$c-(dexLkg*VgUWpR-n;b`i7bE0
ztQT@Jqh3g-r+k>MzX)qYN|&HqD-CoDBrBt)G?c2Kk_3q|M{Syd9wDvni1$Ug7WItD
z+6Z-3xiDw>ZA*|xi+1<9&STc(s+5j1=zEa#)~;p^iIBcWs#zdDj%d-P*Ssd(4CQ|{
zHpU#{_=`k0bWT8@Q5`yFIAu5mgBw&;zl00V-Yu9H(tdZPZQ5E~r=}b8ldoeVAE)Nh
zF>2M&O!j*Zp<S_9<tnf4OPNY;4JRT}gl8(QGFReL5)yc_vJ0C-)kMNJk(|0?wS5qH
zR_l<PNwyYSU|HHWtfgefb<<|0Rt8zBd*Z+~dZstx@2HR%leY>c2kdl+1k-=%x0iP<
z2L-4cx!!-IudLexMO&KeoJ5mJrB_c?lgjBCl}XM?_{eIyG`Y18S!+fjFx}nclW{w$
zR-;}sfb@b*cg(8+BzOKE&S#2tKo?)_IW@vaX^M1$rWv~J#w74m+M(@NeH-LRP+QV^
zuBmi9#I72TQxy`91D*M*E;gn2$>^w4`u`>!sr)aZigm+P!V3adr6O$#$M*IagMgmQ
zak;JO)rj@=%!&HGK2zc=^rGIXo?bowk7@4K|MU2``rCxvFZ=!958nrw|6c>$@NC!(
zHe>7G#l?5Rx8Y1U1`dU%!Bb!-I0KvhYHawc;1lp}I0p8Io#75_`cK0r;cD0iWDfrv
zI1^;v{s-_CxB%V=vhM%sAo~P74Iac^KN)0CzzFWbUjGF=gnhmX%!0pSo4*Kl2HErP
zI(R2+fR{i7$v>$KzD<VhUah#GS?WEZw65SoPP|mchs{_blfL4`EsaT2;mqt_U>eA)
zOH?NyHM-V|OH?<a`lht5DPFNwp15j7acJlyY_me>E3lJ`UVoq&%bI1M*DOz35F~4p
ztssaz*=u7p#z(HSp)jT%srDy$L@kySF<}ZK4%g0tUd0`YwY|iH9><K8MsYw|4Ir!K
zC4z-A3UpQ5B~&l^ck;5YAkzoNN6O687y49ZB*HEamWFuOq&};oi&Q34?qZ)9$TW0n
zXEj4DsxPNT9da4<r823Us_3dRYHs5583o3ZJ?d31lNfsd4M%DRnvOi_<}4*?|0&M3
zMp{dKB3cv8cH(gZijz(>JNf*bs|!{2U`>@|h}-5r!(r!JL>;x?3?>s)eiLfKx*t?k
zm)&@^pSWT-Sf@?WYFFvn>320c>|!#~KT&mUS)8b;woZmmYi#rSSv$X~dQMzaS#>X?
z+NJlo$vMT*(#Gt08#x(s1XU%{cC)w%bOD+Al=NrYa5!S4jP{V4vW#jpal%X;a_Q3b
zB$+G|N~J33(CC)}7cn<$wr=89?LVYVph5V*AhZpMA(wD9rgU`@LnGbY#Q8=kaCuI3
z9cKBH>nxQ5yT!4mac+1#+pipiiPxS)-)k#H=wk|NQQQ}9L)Sma5Tql^sTc%_Cp9^W
z`udc}ggFs<?R~A9Xwrck#zsS_4pX0PsykD<vLrIZ!7wxY%&Ic=^)S8p{{BL4V01hu
zvtYUut6e~`Q1NjsUL}en?1P*<)ZtfD)L%Vfx6G;_u|R3*GKC!^%N-$AK~=EssaW5_
zWp+TCLr>6Qe|Y!kR;UjX=D{qY5AxuSkO%WUUJGQ)HF-Q+pUvDDE!nC2iO|fN;DaPu
zWLW5STFBMi#(M{UpRBU?AyQMK%C;&YL5a)!|9p(zi^TS2{(rSGfWLzM|DSL^6hYnr
z*akboec1l87r-Y#o()(A^Pvr%1AoCU@I`nZjKE^(gx}%|_$k~5{{xr7CU`OI1-rrx
z_zFHN{sOoX{tG?|7vm>53%|f9tc5%93w#OQ3niEbhk~r@mv;nS3|l~a0`p-nxEFuG
zjqq+LK^yD>GOzz*xB;#Nnb$uDR>6TF{)0c`Gx!Q@fwiz2j)PX%4Zex|J`PHbNhd<R
zzx7}(t!a>#xw8?v*tyXWtkg3OL>)lu_dO~MYGI`?>h|W(_p+&KZj5#NrP$+cVOUHY
z^<)?mC$Tu9wkC;x<6sdpNT<NBthEV1rNHYq7*X#c_4X~Z<VRg<yN@|@<bJ+?h_+{n
zM@!{Oj$Qd=k-N%vY9^GR#?_uIdgoe|0kn}ofNq7PZq=d$xyW%gOWDk2xpqO_ZEni;
z^yZWc+pMl(BYWA{zC+7LVpYj>cMH6#QikEAn<_V%`Ai(wrfcG$E#zci8$acrJmoVg
zwF)P-_Qg<Rl(#GC<nnWju~#F^wwI;9g3jM$JG#^QQ|8dOpSNsTX?S$JQaFa93&-bF
ze_2pGfhmKEW6LvsH0kcK0lAgcL8OXp3%{FtE`MD?=3sTXvO{yecd(%Qpm<Vk&Pdvn
zoN%}wjX+nASCDI#zLpGETl98xj_t8v4;m@!Eb4=PTLPW^s$m|NOTB)HWJ=WlU9Es7
zgVoNE8RcX7F~?%rQO7cKW3JGd(H%VBtqvIXP>2<(qBf^n<FO)*)P11sb2~}5HKC>I
zZbh$ktlHF;9_=V5Iey2a7HmtAD1SnVw5H-_rl6gJy+-YJVPd*pb+2M;@wRrwn}(r$
z3f2C2b2e??I3?Kz8jH)gs-39~#RZu_Hl}`WYtiUNmLVB^GB#Ubl4S*xvV<V6Wa(yS
z^+v0v$eu<@KV&FhFWw%d-KemfNu<W=YgUVkl7G@7dh9Jkm2uJAOQYBwSc{Z(0Zm)}
zYN@*EdMKIjncLDNk)V>=h38ReWN5QGXfyFdmr`8fXn;sOq3kD)=(0+e?i95Auxd><
z8Mb66qq?iv(xD;s0wrC+^OdNFxyQp>Bc7_?xK>pQtKjPpPUZG0sY`-W+UrvAwPueE
zP*bzFch#SX7n5manED+m{61mZNFf*d|ApAE`(WdW{lCE2syAWhp9>q{#qd1%3AX(O
z@G95<c~}Jdf;<cGF1Qrd!F+fSd;Y^9cK)SsF<c05hTX8~--I3iMi__PLDuwt2mTu_
zgCTes><{0;uD=<c2ZPx41F!~UkH0U&)$kfP77l^u!b8~hAA^s=-Q-W@v0htzCft)+
z0_lF^#1uKt1CErjEQhmnG#8-RSqGsB!QedB8`wxzPwOkQqnlEF*-2M-2iCljqg8u-
zRIivQjaIPuvL~{#GNROuYJDm*A689?Wy2iYnz^vUIm12C_UPsCs4&JqVLQzh|GxX;
zT67-7+cQw?+hiRW^sYtgD^nf;!pw`XJ9vg%dpM%voH^$C84-0+0vPQ^emjqcinT4$
zay9`-8NucMIm%c!9l0ndBc+Ye#saE~M8d$-JR57irDsHsN|be9VH4Lj>-f668AsS}
zop;?!;t!7BMx{$G2-2m(O}ua?vmBY;jaNmCnAGx<o16M4Uv15k@21X8VRem49wTs}
zW_ywVlW0$VD9va;Mcw9X)5(}@OlvukqFOB{EyWYIoC(t#Pj0w`K(!H5Rq??UiUWfc
znt-Y#bL3;QZ5%F;l8Fwpnd&P+Z9`!}tG?8<5z)>hj?FY07q^yLWSWkgraG&SgH52I
zwz^iN>%DEXwmAnOZNpG8#n;-Y4pOH>-zm7v-oH@E4Gfjm@q7_WY}=LpfgVIED<P!8
zRWHJguXJ-YGFjnjHkfVJK)eG;T!~q&Bp#}^o5Kz{);AY_n!=t<aXs2oN|#L;!_+0)
z+zF{u5qC4$q0*d7cSqXf+kA^pa&di4J}%yjb{`K^y9`WMV3QIr0&_B@1Z)o?%hrMd
z2rt;?@>e%Y7r#E0E`5_mn`13}d>ne1w}XA$cBkPo-4et{Z)GEpsVwW5ptDu;Kh8Np
zxL{Vt^b!&qfAY!|>$A*^AnC?zQkSa#Y%R~ey9sPlN>;POM%^Z~m4LpHEqixi%=aQs
z21pf1KkKs_$Uq(#sY-b~Qo9f4_V&<7j%P~QP5$~yyp2j(?El@d^z!%s#Qwj?Sbeu*
z?|&EG4ljr0a14kaU|)C?oBv^uy#YQ4Z-o-9gcf)j{0bky-EbFN1gFEPAbSM701kpb
z;|ur-h;QH`*by$nH}G~i2l_#tA6O1cp&1T_J>W6c1l$kz!Y|=_@FlnsJ_hfE*MYoe
zpag5-IFNS@%!e1iv)~bY5B~?hfgghGHz>~|ybt~p-T<=K;7eg4JPRJiC-H0e9()Ym
z0k476VL7zH^WZ`J6?Yonh3rLe32cS}bVCd51%JSo@e}wOd<xzR7r;j7gH>=iJO}=X
zZ{zoH1Bl<_U2q<}99|4v@Jx_*B+4EJx4;!3`x2fDr@+zB3j4sH@QwT&Zi36<Lf8Pk
zZ~}D0Y}gxifIm=IZiG+6dtn?@{Ymo}%$jxDp&NQS*G8}Cj^tYhYNb2Y9i3RJtju+G
zH0waE(TV)n)Rmi;6Kze0>sTw()H;AY*6!k)=i=j7Jihj|L45OEeDhp<?Q-oo$_u?}
zjF*x2m7^n~MdAe2ckZGqB3ACsWW7r?Ki8Lg{C#!$IMfv;MThEihxA<ur38??=|X=~
zpEWP4T3K6|Dk_z=gy?;eDUP_PI=!eC=F&u^GEyVejTZ2N)Kq(Mc`5i**OJ15)KsSy
z@Pf(9dVXrECok*ysi{tt^?aY|MrGZVn(E2Rn&w$=W9ss$_Kk@L)Dw-#w;b1)V#(8u
z$+rjBm{JS4)8EurBY$zNYSfu3s>v5{XR4@D3phx1qXpcNn(CMYP48P@W9mpvb*ij8
zCNJyu)KpJi*6pdOPL(z8HQUEGD(meyGLb}4zx~j}GQcL=rw8MU9#&<TvcroUp_=O>
zN}_VawHj3m<o!8n(2|jPy&9u~b}18y?ij%klMj`=(zh*|uenkmQwcIx%8}%izO+@V
zRQJ@!xKbUGl*y6gmA)pEEptzOj4Rb4xl$cLj_1s7Lv-={iYHk2L&{gI?yZjzgE}NJ
zs3XLHyO=GOxD#8Bio3Xr4{;UGYMb3;;^o|?_M?w!howY~n`xH(75P5SkIs&q93b*8
zJZvhLvQ(qcKeoAR6DNBsV+1&DanqKj;s`G=EijjvJRLuDVzYY+Onh3VqO`^Xsc0$9
z#NqXY&ACdnzGZvfEAebuerRKUGaFT_%*w9HO|oLUuh7q$g|YG8N;KBq(U~6`%PaZB
zb2YBgkH@C&D;mB)2~O?FG;6F-86O*onm0vDme3%hL!!?1mX_#Ma^#33SU}#{fvUKw
z#heI=&=$N?blm}VSd5pRD+(KAmQ{}JfvEbL2}ODiwa+NkXm-@!_C-!BMs?&s$2=DT
zElLi|cOg*I<dS)<E(j`%99igspcBb~b{_=&fFscY7X$@_106vujEi!l(*<GRm22kv
zSQvyj((Z#W%E>hgeGqzl3Btn`CWj0u5+q2=JRM}73qt=bK^C;q%nHnrsY;fUZ#moC
z!~%)f20pD4ZgEXH+8_#lgnV5klyZt}VaelHO!<tLr5*EfVxC2a%BAtKUY_5zcY0Ad
z;*tEQI6SH!S(0_zBV4zxFi;#Bk#`uE`qkA#CH=<{c}hiIx=|X=4HQNSto5d6GCRCr
z(~gei2Zr;}>Jyf)TDCG;e%hMlt4=y@^{S|MD3>4Z%Z-ka#{TH=h$kc1>qb5Q|1<2*
zk7KKf{Xf^(aDTwYzY^XI6?hpO4=;t6z&Eh(m%>pn7k-B=e-((GFM9zV2hFe-+>cFv
z4ak0gmw?RwzX{$5V()(mTYdrj8C(2Ma0h%G<b41yhC@K?^j~6k-vAeZtQ`=${PnOS
zNcwZ|3-WRYd<$L=$3Z7N8~%vxd<*;!Tn0T5!M^Z_*vlaM0X_tOfxF>jAbSBWgdIWb
z>94>?;A~h8JHS`4i$4snhjBOn9>y-d3w{beg4^H+unu;IU-7)aC*eYnKke<4|J`h{
zag<K)wiYH4X}emOQ;AJ2>`}ErdHPIx@}ZmLo)#*K>WS!+{O(07vbMBHSu0n)3(m|w
z|7=k%TClBje!6ih?5Shcf=hQe(WW{<(C~49V7nph!Vb38vFn9x^5r&eqA)h)Cw8jM
zV9=_V+)mpcXFKi`LkiNG6NeK`Nsi3y^FOoCe`$neB~sC|9^~+Im%*MUQulqswmVAM
z2~yQO{dWBJu(S@dO*R{-`>pe{rbm-qdAJerq!Q3KRLZBMVN!9Kp(vIibIk0{S>N~+
zW=dY}ki9o-b;w}$S9Lp9(5^W3BiMS<FWeV&=7kIuTNSI`bW(?y>u3_eFk?%_t_;u{
zOWI#S1!7E@RGpzQWj;`x$W84R7uY@ho8m2Ed*>xXMVnk!0;MewwSUYAiRhu+PK|Rw
z{9CHI%)BVW^zwAwy4uG3a8Z=v>zXE4NUqukLyS8X^F(Bf358AByVaQ_5hSa>4Ek-i
zYeB7t>}n*UcHNy+8iE=edq7Si3p#bJkp|kV8WNN$TU(D!Ex;Nn4eLVo^Jp^v{{Xh}
zEyn(rcLB)X7Hs^3;6T_7zJ-l{5o`q62e1?5S^u5jr`Y)Khdy`(90g0@a5xMW!9rL7
zGVgyMHvbi{8Dzh}7lQZ!euEF-i|`g$20Mbx?cW5~LLR!|AlMJ~f#<?r@L_xaC&3}`
zXY!)*qq8#e|BM7?Brqd^841itU`7H@XbFfxr2l3-=ou>Rj0Zh@U9Ww%i%O$bIw#+o
zJoBd984tR%a09ECUQ`?r+8JRT7QFG*J1qDScUVZjq|Ni0cjY2F-l*+;ypQL1<w{Z;
j_MxY^4gL0y;tiOr6Hy$`I`Znt(S%cnpjn<2?|J_J-{&HZ

literal 0
HcmV?d00001

diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 9a299ced..df6e42ce 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -1329,6 +1329,116 @@ __kernel void softmax(__global T* prob_data, __global T* loss, __global T* label
 template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
 template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
 
+template <class T>
+__kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const T* data, __global T* out) {
+    int index = get_global_id(0);
+    if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T maxval = -FLT_MAX;
+    for (int c = 0; c < channels; ++c) {
+      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
+    }
+    out[index] = maxval;
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const float* data, __global float* out);
+template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const double* data, __global  double* out);
+
+template <class T>
+__kernel void kernel_channel_subtract(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_max, __global T* data) {
+    int index = get_global_id(0);
+    if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] -= channel_max[n * spatial_dim + s];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data);
+template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data);
+
+template <class T>
+__kernel void kernel_exp(const int count, __global const T* data, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+    out[index] = exp(data[index]);
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out);
+template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out);
+
+template <class T>
+__kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const T* data, __global T* channel_sum) {
+  int index = get_global_id(0);
+   if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T sum = 0;
+    for (int c = 0; c < channels; ++c) {
+      sum += data[(n * channels + c) * spatial_dim + s];
+    }
+    channel_sum[index] = sum;
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const float* data, __global float* channel_sum);
+template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const double* data, __global double* channel_sum);
+
+template <class T>
+__kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_sum, __global T* data) {
+    int index = get_global_id(0);
+   if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] /= channel_sum[n * spatial_dim + s];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const float* channel_sum, __global float* data);
+template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const double* channel_sum, __global double* data);
+
+template <class T>
+__kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const T* data_1, __global const T* data_2,
+    __global T* channel_dot) {
+    int index = get_global_id(0);
+    if(index < num * spatial_dim) {
+        int n = index / spatial_dim;
+        int s = index % spatial_dim;
+        T dot = 0;
+        for (int c = 0; c < channels; ++c) {
+            dot += (data_1[(n * channels + c) * spatial_dim + s]
+                 * data_2[(n * channels + c) * spatial_dim + s]);
+        }
+        channel_dot[index] = dot;
+    }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const float* data_1, __global const float* data_2,
+    __global float* channel_dot);
+template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const double* data_1, __global const double* data_2,
+    __global double* channel_dot);
+
+
 template <class T>
 __kernel void SoftmaxLossForwardGPU(const int nthreads,
           __global T* prob_data, __global T* label,__global T* loss,
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index b768f05f..fe3e4c25 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -103,7 +103,7 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bo
 #ifdef Track_data_transfer
 #endif
   
-  CHECK_BLOB_DATA(top[0], 20, "top[0]");  
+//  CHECK_BLOB_DATA(top[0], 20, "top[0]");  
 
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index aa2debdf..960073f2 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -96,9 +96,8 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     }
   }
 
- //Forward_cpu(bottom, top);
-   CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
-   CHECK_BLOB_DATA(top[0],20, "top[0]");
+  // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+  CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -142,7 +141,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
   CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
   CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
- // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
+  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 7799950e..d08805d1 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -13,12 +13,9 @@ namespace caffe {
 template <typename Dtype>
 void DropoutLayer<Dtype>::ocl_setup(int bottom_count){
     //create OpenCL related cl_mem objects and kernels
-    //if(Caffe::mode() == Caffe::GPU){
-    cl_int _err;
-    ocl_Kernel_Fwd = clCreateKernel(amdDevice.Program,"DropoutForwardfloat",&_err);
-    ocl_Kernel_Bwd = clCreateKernel(amdDevice.Program,"DropoutBackwardfloat",&_err);
-    rng_kernel = clCreateKernel(amdDevice.Program,"RNGBernoulliFloat",&_err);
-    OCL_CHECK(_err);
+    ocl_Kernel_Fwd = clCreateKernel(amdDevice.Program,"DropoutForwardfloat", NULL);
+    ocl_Kernel_Bwd = clCreateKernel(amdDevice.Program,"DropoutBackwardfloat", NULL);
+    rng_kernel = clCreateKernel(amdDevice.Program,"RNGBernoulliFloat", NULL);
     MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, bottom_count*sizeof(int), NULL, NULL);
 }
 
@@ -96,20 +93,28 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = top[0]->mutable_gpu_data();
   const int count = bottom[0]->count();
   if (this->phase_ == TRAIN) {
-    unsigned int* mask =
-        static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
+    //unsigned int* mask =
+      //  static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
+#ifdef use_cpu_generator_dropout 
+    unsigned int* mask_cpu =
+        static_cast<unsigned int*>(rand_vec_.mutable_cpu_data()); 
+    caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu);
+    OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
+    Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+#else
 //    caffe_gpu_rng_uniform(count, mask);
- 
      caffe_gpu_bernoulli(rng_kernel, (int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_);
-    Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
-
+     Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+#endif
     // set thresholds
     // NOLINT_NEXT_LINE(whitespace/operators)
 //    DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
   //      count, bottom_data, mask, uint_thres_, scale_, top_data);
    // CUDA_POST_KERNEL_CHECK;
   } else {
-    caffe_gpu_copy(count, bottom_data, top_data);
+    //caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data);
+     if(bottom_data != top_data)
+       OCL_CHECK( clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)bottom_data, (cl_mem)top_data, 0, 0, count*sizeof(Dtype), 0, NULL, NULL) );
   }
 }
 
@@ -122,8 +127,8 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* top_diff = top[0]->gpu_diff();
     Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
     if (this->phase_ == TRAIN) {
-      const unsigned int* mask =
-          static_cast<const unsigned int*>(rand_vec_.gpu_data());
+      //const unsigned int* mask =
+        //  static_cast<const unsigned int*>(rand_vec_.gpu_data());
       const int count = bottom[0]->count();
       // NOLINT_NEXT_LINE(whitespace/operators)
      // DropoutBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index a53002dd..3c94f0de 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -20,7 +20,7 @@ PoolingLayer<Dtype>::~PoolingLayer(){
   OCL_CHECK( clReleaseKernel(StoPoolForwardTrain_kernel) );
   OCL_CHECK( clReleaseKernel(StoPoolForwardTest_kernel) );
   OCL_CHECK( clReleaseKernel(MaxPoolBackward_kernel) );
-  OCL_CHECK( clReleaseKernel(AvePoolBackward_kernel) );
+  OCL_CHECK( clReleaseKernel(AvePoolBackward_kernel) );  
   OCL_CHECK( clReleaseKernel(StoPoolBackward_kernel) );
 }
 
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 973db6e7..07c2fcfc 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -24,6 +24,27 @@ void SoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   scale_.Reshape(scale_dims);
 }
 
+template <typename Dtype>
+void SoftmaxLayer<Dtype>::ocl_setup(){
+    cl_int err = 0;
+    channel_max_kernel  = clCreateKernel(amdDevice.Program, "kernel_channel_max_float", &err);
+    channel_subtract_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_subtract_float", &err);;
+    exp_kernel = clCreateKernel(amdDevice.Program, "kernel_exp_float", &err);;
+    channel_sum_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_sum_float", &err);;
+    channel_div_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_div_float", &err);;
+    channel_dot_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_dot_float", &err);;
+}
+
+template <typename Dtype>
+SoftmaxLayer<Dtype>::~SoftmaxLayer(){
+  clReleaseKernel(channel_max_kernel);
+  clReleaseKernel(channel_subtract_kernel);
+  clReleaseKernel(exp_kernel);
+  clReleaseKernel(channel_sum_kernel);
+  clReleaseKernel(channel_div_kernel);
+  clReleaseKernel(channel_dot_kernel);
+}
+
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -88,16 +109,60 @@ void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-    Forward_cpu(bottom, top);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  Dtype* scale_data = scale_.mutable_gpu_data();
+  int count = bottom[0]->count();
+  int channels = top[0]->shape(softmax_axis_);
+
+  caffe_gpu_copy(count, bottom_data, top_data);
+  // We need to subtract the max to avoid numerical issues, compute the exp,
+  // and then normalize.
+  // compute max
+  // NOLINT_NEXT_LINE(whitespace/operators)
+ 
+  kernel_channel_max<Dtype>(channel_max_kernel, outer_num_, channels, inner_num_, top_data,
+      scale_data);
+  // subtract
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_channel_subtract<Dtype>(channel_subtract_kernel, count, outer_num_, channels, inner_num_,
+      scale_data, top_data);
+  // exponentiate
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_exp<Dtype>(exp_kernel, count, top_data, top_data);
+  // sum after exp
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_channel_sum<Dtype>(channel_sum_kernel, outer_num_, channels, inner_num_, top_data,
+      scale_data);
+  // divide
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_channel_div<Dtype>(channel_div_kernel, count, outer_num_, channels, inner_num_,
+      scale_data, top_data);
 }
 
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-   Backward_cpu(top, propagate_down, bottom);
-}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* top_data = top[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  Dtype* scale_data = scale_.mutable_gpu_data();
+  int count = top[0]->count();
+  int channels = top[0]->shape(softmax_axis_);
+  caffe_gpu_copy(count, top_diff, bottom_diff);
+  // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
+  // NOLINT_NEXT_LINE(whitespace/operators)
+ 
+  kernel_channel_dot<Dtype>(channel_dot_kernel, outer_num_, channels, inner_num_,
+      top_diff, top_data, scale_data);
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_channel_subtract<Dtype>(channel_subtract_kernel, count, outer_num_, channels, inner_num_,
+      scale_data, bottom_diff);
+  // elementwise multiplication
+  caffe_gpu_mul<Dtype>(top[0]->count(), bottom_diff, top_data, bottom_diff);
 
+}
 
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 2d4b1da9..715297a6 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -27,13 +27,14 @@ void Solver<Dtype>::ocl_setup(){
    powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
 }
 
-/*
-template <typename Dtype>
-Solver<Dtype>::~Solver(){
+//template <typename Dtype>
+/*Solver<Dtype>::~Solver(){
     OCL_CHECK( clReleaseKernel(scalar_kernel) );
+    OCL_CHECK( clReleaseKernel(add_kernel) );
     OCL_CHECK( clReleaseKernel(div_kernel) );
     OCL_CHECK( clReleaseKernel(powx_kernel) );
-}*/
+}
+*/
 
 template <typename Dtype>
 Solver<Dtype>::Solver(const string& param_file)
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 501794dc..31384eb6 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -133,6 +133,125 @@ Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* p
 template float softmax_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss);
 template double softmax_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss);
 
+template <typename Dtype>
+void kernel_channel_max(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const Dtype* data, Dtype* out)
+{
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {num*spatial_dim};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_channel_max<float>(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const float* data, float* out);
+template void kernel_channel_max<double>(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const double* data, double* out);
+
+template <typename Dtype>
+void kernel_channel_subtract(cl_kernel Kernel, const int count,
+    const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_max, Dtype* data){
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&spatial_dim) );
+    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_max) );
+    OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) );
+
+    size_t Global_Work_Size[1] = {count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_channel_subtract<float>(cl_kernel Kernel, const int count,
+    const int num, const int channels,
+    const int spatial_dim, const float* channel_max, float* data);
+template void kernel_channel_subtract<double>(cl_kernel Kernel, const int count,
+    const int num, const int channels,
+    const int spatial_dim, const double* channel_max, double* data);
+
+template <typename Dtype>
+void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out)
+{
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_exp<float>(cl_kernel Kernel, const int count, const float* data, float* out);
+template void kernel_exp<double>(cl_kernel Kernel, const int count, const double* data, double* out);
+
+template <typename Dtype>
+void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const Dtype* data, Dtype* channel_sum)
+{
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) );
+
+    size_t Global_Work_Size[1] = {num*channels};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_channel_sum<float>(cl_kernel Kernel, const int num, const int channels, const int spatial_dim, const float* data, float* channel_sum);
+template void kernel_channel_sum<double>(cl_kernel Kernel, const int num, const int channels, const int spatial_dim, const double* data, double* channel_sum);
+
+template <typename Dtype>
+void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_sum, Dtype* data)
+{
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&spatial_dim) );
+    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) );
+    OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) );
+
+    size_t Global_Work_Size[1] = {count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template  void kernel_channel_div<float>(cl_kernel Kernel, const int count, const int num, const int channels,
+    const int spatial_dim, const float* channel_sum, float* data);
+template  void kernel_channel_div<double>(cl_kernel Kernel, const int count, const int num, const int channels,
+    const int spatial_dim, const double* channel_sum, double* data);
+
+template <typename Dtype>
+void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+    Dtype* channel_dot)
+{
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data_1) );
+    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&data_2) );
+    OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&channel_dot) );
+      
+    size_t Global_Work_Size[1] = {num*spatial_dim};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_channel_dot<float>(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const float* data_1, const float* data_2, float* channel_dot);
+template void kernel_channel_dot<double>(cl_kernel Kernel, const int num, const int channels,
+    const int spatial_dim, const double* data_1, const double* data_2, double* channel_dot);
+
 
 template <typename Dtype>
 void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads,
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 0b7523fc..e350866f 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -291,8 +291,9 @@ int time() {
 RegisterBrewFunction(time);
 
 int main(int argc, char** argv) {
+  FLAGS_log_dir = "./log/";
   // Print output to stderr (while still logging).
-  FLAGS_alsologtostderr = 1;
+  FLAGS_alsologtostderr = 0;
   // Usage message.
   gflags::SetUsageMessage("command line brew\n"
       "usage: caffe <command> <args>\n\n"

From 75668464c2765a6c927342154bd217d1a6cc2eca Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sat, 1 Aug 2015 00:21:29 +0800
Subject: [PATCH 014/124] cleanup the kernel interface of conv, relu and
 dropout

---
 include/caffe/util/im2col.hpp        | 14 +++++
 include/caffe/util/ocl_wrapper.hpp   |  8 +--
 include/caffe/vision_layers.hpp      | 18 +++---
 src/caffe/OCL_kernel.cl              | 91 ++++++++++++++++++++++++++-
 src/caffe/layers/base_conv_layer.cpp |  8 +--
 src/caffe/layers/dropout_layer.cpp   |  6 +-
 src/caffe/layers/relu_layer.cpp      |  4 +-
 src/caffe/util/im2col.cpp            | 93 ++++++++++++++++++++++++++++
 src/caffe/util/math_functions.cpp    |  7 ++-
 src/caffe/util/ocl_wrapper.cpp       | 24 +++----
 tools/caffe.cpp                      |  4 +-
 11 files changed, 237 insertions(+), 40 deletions(-)

diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index 066eb2fc..306a5d16 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -15,6 +15,20 @@ void col2im_cpu(const Dtype* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_im);
 
+template <typename Dtype>
+void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_im, const int img_offset);
+
+template <typename Dtype>
+void im2col_gpu(cl_kernel Kernel,  const Dtype* data_im, const int img_offset, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_col, const int col_offset);
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 5e86b1e2..35ad695e 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -68,19 +68,19 @@ template <typename Dtype>
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff);
 
 template <typename Dtype>
-void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope);
+void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope);
 
 template <typename Dtype>
-void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
+void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
 
 template <typename Dtype>
 void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y);
 
 template <typename Dtype>
-void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data);
+void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data);
 
 template <typename Dtype>
-void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff);
+void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff);
 
 template <typename Dtype>
 void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index b46130e8..4ccdeb80 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -91,16 +91,16 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   }
 #ifndef CPU_ONLY
   inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
-//    im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
-//        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
-      im2col_gpu(im2col_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, 
-                conv_in_width_, kernel_h_, pad_h_, stride_h_, col_buff, 0);
+     im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
+           kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, 0);
+   //   im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, 
+     //           conv_in_width_, kernel_h_, pad_h_, stride_h_, col_buff, 0);
   }
   inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
-   // col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
-   //     kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
-      col2im_gpu(col2im_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_,
-                 kernel_h_, pad_h_, stride_h_, data, bottom_offset_);
+    col2im_gpu(col2im_gpu_kernel, col_buff, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
+        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_);
+   //   col2im_gpu(col2im_gpu_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_,
+     //            kernel_h_, pad_h_, stride_h_, data, bottom_offset_);
   }
 #endif
 
@@ -119,7 +119,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
 
 //opencl related data structures
 protected:
-  cl_kernel im2col_kernel, col2im_kernel;
+  cl_kernel im2col_gpu_kernel, col2im_gpu_kernel;
   cl_kernel oclmem_kernel;
   cl_kernel ocl_Kernel_im2colfloat, ocl_Kernel_col2imfloat;
   cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform;
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index df6e42ce..07f8eb16 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -783,7 +783,7 @@ __kernel void im2col(const int n, __global T* data_im, const int img_offset, con
     }
 }
 
-template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); 
+template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); 
 template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); 
 
 template <class T>
@@ -823,6 +823,95 @@ template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt
 template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); 
 
 
+template <class T>
+__kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    __global T* data_col, const int col_offset) {
+    data_im = data_im + img_offset;
+    data_col = data_col + col_offset;     
+
+    int index = get_global_id(0);
+    if(index < n) {
+        int w_out = index % width_col;
+        int h_index = index / width_col;
+        int h_out = h_index % height_col;
+        int channel_in = h_index / height_col;
+        int channel_out = channel_in * kernel_h * kernel_w;
+        int h_in = h_out * stride_h - pad_h;
+        int w_in = w_out * stride_w - pad_w;
+        __global T* data_col_ptr = data_col;
+        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+        __global const T* data_im_ptr = data_im;
+        data_im_ptr += (channel_in * height + h_in) * width + w_in;
+        for (int i = 0; i < kernel_h; ++i) {
+            for (int j = 0; j < kernel_w; ++j) {
+                int h = h_in + i;
+                int w = w_in + j;
+                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+                            data_im_ptr[i * width + j] : 0;
+                data_col_ptr += height_col * width_col;
+        }
+    }
+  }
+}
+
+template __attribute__((mangled_name(im2col_gpu_float_kernel))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
+           const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+           const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+           const int height_col, const int width_col, __global float* data_col, const int col_offset);
+template __attribute__((mangled_name(im2col_gpu_double_kernel)))  void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
+           const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+           const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+           const int height_col, const int width_col, __global double* data_col, const int col_offset);
+
+template <class T>
+__kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset,
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    __global T* data_im, const int img_offset) {
+    data_col = data_col + col_offset;
+    data_im = data_im + img_offset;
+   int index = get_global_id(0);
+    if(index < n) {
+        T val = 0;
+        int w = index % width + pad_w;
+        int h = (index / width) % height + pad_h;
+        int c = index / (width * height);
+        // compute the start and end of the output
+        int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+        int w_col_end = min(w / stride_w + 1, width_col);
+        int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+        int h_col_end = min(h / stride_h + 1, height_col);
+        // equivalent implementation
+        int offset =
+            (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+        int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+        int coeff_w_col = (1 - stride_w * height_col * width_col);
+        for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+            for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+            }
+        }
+        data_im[index] = val;
+  }
+}
+
+template __attribute__((mangled_name(col2im_gpu_float_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
+    									const int height, const int width, const int channels,
+    									const int patch_h, const int patch_w,const int pad_h, const int pad_w,
+    									const int stride_h, const int stride_w,const int height_col, const int width_col,
+    									__global float* data_im, const int img_offset);
+template __attribute__((mangled_name(col2im_gpu_double_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
+                                         const int col_offset, const int height, const int width, const int channels,
+                                         const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+                                         const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
+
 template <class T>
 __kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){
     int index = get_global_id(0);
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 38d8952d..f321c9ff 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -32,8 +32,8 @@ void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size)
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::ocl_setup() {
-  im2col_kernel = clCreateKernel(amdDevice.Program,"im2colfloat", NULL);
-  col2im_kernel = clCreateKernel(amdDevice.Program,"col2imfloat", NULL);
+  im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL);
+  col2im_gpu_kernel = clCreateKernel(amdDevice.Program,"col2im_gpu_float_kernel", NULL);
   oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL);
   im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL);
   col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL);
@@ -53,8 +53,8 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
 
 template <typename Dtype>
  BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer(){
-  OCL_CHECK( clReleaseKernel(im2col_kernel) );
-  OCL_CHECK( clReleaseKernel(col2im_kernel) );
+  OCL_CHECK( clReleaseKernel(im2col_gpu_kernel) );
+  OCL_CHECK( clReleaseKernel(col2im_gpu_kernel) );
   OCL_CHECK( clReleaseKernel(oclmem_kernel) );
   OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) );
   OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) );
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index d08805d1..996098bc 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -100,11 +100,11 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
         static_cast<unsigned int*>(rand_vec_.mutable_cpu_data()); 
     caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu);
     OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
-    Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+    DropoutForward(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #else
 //    caffe_gpu_rng_uniform(count, mask);
      caffe_gpu_bernoulli(rng_kernel, (int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_);
-     Dropout_fp_gpu(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+     DropoutForward(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #endif
     // set thresholds
     // NOLINT_NEXT_LINE(whitespace/operators)
@@ -135,7 +135,7 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
        // CAFFE_CUDA_NUM_THREADS>>>(
          // count, top_diff, mask, uint_thres_, scale_, bottom_diff);
     //  CUDA_POST_KERNEL_CHECK;
-       Dropout_bp_gpu(ocl_Kernel_Bwd, count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff);
+       DropoutBackward(ocl_Kernel_Bwd, count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff);
     } else {
       caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
     }
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index d7b0a838..8690e938 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -67,7 +67,7 @@ void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   //     << " top_data: " << (unsigned long)top_data
   //     << " blocks: " << CAFFE_GET_BLOCKS(count)
   //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
- Relu_fp_gpu(ReLUForward_kernel,count,bottom_data,top_data,negative_slope);
+ ReLUForward(ReLUForward_kernel,count,bottom_data,top_data,negative_slope);
 }
 
 
@@ -85,7 +85,7 @@ void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 //    ReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
   //      count, top_diff, bottom_data, bottom_diff, negative_slope);
    // CUDA_POST_KERNEL_CHECK;
-   Relu_bp_gpu(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope);
+   ReLUBackward(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope);
   }
 }
 
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index ac44f425..0c48257d 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -121,6 +121,99 @@ template void col2im_gpu<double>(const double* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im);
 */
+
+//cannot use now, need to modify kernel.
+template <typename Dtype>
+void im2col_gpu(cl_kernel Kernel,  const Dtype* data_im, const int img_offset, const int channels, 
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_col, const int col_offset)
+{
+    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    int num_kernels = channels * height_col * width_col;
+
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&img_offset);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&kernel_h);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&kernel_w);
+
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad_h);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&pad_w);
+    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&stride_h);
+    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&stride_w);
+    ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&width_col);
+    ret|=clSetKernelArg(Kernel,13,sizeof(cl_mem),(void*)&data_col);
+    ret|=clSetKernelArg(Kernel,14,sizeof(cl_int),(void*)&col_offset);
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+    clFinish(amdDevice.CommandQueue);
+
+}
+
+template void im2col_gpu<float>(cl_kernel Kernel,  const float* data_im, const int img_offset, const int channels,       
+    				const int height, const int width, const int kernel_h, const int kernel_w,
+    				const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    				float* data_col, const int col_offset);
+template void im2col_gpu<double>(cl_kernel Kernel,  const double* data_im, const int img_offset, const int channels,       
+    				const int height, const int width, const int kernel_h, const int kernel_w,
+    				const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    				double* data_col, const int col_offset);
+
+//cannot use now, need to modify kernel
+template <typename Dtype>
+void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    Dtype* data_im, const int img_offset)
+{
+    int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+    int num_kernels = channels * height * width;
+    
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&patch_h);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&patch_w);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&pad_h);
+    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&pad_w);
+    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&stride_h);
+    ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&stride_w);
+    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&width_col);
+    ret|=clSetKernelArg(Kernel,14,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,15,sizeof(cl_int),(void*)&img_offset);
+
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
+
+template void col2im_gpu<float>(cl_kernel Kernel, const float* data_col, const int col_offset,
+    				const int height, const int width, const int channels,
+    				const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+    				const int stride_h, const int stride_w, float* data_im, const int img_offset);
+template void col2im_gpu<double>(cl_kernel Kernel, const double* data_col, const int col_offset,
+    				const int height, const int width, const int channels,
+    				const int patch_h, const int patch_w,
+    				const int pad_h, const int pad_w,const int stride_h, const int stride_w,
+    				double* data_im, const int img_offset);
+
 template <typename Dtype>
 void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 85af49d1..9ba72e41 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -240,13 +240,14 @@ void caffe_copy<double>(const int N, const double* X, double* Y) {
 
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
-  CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
-
+  if(X != Y)
+      CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 }
 
 template <>
 void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
-  CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+  if(X != Y)
+      CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 }
 
 template <>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 31384eb6..7b57d329 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -633,7 +633,7 @@ template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const fl
 template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff);
 
 template <typename Dtype> 
-void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
+void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
@@ -645,11 +645,11 @@ void Relu_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dt
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void Relu_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data, float negative_slope);
-template void Relu_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data, double negative_slope);
+template void ReLUForward<float>(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data, float negative_slope);
+template void ReLUForward<double>(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data, double negative_slope);
 
 template <typename Dtype> 
-void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){
+void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
@@ -662,8 +662,8 @@ void Relu_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
-template void Relu_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
-template void Relu_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
+template void ReLUBackward<float>(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
+template void ReLUBackward<double>(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
 template <typename Dtype>
 void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels,
     const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {
@@ -878,7 +878,7 @@ template void caffe_gpu_powx<float> (cl_kernel Kernel, const int n, const float*
 template void caffe_gpu_powx<double> (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y);
 
 template <typename Dtype>
-void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
+void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
 {
     cl_int ret;
     ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
@@ -893,11 +893,11 @@ void Dropout_fp_gpu(cl_kernel kernel, const int count, const Dtype* bottom_data,
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void Dropout_fp_gpu<float>(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
-template void Dropout_fp_gpu<double>(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
+template void DropoutForward<float>(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
+template void DropoutForward<double>(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
 
 template <typename Dtype>
-void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
+void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
 {
     cl_int ret;
     ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
@@ -912,7 +912,7 @@ void Dropout_bp_gpu(cl_kernel kernel, const int count, const Dtype* top_diff, co
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
-template void Dropout_bp_gpu<float>(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
-template void Dropout_bp_gpu<double>(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
+template void DropoutBackward<float>(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
+template void DropoutBackward<double>(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
 }  // namespace caffe
 
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index e350866f..df3b390a 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -291,9 +291,9 @@ int time() {
 RegisterBrewFunction(time);
 
 int main(int argc, char** argv) {
-  FLAGS_log_dir = "./log/";
+ // FLAGS_log_dir = "./log/";
   // Print output to stderr (while still logging).
-  FLAGS_alsologtostderr = 0;
+  FLAGS_alsologtostderr = 1;
   // Usage message.
   gflags::SetUsageMessage("command line brew\n"
       "usage: caffe <command> <args>\n\n"

From 77c7cb9a9625f72594ca972d44d180cdc4dff249 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Sun, 2 Aug 2015 11:58:06 +0800
Subject: [PATCH 015/124] minor fix

---
 src/caffe/layers/base_conv_layer.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index f321c9ff..729fafb2 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -330,11 +330,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
     col_buff = input;
   }
   for (int g = 0; g < group_; ++g) {
-   /* caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_ / group_,
-        conv_out_spatial_dim_, conv_out_channels_ / group_,
-        (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
-        (Dtype)0., col_buff + col_offset_ * g);
-  */
         caffe_gpu_gemmex<Dtype>(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
           (Dtype)1., weights,  weight_offset_ * g,
           output, top_offset_+output_offset_ * g,

From 102649bc589eee4ab050fb00cdd5eaffdbcdee8e Mon Sep 17 00:00:00 2001
From: junli <junli.gu@amd.com>
Date: Sun, 2 Aug 2015 12:15:41 +0800
Subject: [PATCH 016/124] Ported optimized scheme for conv layer

---
 include/caffe/common.hpp             |   6 +-
 include/caffe/util/im2col.hpp        |   4 +-
 include/caffe/vision_layers.hpp      |  18 ++-
 src/caffe/OCL_kernel.cl              |   8 +-
 src/caffe/layers/base_conv_layer.cpp | 184 ++++++++++++++++++++++++++-
 src/caffe/layers/conv_layer.cpp      |  54 +++++++-
 src/caffe/net.cpp                    |  16 +++
 src/caffe/util/im2col.cpp            |  71 +++++------
 tools/caffe.cpp                      |   4 +-
 9 files changed, 308 insertions(+), 57 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 07d26556..070513b5 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -22,6 +22,8 @@
 #include "caffe/device.hpp"
 #include "caffe/util/device_alternate.hpp"
 #include "caffe/util/ocl_wrapper.hpp"
+#include "caffe/util/ocl_util.hpp"
+#include "caffe/util/im2col.hpp"
 
 // gflags 2.1 issue: namespace google was changed to gflags without warning.
 // Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version
@@ -79,10 +81,10 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-//#define use_packing_scheme 1
+#define use_packing_scheme 1
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
-//#define global_packing_N 16
+#define global_packing_N 16
 /*ifdef: use multi-command queues for groups in conv layer;
  ifndef: use single commane queue for groups*/
 //#define multiQ
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index 306a5d16..862a539b 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -55,7 +55,7 @@ void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
 template <typename Dtype>
 void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, const int col_offset, const int optnum);
+    const int stride, Dtype* data_col, const int col_offset, int optnum);
 
 template <typename Dtype>
 void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
@@ -65,7 +65,7 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, c
 template <typename Dtype>
 void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im, const int img_offset, const int optnum);
+    const int stride, Dtype* data_im, const int img_offset, int optnum);
 
 template <typename Dtype>
 void col2im_gpu_ocl(cl_mem data_col, const int channels,
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 4ccdeb80..2d8f6390 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -113,13 +113,18 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   int weight_offset_;
   int col_offset_;
   int output_offset_;
+  int M_, N_, K_;
 
   Blob<Dtype> col_buffer_;
   Blob<Dtype> bias_multiplier_;
 
 //opencl related data structures
 protected:
-  cl_kernel im2col_gpu_kernel, col2im_gpu_kernel;
+  void forward_gpu_opt(const vector<Blob<Dtype>*>& bottom, const Dtype* weight, 
+      const vector<Blob<Dtype>*>& top,  bool skip_im2col = false) ;
+  void backward_gpu_opt(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  cl_kernel im2col_kernel, col2im_kernel;
   cl_kernel oclmem_kernel;
   cl_kernel ocl_Kernel_im2colfloat, ocl_Kernel_col2imfloat;
   cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform;
@@ -184,7 +189,7 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
 
   virtual inline const char* type() const { return "Convolution"; }
 
- protected:
+protected:
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -195,6 +200,15 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual inline bool reverse_dimensions() { return false; }
   virtual void compute_output_shape();
+  
+  virtual void Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 07f8eb16..48076725 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -1310,17 +1310,17 @@ void StoPoolBackward(const int nthreads,
 	  }
 }
 template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel  void StoPoolBackward<float>(const int nthreads,
-    __global float* rand_idx, __global float* const top_diff,
+    __global float* rand_idx, __global float* top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global float* const bottom_diff);
+    const int stride_w, __global float* bottom_diff);
 template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward<double>(const int nthreads,
-    __global double* rand_idx, __global double* const top_diff,
+    __global double* rand_idx, __global double* top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global double* const bottom_diff);
+    const int stride_w, __global double* bottom_diff);
 
 template <class T>
 __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 729fafb2..58eb0e1f 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -5,6 +5,7 @@
 #include "caffe/util/im2col.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/vision_layers.hpp"
+#include "caffe/common.hpp"
 
 namespace caffe {
 
@@ -38,15 +39,18 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
   im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL);
   col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL);
   opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL);
-  ocl_Kernel_im2colfloat = clCreateKernel(amdDevice.Program,"im2colfloat_yuan",NULL);
-  ocl_Kernel_col2imfloat = clCreateKernel(amdDevice.Program,"col2imfloat_yuan",NULL);
   ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL);
   ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL);
 
+  M_ = conv_out_channels_ / group_;
+  K_ = kernel_dim_ / group_;
+  N_ =  conv_out_spatial_dim_;
+
 #ifdef use_packing_scheme
   size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
   size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
   Alloc_public_tmp_mem<Dtype>(subtop_size, trans_size);
+  //printf("K_ =%d, N_=%d M_=%d, group_=%d, trans_size = %d, subtop_size=%d \n", K_, N_, M_, group_, trans_size, subtop_size);
 #endif
 }
 
@@ -58,8 +62,6 @@ template <typename Dtype>
   OCL_CHECK( clReleaseKernel(oclmem_kernel) );
   OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) );
   OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) );
-  OCL_CHECK( clReleaseKernel(ocl_Kernel_im2colfloat) );
-  OCL_CHECK( clReleaseKernel(ocl_Kernel_col2imfloat) );
   OCL_CHECK( clReleaseKernel(im2col_opt_kernel) );
   OCL_CHECK( clReleaseKernel(col2im_opt_kernel) );
 }
@@ -126,8 +128,6 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     conv_in_channels_ = channels_;
   }
 
-  //initializa OpenCL kernels and cl_mem objects
-    ocl_setup();
 
   // Handle the parameters: weights and biases.
   // - blobs_[0] holds the filter weights
@@ -214,6 +214,8 @@ void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     caffe_set(bias_multiplier_.count(), Dtype(1),
         bias_multiplier_.mutable_cpu_data());
   }
+  //initializa OpenCL kernels and cl_mem objects
+    ocl_setup();
 }
 
 template <typename Dtype>
@@ -371,6 +373,176 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
           bias, (size_t)0, 1);
 }
 
+
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bottom, const Dtype* weight, const vector<Blob<Dtype>*>& top, bool skip_im2col){
+
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+     //CHECK_BLOB_DATA(bottom[i],10,"bottom");
+    Dtype* top_data = top[i]->mutable_gpu_data();
+
+  Dtype* col_data = col_buffer_.mutable_gpu_data();
+  /*in the packing schme, M, K stay the same. N multiplies by opt_num becomes much bigger N'. 
+   N' is the M in sgemm call.*/
+  int M_org = M_ * group_;
+  int col_offset = K_ * N_;
+  int top_offset = M_ * N_;
+  int weight_offset = M_ * K_;
+  int opt_num2 = global_packing_N;
+  cl_command_queue Queue;
+  cl_event prof_event;
+  //LOG(INFO) << "conv_fp optimized scheme";
+  for (int n = 0; n < num_; n += opt_num2) {
+    opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
+    /*col_offset is the offset for sgemm, including packing and groups
+    for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/
+    top_offset = M_ * N_ * opt_num2;
+    col_offset = K_ * N_ * opt_num2;
+    //step1: packed im2col, col_size = (K_ * group_ ) * N_
+    //this should be opt_num2 images packing together.
+    im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
+                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
+
+    //step 2: sgemm: Top (subTopMem) = weight * col_data
+#ifdef multiQ
+    for (int g = 0; g < group_; ++g) {
+       if(g == 0) Queue = amdDevice.CommandQueue;
+       else Queue =  amdDevice.CommandQueue_helper;
+       prof_event = caffe_gpu_gemmex<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+          (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
+          (Dtype)0., (Dtype*)subTopMem, top_offset * g);
+       }
+   //sync two command queues
+     if(group_ == 2){
+       clFinish(amdDevice.CommandQueue);
+       clFinish(amdDevice.CommandQueue_helper);
+     }
+#else
+    Queue = amdDevice.CommandQueue;
+    //printf("M_=%d, N_=%d, K_=%d, opt_num2=%d, col_offset=%d, top_offset=%d, weight_offset=%d \n", M_, N_, K_, opt_num2, col_offset, top_offset, weight_offset);
+    for (int g = 0; g < group_; ++g) {
+       prof_event = caffe_gpu_gemmex<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+          (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
+          (Dtype)0., (Dtype*)subTopMem, top_offset * g);
+       }
+#endif
+    //step 3: tranform
+    transform_gpu(ocl_Kernel_transform, (Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2);
+    //step 4: add bias
+    /*note: this sgemm has to use num_output_ instead of M, because M = M /group, in setup*/
+
+   for (int z = 0; z < opt_num2; z++)
+      if (bias_term_) {
+      caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
+          N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), 0,
+          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
+          (Dtype)1., top_data, top[i]->offset(n) + num_output_ * N_ * z);
+    }
+  }
+}
+}
+
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+      ocl_memset(oclmem_kernel, bias_diff, (Dtype)(0.), this->blobs_[1]->count());
+    for (int n = 0; n < num_; ++n) {
+      caffe_gpu_gemvv<Dtype>(CblasNoTrans, M_, N_,
+          (Dtype)1., top_diff, top[i]->offset(n), N_,
+          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
+          bias_diff, (size_t)0, 1);
+     }
+   }
+
+ if (this->param_propagate_down_[0] || propagate_down[i]) {
+  const Dtype* bottom_data = bottom[i]->gpu_data();
+  Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+  Dtype* col_data = col_buffer_.mutable_gpu_data();
+  Dtype* col_diff = col_buffer_.mutable_gpu_diff();
+  int col_offset = K_ * N_;
+  int top_offset = M_ * N_;
+  int weight_offset = M_ * K_;
+  int opt_num2 = global_packing_N;
+  int g = 0;
+  cl_command_queue Queue;
+  cl_event prof_event;
+  //LOG(INFO) << "conv_bp optimized scheme";
+
+  for (int n = 0; n < num_; n += opt_num2) {
+    opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
+    /*col_offset is the offset for sgemm, including packing and groups
+    for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/
+    top_offset = M_ * (N_ * opt_num2);
+    col_offset = K_ * (N_ * opt_num2);
+    //step1: packed im2col, col_size = (K_ * group_ ) * N_
+    //this should be opt_num2 images packing together.
+    im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
+                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
+
+    //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize
+    int height_top = M_ * group_, width_top = N_;
+    //if (opt_num2 >1)
+    opttrans(opttrans_kernel, top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
+
+    //step 3: sgemm: Top (subTopMem) = weight * col_data
+    for(g = 0; g < group_; ++g) {
+#ifdef multiQ
+       if(g == 0) Queue = amdDevice.CommandQueue;
+       else Queue =  amdDevice.CommandQueue_helper;
+#else
+       Queue =  amdDevice.CommandQueue;
+#endif
+       prof_event = caffe_gpu_gemmex<Dtype>(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
+        (Dtype)1., (Dtype*)subTopMem, top_offset * g,
+        (Dtype*)transMem, col_offset * g, (Dtype)1.,
+        (Dtype*)weight_diff, weight_offset * g);
+    }
+
+   //step4:
+   if (propagate_down[i]) {
+      for (g = 0; g < group_; ++g) {
+#ifdef multiQ
+       if(g == 0) Queue = amdDevice.CommandQueue;
+       else Queue =  amdDevice.CommandQueue_helper;
+#else
+       Queue =  amdDevice.CommandQueue;
+#endif
+       prof_event =  caffe_gpu_gemmex<Dtype>(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_,
+          (Dtype)1., weight,  weight_offset * g,
+          (Dtype*)subTopMem, top_offset * g,
+          (Dtype)0., (Dtype*)transMem, col_offset * g);
+      }
+    }
+
+#ifdef multiQ
+   if(group_ ==2){
+      clFinish(amdDevice.CommandQueue);
+      clFinish(amdDevice.CommandQueue_helper);
+    }
+#endif
+
+    //step5: col2im
+       col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
+                  stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2);
+#ifdef Track_layer
+    LOG(WARNING) << "conv bp done";
+#endif
+
+   }
+  }
+ }
+}
+
 #endif  // !CPU_ONLY
 
 INSTANTIATE_CLASS(BaseConvolutionLayer);
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 960073f2..8a6a3743 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -75,6 +75,52 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const  vector<Blob<Dtype>*>& top) {
+  if (use_packing_scheme && global_packing_N >1)
+   Forward_gpu_opt(bottom, top);
+  else
+   Forward_gpu_org(bottom, top);
+}
+
+template <typename Dtype>
+void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    if (use_packing_scheme && global_packing_N >1)
+      Backward_gpu_opt(top, propagate_down, bottom);
+    else
+      Backward_gpu_org(top, propagate_down, bottom);
+}
+
+template <typename Dtype>
+void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  this->forward_gpu_opt(bottom, weight, top);
+
+/*
+#ifdef check_gradient
+   const Dtype *cpu_bottom_data = bottom[0]->cpu_data();   Dtype *cpu_top_data = (Dtype*)(*top)[0]->cpu_data();
+
+   printf("\n\nbottom data GPU:\n");
+   for(int i=0; i<channels_*height_*width_; i+=1000){
+       printf("%f,",cpu_bottom_data[i]);
+       if(i%16==15) printf("\n");
+   }
+  printf("\n\ntop data GPU:\n");
+   for(int i=0; i<M_org*N_*num_; i+=100000){
+       printf("%f,",cpu_top_data[i]);
+      if(i%16==15) printf("\n");
+   }
+  printf("\n\n");#endif
+*/
+#ifdef Track_layer
+  LOG(WARNING) << "conv fp done";
+#endif
+
+}
+
+template <typename Dtype>
+void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   for (int i = 0; i < bottom.size(); ++i) {
@@ -99,9 +145,13 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
   CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
-
 template <typename Dtype>
-void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+void ConvolutionLayer<Dtype>::Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+      //this->backward_gpu_opt(top, propagate_down, bottom);
+}
+template <typename Dtype>
+void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 4de7a146..9869b33f 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -13,6 +13,7 @@
 #include "caffe/util/io.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/util/upgrade_proto.hpp"
+#include "caffe/util/benchmark.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
 
@@ -503,6 +504,10 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
       InputDebugInfo(i);
     }
   }
+
+  CPUTimer forward_timer;
+  forward_timer.Start();
+
   for (int i = start; i <= end; ++i) {
     // LOG(ERROR) << "Forwarding " << layer_names_[i];
 //Yibing add for porting
@@ -513,6 +518,10 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
 //Yibing add for porting
     clFinish(amdDevice.CommandQueue);
   }
+
+  forward_timer.Stop();
+  printf("Forward time: %f\n\n", forward_timer.MilliSeconds());
+
   return loss;
 }
 
@@ -571,6 +580,10 @@ template <typename Dtype>
 void Net<Dtype>::BackwardFromTo(int start, int end) {
   CHECK_GE(end, 0);
   CHECK_LT(start, layers_.size());
+  
+  CPUTimer backward_timer;
+  backward_timer.Start();
+
   for (int i = start; i >= end; --i) {
     if (layer_need_backward_[i]) {
 //Yibing add for porting
@@ -582,6 +595,9 @@ void Net<Dtype>::BackwardFromTo(int start, int end) {
     clFinish(amdDevice.CommandQueue);
     }
   }
+
+  backward_timer.Stop();
+  printf("Backward time: %f\n\n", backward_timer.MilliSeconds());
 }
 
 template <typename Dtype>
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 0c48257d..b9257675 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -82,45 +82,42 @@ template void col2im_cpu<double>(const double* data_col, const int channels,
     const int stride_w, double* data_im);
 
 
-/*
 template <typename Dtype>
-void im2col_gpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    Dtype* data_col) {
-   
-}
+void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_im, const int img_offset, int optnum){
+    int height_col = (height + 2 * pad - ksize) / stride + 1;
+    int width_col = (width + 2 * pad - ksize) / stride + 1;
+    int num_kernels = channels * height * width;
 
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride);
+    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col);
+    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col);
+    ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset);
+    ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum);
+    OCL_CHECK(ret);
 
-// Explicit instantiation
-template void im2col_gpu<float>(const float* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_col);
-template void im2col_gpu<double>(const double* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    double* data_col);
-*/
-/*
-template <typename Dtype>
-void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_im) {
+    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
 
-// Explicit instantiation
-template void col2im_gpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, float* data_im);
-template void col2im_gpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, double* data_im);
-*/
+template void col2im_gpu_opt<float>(cl_kernel kernel, const float* data_col, const int col_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, float* data_im, const int img_offset, int optnum);
+template void col2im_gpu_opt<double>(cl_kernel kernel, const double* data_col, const int col_offset, const int channels,
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, double* data_im, const int img_offset, int optnum);
 
 //cannot use now, need to modify kernel.
 template <typename Dtype>
@@ -290,7 +287,7 @@ template void im2col_16_gpu<double>(cl_kernel Kernel, const double* data_im, con
 template <typename Dtype>
 void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, const int col_offset, const int optnum) {
+    const int stride, Dtype* data_col, const int col_offset, int optnum) {
 
     int height_col = (height + 2 * pad - ksize) / stride + 1;
     int width_col = (width + 2 * pad - ksize) / stride + 1;
@@ -320,10 +317,10 @@ void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset
 
 template void im2col_opt_gpu<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col, const int col_offset, const int optnum);
+    const int stride, float* data_col, const int col_offset, int optnum);
 template void im2col_opt_gpu<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col, const int col_offset, const int optnum);
+    const int stride, double* data_col, const int col_offset,  int optnum);
 
 template <typename Dtype>
 void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index df3b390a..e350866f 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -291,9 +291,9 @@ int time() {
 RegisterBrewFunction(time);
 
 int main(int argc, char** argv) {
- // FLAGS_log_dir = "./log/";
+  FLAGS_log_dir = "./log/";
   // Print output to stderr (while still logging).
-  FLAGS_alsologtostderr = 1;
+  FLAGS_alsologtostderr = 0;
   // Usage message.
   gflags::SetUsageMessage("command line brew\n"
       "usage: caffe <command> <args>\n\n"

From b09b8a4af959f3f1d8a0619bb2d8d3392fe72467 Mon Sep 17 00:00:00 2001
From: junli <junli.gu@amd.com>
Date: Sun, 2 Aug 2015 12:32:05 +0800
Subject: [PATCH 017/124] added Makefile.config. As needed by fresh git clone;
 then make all

---
 Makefile.config | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 Makefile.config

diff --git a/Makefile.config b/Makefile.config
new file mode 100644
index 00000000..2d8124d6
--- /dev/null
+++ b/Makefile.config
@@ -0,0 +1,93 @@
+## Refer to http://caffe.berkeleyvision.org/installation.html
+# Contributions simplifying and improving our build system are welcome!
+
+# cuDNN acceleration switch (uncomment to build with cuDNN).
+# USE_CUDNN := 1
+
+# CPU-only switch (uncomment to build without GPU support).
+# CPU_ONLY := 1
+
+# To customize your choice of compiler, uncomment and set the following.
+# N.B. the default for Linux is g++ and the default for OSX is clang++
+# CUSTOM_CXX := g++
+
+# CUDA directory contains bin/ and lib/ directories that we need.
+CUDA_DIR := /usr/local/cuda
+# On Ubuntu 14.04, if cuda tools are installed via
+# "sudo apt-get install nvidia-cuda-toolkit" then use this instead:
+# CUDA_DIR := /usr
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
+		-gencode arch=compute_20,code=sm_21 \
+		-gencode arch=compute_30,code=sm_30 \
+		-gencode arch=compute_35,code=sm_35 \
+		-gencode arch=compute_50,code=sm_50 \
+		-gencode arch=compute_50,code=compute_50
+
+# BLAS choice:
+# atlas for ATLAS (default)
+# mkl for MKL
+# open for OpenBlas
+BLAS := atlas
+# Custom (MKL/ATLAS/OpenBLAS) include and lib directories.
+# Leave commented to accept the defaults for your choice of BLAS
+# (which should work)!
+# BLAS_INCLUDE := /path/to/your/blas
+# BLAS_LIB := /path/to/your/blas
+
+# Homebrew puts openblas in a directory that is not on the standard search path
+# BLAS_INCLUDE := $(shell brew --prefix openblas)/include
+# BLAS_LIB := $(shell brew --prefix openblas)/lib
+
+# This is required only if you will compile the matlab interface.
+# MATLAB directory should contain the mex binary in /bin.
+# MATLAB_DIR := /usr/local
+# MATLAB_DIR := /Applications/MATLAB_R2012b.app
+
+# NOTE: this is required only if you will compile the python interface.
+# We need to be able to find Python.h and numpy/arrayobject.h.
+PYTHON_INCLUDE := /usr/include/python2.7 \
+		/usr/lib/python2.7/dist-packages/numpy/core/include
+# Anaconda Python distribution is quite popular. Include path:
+# Verify anaconda location, sometimes it's in root.
+# ANACONDA_HOME := $(HOME)/anaconda
+# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \
+		# $(ANACONDA_HOME)/include/python2.7 \
+		# $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \
+
+# We need to be able to find libpythonX.X.so or .dylib.
+PYTHON_LIB := /usr/lib
+# PYTHON_LIB := $(ANACONDA_HOME)/lib
+
+# Homebrew installs numpy in a non standard path (keg only)
+# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include
+# PYTHON_LIB += $(shell brew --prefix numpy)/lib
+
+# Uncomment to support layers written in Python (will link against Python libs)
+# WITH_PYTHON_LAYER := 1
+
+# Whatever else you find you need goes here.
+INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include
+LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
+
+# If Homebrew is installed at a non standard location (for example your home directory) and you use it for general dependencies
+# INCLUDE_DIRS += $(shell brew --prefix)/include
+# LIBRARY_DIRS += $(shell brew --prefix)/lib
+
+# Uncomment to use `pkg-config` to specify OpenCV library paths.
+# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.)
+# USE_PKG_CONFIG := 1
+
+BUILD_DIR := build
+DISTRIBUTE_DIR := distribute
+
+# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171
+ DEBUG := 1
+
+# The ID of the GPU that 'make runtest' will use to run unit tests.
+TEST_GPUID := 0
+
+# enable pretty build (comment to see full commands)
+Q ?= @

From 270a7d9b43529fce9bac2df00a535282a8167dc4 Mon Sep 17 00:00:00 2001
From: junli <junli.gu@amd.com>
Date: Sun, 2 Aug 2015 13:31:48 +0800
Subject: [PATCH 018/124] fix some merge bugs; add ./models into git

---
 include/caffe/vision_layers.hpp      | 5 ++---
 src/caffe/layers/base_conv_layer.cpp | 3 ---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 2d8f6390..a9c644c2 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -124,11 +124,10 @@ class BaseConvolutionLayer : public Layer<Dtype> {
       const vector<Blob<Dtype>*>& top,  bool skip_im2col = false) ;
   void backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  cl_kernel im2col_kernel, col2im_kernel;
+  cl_kernel im2col_gpu_kernel, col2im_gpu_kernel;
+  cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel;
   cl_kernel oclmem_kernel;
-  cl_kernel ocl_Kernel_im2colfloat, ocl_Kernel_col2imfloat;
   cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform;
-  cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel;
 public:
   static cl_mem subTopMem, transMem;
   static size_t subtop_mem_size, trans_mem_size;
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 58eb0e1f..5384c5e8 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -50,7 +50,6 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
   size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
   size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
   Alloc_public_tmp_mem<Dtype>(subtop_size, trans_size);
-  //printf("K_ =%d, N_=%d M_=%d, group_=%d, trans_size = %d, subtop_size=%d \n", K_, N_, M_, group_, trans_size, subtop_size);
 #endif
 }
 
@@ -413,14 +412,12 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
           (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
           (Dtype)0., (Dtype*)subTopMem, top_offset * g);
        }
-   //sync two command queues
      if(group_ == 2){
        clFinish(amdDevice.CommandQueue);
        clFinish(amdDevice.CommandQueue_helper);
      }
 #else
     Queue = amdDevice.CommandQueue;
-    //printf("M_=%d, N_=%d, K_=%d, opt_num2=%d, col_offset=%d, top_offset=%d, weight_offset=%d \n", M_, N_, K_, opt_num2, col_offset, top_offset, weight_offset);
     for (int g = 0; g < group_; ++g) {
        prof_event = caffe_gpu_gemmex<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,

From 3e3fb86391ca3dd104b23b5b3b1caf1388633f85 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Tue, 4 Aug 2015 01:03:12 +0800
Subject: [PATCH 019/124] cleaned up sgemm_ex interfaces; re-organized the relu
 layer kernel and wrappers

---
 include/caffe/device.hpp                 |  5 +++-
 include/caffe/neuron_layers.hpp          | 13 ++++++++--
 include/caffe/util/math_functions.hpp    |  4 ++--
 src/caffe/device.cpp                     | 30 +++++++++++++++++++++++-
 src/caffe/layers/base_conv_layer.cpp     | 18 +++++++-------
 src/caffe/layers/conv_layer.cpp          |  2 +-
 src/caffe/layers/inner_product_layer.cpp |  8 +++----
 src/caffe/layers/relu_layer.cl           | 22 +++++++++++++++++
 src/caffe/layers/relu_layer.cpp          |  9 +++----
 src/caffe/util/math_functions.cpp        | 10 ++++----
 10 files changed, 93 insertions(+), 28 deletions(-)
 create mode 100644 src/caffe/layers/relu_layer.cl

diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 07e65848..7360dacd 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -20,9 +20,10 @@ class Device{
     cl_command_queue CommandQueue;
     cl_command_queue CommandQueue_helper;
     cl_program Program; 
+    cl_device_id * pDevices;
     clblasOrder col;
     clblasOrder row;
-
+    
      
     cl_int Init(); 
     cl_int ConvertToString(const char *pFileName,std::string &Str);
@@ -30,6 +31,8 @@ class Device{
     void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
 
     void GetDeviceInfo();
+    
+    cl_program BuildProgram(const char*);    
 
     template <typename T>
     void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str);
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index 67d5e0b2..bcb834de 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -9,6 +9,7 @@
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/proto/caffe.pb.h"
+#include "caffe/util/ocl_wrapper.hpp"
 
 #define HDF5_DATA_DATASET_NAME "data"
 #define HDF5_DATA_LABEL_NAME "label"
@@ -487,9 +488,17 @@ class ReLULayer : public NeuronLayer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
-//OpenCL related setiup
+//OpenCL related setup
   void ocl_setup();
-
+//OpenCL wrapper
+  void ReLUForward_gpu(int count, const Dtype *bottom_data,Dtype *top_data, Dtype negative_slope)
+  { 
+      ReLUForward(ReLUForward_kernel,count,bottom_data,top_data,negative_slope);
+  }
+  void ReLUBackward_gpu(int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff, Dtype negative_slope)
+  {
+      ReLUBackward(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope);
+  }
  protected:
    cl_kernel ReLUForward_kernel;
    cl_kernel ReLUBackward_kernel;
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index c2720cf5..c9a391ac 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -31,7 +31,7 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
     Dtype* C);
 
 template <typename Dtype>
-cl_event caffe_gpu_gemmex( cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
+cl_event caffe_gpu_gemm( cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
     const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta,
     Dtype* C, const int offC);
@@ -44,7 +44,7 @@ void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA,
 
 
 template <typename Dtype>
-cl_event caffe_gpu_gemm_ex(const CBLAS_TRANSPOSE TransA,
+cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
     const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta,
     Dtype* C, const int offC);
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index bce26316..0e98ada0 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -51,7 +51,6 @@ cl_int Device::Init(){
     //printf("%s %s\n", platformName, openclVersion);
   
     GetDeviceInfo();
-    cl_device_id * pDevices;
     cl_uint uiNumDevices;
     cl_bool unified_memory = false;
     switch(Caffe::mode()) {
@@ -233,6 +232,35 @@ cl_int Device::ConvertToString(const char *pFileName,std::string &Str){
     return -1;
 }
 
+cl_program Device::BuildProgram(const char *pFileName)
+{
+      //Read our own kernel file
+    const char *pSource;
+    std::string strSource = "";
+    ConvertToString(pFileName, strSource);
+    pSource = strSource.c_str();
+    size_t uiArrSourceSize[] = {0};
+    uiArrSourceSize[0] = strlen(pSource);
+    cl_program program = NULL;
+    program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL);
+    if(NULL == program){
+        fprintf(stderr,"Err: Failed to create program\n");
+    }
+
+    //Build Program
+    cl_int iStatus = clBuildProgram(program, 1, pDevices, buildOption, NULL, NULL);
+    LOG(INFO) << "Build Program";
+    if(CL_SUCCESS != iStatus){
+        fprintf(stderr,"Err: Failed to build program\n");
+        char szBuildLog[16384];
+        clGetProgramBuildInfo(program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL);
+        std::cout << szBuildLog;
+        clReleaseProgram(program);
+        return NULL;
+    }
+  return program;
+}
+
 void Device::DisplayPlatformInfo(){
    cl_int err;
    size_t size;
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 5384c5e8..4bb1d1e6 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -304,7 +304,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
         (Dtype)0., output + output_offset_ * g);
     */
     //printf("weights.count() = %d, col_buff.count() = %d, output = %d\n", weights.count(), col_buff.count(), output.count());   
-    caffe_gpu_gemmex<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
+    caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
           conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_,
         (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
         (Dtype)0., output,  top_offset_+output_offset_ * g);
@@ -317,7 +317,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
   /*caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
       height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(),
       (Dtype)1., output);*/
-     caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
+     caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
           height_out_*width_out_, 1, (Dtype)1., bias, 0,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
           (Dtype)1., output, top_offset_);
@@ -331,7 +331,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
     col_buff = input;
   }
   for (int g = 0; g < group_; ++g) {
-        caffe_gpu_gemmex<Dtype>(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
+        caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
           (Dtype)1., weights,  weight_offset_ * g,
           output, top_offset_+output_offset_ * g,
           (Dtype)0., col_buff, col_offset_ * g);
@@ -354,7 +354,7 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
         kernel_dim_ / group_, conv_out_spatial_dim_,
         (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
         (Dtype)1., weights + weight_offset_ * g);*/
-      caffe_gpu_gemmex<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_,
+      caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_,
         (Dtype)1., output, top_offset_,
         (Dtype*)col_buff, col_offset_ * g, (Dtype)1.,
         (Dtype*)weights, weight_offset_ * g);
@@ -408,7 +408,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
     for (int g = 0; g < group_; ++g) {
        if(g == 0) Queue = amdDevice.CommandQueue;
        else Queue =  amdDevice.CommandQueue_helper;
-       prof_event = caffe_gpu_gemmex<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
           (Dtype)0., (Dtype*)subTopMem, top_offset * g);
        }
@@ -419,7 +419,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
 #else
     Queue = amdDevice.CommandQueue;
     for (int g = 0; g < group_; ++g) {
-       prof_event = caffe_gpu_gemmex<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
           (Dtype)0., (Dtype*)subTopMem, top_offset * g);
        }
@@ -431,7 +431,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
 
    for (int z = 0; z < opt_num2; z++)
       if (bias_term_) {
-      caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
+      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
           N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), 0,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
           (Dtype)1., top_data, top[i]->offset(n) + num_output_ * N_ * z);
@@ -499,7 +499,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
 #else
        Queue =  amdDevice.CommandQueue;
 #endif
-       prof_event = caffe_gpu_gemmex<Dtype>(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
+       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
         (Dtype)1., (Dtype*)subTopMem, top_offset * g,
         (Dtype*)transMem, col_offset * g, (Dtype)1.,
         (Dtype*)weight_diff, weight_offset * g);
@@ -514,7 +514,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
 #else
        Queue =  amdDevice.CommandQueue;
 #endif
-       prof_event =  caffe_gpu_gemmex<Dtype>(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_,
+       prof_event =  caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_,
           (Dtype)1., weight,  weight_offset * g,
           (Dtype*)subTopMem, top_offset * g,
           (Dtype)0., (Dtype*)transMem, col_offset * g);
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 8a6a3743..77697023 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -148,7 +148,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-      //this->backward_gpu_opt(top, propagate_down, bottom);
+      this->backward_gpu_opt(top, propagate_down, bottom);
 }
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 03dbbeb5..4242afa3 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -125,10 +125,10 @@ void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
   const Dtype* weight = this->blobs_[0]->gpu_data();
-  caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1.,
+  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1.,
       bottom_data, 0, weight, 0, (Dtype)0., top_data, 0);
   if (bias_term_) {
-    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1.,
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1.,
         bias_multiplier_.gpu_data(),0,
         this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0);
   }
@@ -142,7 +142,7 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* top_diff = top[0]->gpu_diff();
     const Dtype* bottom_data = bottom[0]->gpu_data();
     // Gradient with respect to weight
-    caffe_gpu_gemm_ex<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
+    caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
         top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0);
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
@@ -156,7 +156,7 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (propagate_down[0]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bottom data
-    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
         top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0.,
         bottom[0]->mutable_gpu_diff(), 0);
   }
diff --git a/src/caffe/layers/relu_layer.cl b/src/caffe/layers/relu_layer.cl
new file mode 100644
index 00000000..cebe24cd
--- /dev/null
+++ b/src/caffe/layers/relu_layer.cl
@@ -0,0 +1,22 @@
+template <class T>
+__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
+        int index = get_global_id(0);
+        if(index < count)
+                out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
+}
+
+//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
+
+template <class T>
+__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
+        int index = get_global_id(0);
+        if(index < count)
+                out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope;
+}
+
+template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
+template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
+
+
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index 8690e938..6ee3237a 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -7,9 +7,10 @@
 namespace caffe {
 template <typename Dtype>
 void ReLULayer<Dtype>::ocl_setup(){
+    cl_program program = amdDevice.BuildProgram("src/caffe/layers/relu_layer.cl");
     cl_int _err=0;
-    ReLUForward_kernel = clCreateKernel(amdDevice.Program,"ReLUForwardfloat",&_err);
-    ReLUBackward_kernel = clCreateKernel(amdDevice.Program,"ReLUBackwardfloat",&_err);
+    ReLUForward_kernel = clCreateKernel(program,"ReLUForwardfloat",&_err);
+    ReLUBackward_kernel = clCreateKernel(program,"ReLUBackwardfloat",&_err);
 }
 
 template <typename Dtype>
@@ -67,7 +68,7 @@ void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   //     << " top_data: " << (unsigned long)top_data
   //     << " blocks: " << CAFFE_GET_BLOCKS(count)
   //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
- ReLUForward(ReLUForward_kernel,count,bottom_data,top_data,negative_slope);
+ ReLUForward_gpu(count,bottom_data,top_data,negative_slope);
 }
 
 
@@ -85,7 +86,7 @@ void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 //    ReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
   //      count, top_diff, bottom_data, bottom_diff, negative_slope);
    // CUDA_POST_KERNEL_CHECK;
-   ReLUBackward(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope);
+   ReLUBackward_gpu(count,top_diff,bottom_data,bottom_diff,negative_slope);
   }
 }
 
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 9ba72e41..8bc16ea3 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -64,7 +64,7 @@ void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
 }
 
 template <>
-cl_event caffe_gpu_gemm_ex<float>(const CBLAS_TRANSPOSE TransA,
+cl_event caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
     const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) {
     cl_event event;
@@ -78,7 +78,7 @@ cl_event caffe_gpu_gemm_ex<float>(const CBLAS_TRANSPOSE TransA,
 }
 
 template <>
-cl_event caffe_gpu_gemm_ex<double>(const CBLAS_TRANSPOSE TransA,
+cl_event caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
     const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) {
     cl_event event;
@@ -93,7 +93,7 @@ cl_event caffe_gpu_gemm_ex<double>(const CBLAS_TRANSPOSE TransA,
 
 
 template <>
-cl_event caffe_gpu_gemmex<float>(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
+cl_event caffe_gpu_gemm<float>(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
     const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) {
     cl_event event;
@@ -108,7 +108,7 @@ cl_event caffe_gpu_gemmex<float>(cl_command_queue *queue, const CBLAS_TRANSPOSE
  }
 
 template <>
-cl_event caffe_gpu_gemmex<double>(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
+cl_event caffe_gpu_gemm<double>(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
     const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) {
     cl_event event;
@@ -620,12 +620,14 @@ template<>
 void caffe_gpu_sign<float>(const int N, const float *X, float *Y){
    cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL);
    caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y);
+   clReleaseKernel(caffe_gpu_sign_kernel);  
 }
 
 template<>
 void caffe_gpu_sign<double>(const int N, const double *X, double *Y){
    cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL);
    caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y);
+   clReleaseKernel(caffe_gpu_sign_kernel);
 }
 
 template <>

From 18257693490162cb3cc894d6fffb97fe457e7ad9 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 4 Aug 2015 04:43:24 +0800
Subject: [PATCH 020/124] cleaning up the conv opt interfaces

---
 src/caffe/layers/base_data_layer.cpp | 10 +++++++++-
 src/caffe/net.cpp                    | 14 +++++++++-----
 src/caffe/util/ocl_util.cpp          | 12 +-----------
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index fe3e4c25..60dfde75 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -3,6 +3,7 @@
 
 #include "caffe/data_layers.hpp"
 #include "caffe/util/io.hpp"
+#include "caffe/util/benchmark.hpp"
 
 namespace caffe {
 
@@ -86,8 +87,12 @@ void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const  vector<Blob<Dtype>*>& top) {
+  printf("HHHHHH Data forward time: n\n");
   // First, join the thread
   JoinPrefetchThread();
+  CPUTimer forward_timer;
+  forward_timer.Start();
+
   // Copy the data from prefetch thread to data_layer
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) );
    top[0]->ReshapeLike(prefetch_data_);
@@ -99,7 +104,10 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bo
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) );
    }
   clFinish(amdDevice.CommandQueue);
-  
+  forward_timer.Stop();
+  printf("Data forward time: %f\n\n", forward_timer.MilliSeconds());
+
+ 
 #ifdef Track_data_transfer
 #endif
   
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 9869b33f..4d20cdd7 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -506,17 +506,21 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
   }
 
   CPUTimer forward_timer;
+  CPUTimer layer_timer;
   forward_timer.Start();
 
   for (int i = start; i <= end; ++i) {
-    // LOG(ERROR) << "Forwarding " << layer_names_[i];
-//Yibing add for porting
-   printf("Forwarding %s\n",layer_names_[i].c_str());
-   Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
+   //double begin_time = GettickCount();
+    layer_timer.Start();
+   //printf("Forwarding %s\n",layer_names_[i].c_str());
+    Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
     loss += layer_loss;
     if (debug_info_) { ForwardDebugInfo(i); }
-//Yibing add for porting
     clFinish(amdDevice.CommandQueue);
+    //double end_time = GettickCount();
+    layer_timer.Stop();
+    //printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), end_time-begin_time);
+    printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds());
   }
 
   forward_timer.Stop();
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 8feead82..eef9f544 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -13,11 +13,6 @@ namespace caffe {
 template <typename Dtype>
 void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count){
     cl_int err=0;
-    //cl_kernel Kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", &err);
-    //if(NULL==Kernel){
-    //    fprintf(stderr, "Failed to create kernel %d\n", err);
-    //}   
- 
     err=clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer);
     err|=clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&value);
     err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count);
@@ -35,12 +30,7 @@ template void ocl_memset<double>(cl_kernel Kernel, double* buffer, const double
 
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){
-   cl_int err=0;
-  //  cl_kernel Kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
-   // if(NULL==Kernel){
-   //     fprintf(stderr, "Failed to create kernel %d\n", err);
-   // }
-
+    cl_int err=0;
     err =clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer);
     err|=clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&value);
     err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count);

From 0bf247998ff3c19f12aba682519db61299d031ad Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 4 Aug 2015 04:54:27 +0800
Subject: [PATCH 021/124] conv opt cleaning up cont.

---
 include/caffe/vision_layers.hpp      | 12 ++++++----
 src/caffe/layers/base_conv_layer.cpp | 31 +++++++++++++++++-------
 src/caffe/layers/conv_layer.cpp      | 36 ++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index a9c644c2..e763d31a 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -93,15 +93,18 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
      im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
            kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, 0);
-   //   im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, 
-     //           conv_in_width_, kernel_h_, pad_h_, stride_h_, col_buff, 0);
   }
   inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
     col2im_gpu(col2im_gpu_kernel, col_buff, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_);
-   //   col2im_gpu(col2im_gpu_kernel, col_buff, 0, conv_in_channels_, conv_in_height_, conv_in_width_,
-     //            kernel_h_, pad_h_, stride_h_, data, bottom_offset_);
   }
+  inline void conv_im2col_gpu_opt(const Dtype* data, Dtype* col_buff) {
+     im2col_gpu(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
+           kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2);
+  }
+  inline void conv_col2im_gpu_opt(const Dtype* col_buff, Dtype* data) {
+    col2im_gpu(col2im_opt_kernel, (Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
+        kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
 #endif
 
   int conv_out_channels_;
@@ -114,6 +117,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   int col_offset_;
   int output_offset_;
   int M_, N_, K_;
+  int opt_num2;
 
   Blob<Dtype> col_buffer_;
   Blob<Dtype> bias_multiplier_;
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 4bb1d1e6..54796ae8 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -298,12 +298,6 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
   }
   
   for (int g = 0; g < group_; ++g) {
-    /*caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
-        group_, conv_out_spatial_dim_, kernel_dim_ / group_,
-        (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
-        (Dtype)0., output + output_offset_ * g);
-    */
-    //printf("weights.count() = %d, col_buff.count() = %d, output = %d\n", weights.count(), col_buff.count(), output.count());   
     caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
           conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_,
         (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
@@ -311,12 +305,31 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
    }
 }
 
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
+    const Dtype* weights, Dtype* output, bool skip_im2col) {
+  const Dtype* col_buff = input;
+  if (!is_1x1_) {
+    if (!skip_im2col) {
+      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
+    im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
+                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
+    }   
+    col_buff = col_buffer_.gpu_data();
+  }
+  
+  for (int g = 0; g < group_; ++g) {
+    caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
+          conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_,
+        (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
+        (Dtype)0., output,  top_offset_+output_offset_ * g); 
+   }   
+}
+
+
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
     const Dtype* bias) {
-  /*caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
-      height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(),
-      (Dtype)1., output);*/
      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
           height_out_*width_out_, 1, (Dtype)1., bias, 0,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 77697023..9e863322 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -119,6 +119,42 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom
 
 }
 
+template <typename Dtype>
+void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+     //CHECK_BLOB_DATA(bottom[i],10,"bottom");
+
+    Dtype* top_data = top[i]->mutable_gpu_data();
+    //int col_offset = K_ * N_;
+    //int top_offset = M_ * N_;
+    //int weight_offset = M_ * K_;
+    int opt_num2 = global_packing_N;
+
+    for (int n = 0; n < this->num_; ++n) {
+      opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
+       //two intermediate variables to pass offset
+      this->top_offset_ = M_ * N_ * opt_num2;
+      this->col_offset_ = K_ * N_ * opt_num2;
+      this->bottom_offset_ = bottom[i]->offset(n);
+      this->forward_gpu_gemm_opt(bottom_data, weight,
+            top_data);
+
+      if (this->bias_term_) {
+        const Dtype* bias = this->blobs_[1]->gpu_data();
+          this->forward_gpu_bias(top_data, bias);
+      }
+    }
+  }
+
+  // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+  CHECK_BLOB_DATA(top[0],20, "top[0]");
+
+}
+
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {

From f3cd44851df30aecad4d081b32ba252f7d222ffd Mon Sep 17 00:00:00 2001
From: Junli Gu <junli.gu@amd.com>
Date: Mon, 3 Aug 2015 16:05:17 -0700
Subject: [PATCH 022/124] conv opt forward done

---
 include/caffe/util/im2col.hpp        |  2 +-
 include/caffe/vision_layers.hpp      | 27 +++++++++-----
 src/caffe/layers/base_conv_layer.cpp | 55 +++++++++++++++++++++-------
 src/caffe/layers/conv_layer.cpp      | 14 ++++---
 src/caffe/util/im2col.cpp            |  7 ++--
 5 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index 862a539b..5eb28f9a 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -53,7 +53,7 @@ void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
     const int stride, Dtype* data_col, const int col_offset);
 
 template <typename Dtype>
-void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, Dtype* data_col, const int col_offset, int optnum);
 
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index e763d31a..6ba4bfc5 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -59,6 +59,9 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
       weights);
   void backward_gpu_bias(Dtype* bias, const Dtype* input);
+  void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights,
+      Dtype* output, bool skip_im2col = false);
+  void forward_gpu_bias_opt(Dtype* output, const Dtype* bias);
 #endif
 
   // reverse_dimensions should return true iff we are implementing deconv, so
@@ -99,12 +102,16 @@ class BaseConvolutionLayer : public Layer<Dtype> {
         kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_);
   }
   inline void conv_im2col_gpu_opt(const Dtype* data, Dtype* col_buff) {
-     im2col_gpu(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
+     im2col_gpu_opt(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
            kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2);
   }
   inline void conv_col2im_gpu_opt(const Dtype* col_buff, Dtype* data) {
-    col2im_gpu(col2im_opt_kernel, (Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
+    col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
+}
+  inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
+    transform_gpu(ocl_Kernel_transform, (Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2);
+}
 #endif
 
   int conv_out_channels_;
@@ -113,11 +120,6 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   int conv_in_height_;
   int conv_in_width_;
   int kernel_dim_;
-  int weight_offset_;
-  int col_offset_;
-  int output_offset_;
-  int M_, N_, K_;
-  int opt_num2;
 
   Blob<Dtype> col_buffer_;
   Blob<Dtype> bias_multiplier_;
@@ -132,12 +134,15 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel;
   cl_kernel oclmem_kernel;
   cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform;
+  int opt_num2;
+  int M_, N_, K_;
+  int weight_offset_;
+  int col_offset_;
+  int output_offset_;
+  int top_offset_, top_offset_n, bottom_offset_;
 public:
   static cl_mem subTopMem, transMem;
   static size_t subtop_mem_size, trans_mem_size;
-
-public:
-  size_t top_offset_, bottom_offset_;
 };
 
 /**
@@ -210,6 +215,8 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
   virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 54796ae8..c6f24064 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -307,23 +307,40 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
-    const Dtype* weights, Dtype* output, bool skip_im2col) {
+    const Dtype* weight, Dtype* output, bool skip_im2col) {
   const Dtype* col_buff = input;
+  cl_command_queue Queue;
+  cl_event prof_event;
   if (!is_1x1_) {
     if (!skip_im2col) {
-      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
-    im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
-                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
+      conv_im2col_gpu_opt(input, col_buffer_.mutable_gpu_data());
     }   
-    col_buff = col_buffer_.gpu_data();
+    //col_buff = col_buffer_.gpu_data();
   }
-  
-  for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
-          conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_,
-        (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
-        (Dtype)0., output,  top_offset_+output_offset_ * g); 
-   }   
+ 
+#ifdef multiQ
+    for (int g = 0; g < group_; ++g) {
+       if(g == 0) Queue = amdDevice.CommandQueue;
+       else Queue =  amdDevice.CommandQueue_helper;
+       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+          (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
+          (Dtype)0., (Dtype*)subTopMem, top_offset_ * g);
+       }
+     if(group_ == 2){
+       clFinish(amdDevice.CommandQueue);
+       clFinish(amdDevice.CommandQueue_helper);
+     }
+#else
+    Queue = amdDevice.CommandQueue;
+    for (int g = 0; g < group_; ++g) {
+       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+          (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
+          (Dtype)0., (Dtype*)subTopMem, top_offset_ * g);
+       }
+#endif
+
+   conv_transform_gpu((Dtype*)subTopMem, output);
+
 }
 
 
@@ -336,6 +353,16 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
           (Dtype)1., output, top_offset_);
 }
 
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::forward_gpu_bias_opt(Dtype* output,
+    const Dtype* bias) {
+   for (int z = 0; z < opt_num2; z++)
+      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
+          N_, 1, (Dtype)1., bias, 0,
+          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
+          (Dtype)1., output, top_offset_n + num_output_ * N_ * z);
+}
+
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
     const Dtype* weights, Dtype* input) {
@@ -413,7 +440,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
     col_offset = K_ * N_ * opt_num2;
     //step1: packed im2col, col_size = (K_ * group_ ) * N_
     //this should be opt_num2 images packing together.
-    im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
+    im2col_gpu_opt(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
                        width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
 
     //step 2: sgemm: Top (subTopMem) = weight * col_data
@@ -496,7 +523,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
     col_offset = K_ * (N_ * opt_num2);
     //step1: packed im2col, col_size = (K_ * group_ ) * N_
     //this should be opt_num2 images packing together.
-    im2col_opt_gpu(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
+    im2col_gpu_opt(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
                        width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
 
     //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 9e863322..7a763dfb 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -132,20 +132,22 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
     //int col_offset = K_ * N_;
     //int top_offset = M_ * N_;
     //int weight_offset = M_ * K_;
-    int opt_num2 = global_packing_N;
+    this->opt_num2 = global_packing_N;
 
-    for (int n = 0; n < this->num_; ++n) {
-      opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
+    for (int n = 0; n < this->num_; n += this->opt_num2) {
+      this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
        //two intermediate variables to pass offset
-      this->top_offset_ = M_ * N_ * opt_num2;
-      this->col_offset_ = K_ * N_ * opt_num2;
+      this->top_offset_ = this->M_ * this->N_ * this->opt_num2;
+      this->top_offset_n = top[i]->offset(n);
+      this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
       this->bottom_offset_ = bottom[i]->offset(n);
+      this->weight_offset_ = this->M_ * this->K_;
       this->forward_gpu_gemm_opt(bottom_data, weight,
             top_data);
 
       if (this->bias_term_) {
         const Dtype* bias = this->blobs_[1]->gpu_data();
-          this->forward_gpu_bias(top_data, bias);
+          this->forward_gpu_bias_opt(top_data, bias);
       }
     }
   }
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index b9257675..4d28ab1e 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -285,7 +285,7 @@ template void im2col_16_gpu<double>(cl_kernel Kernel, const double* data_im, con
     const int stride, double* data_col, const int col_offset);
 
 template <typename Dtype>
-void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, Dtype* data_col, const int col_offset, int optnum) {
 
@@ -315,10 +315,10 @@ void im2col_opt_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
 
-template void im2col_opt_gpu<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
+template void im2col_gpu_opt<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, float* data_col, const int col_offset, int optnum);
-template void im2col_opt_gpu<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
+template void im2col_gpu_opt<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, double* data_col, const int col_offset,  int optnum);
 
@@ -384,7 +384,6 @@ void im2col_gpu_ocl(cl_mem data_im, const int channels,
     ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&width_col);
     OCL_CHECK( clSetKernelArg(Kernel,9,sizeof(cl_mem),(void*)&data_col) );
 
-    //std::cout<<"num_kernels"<<num_kernels<<" data_im"<<data_im<<" height"<<height<<" width"<<width<<" ksize"<<ksize<<" pad"<<pad<<" stride"<<stride<<" height_col"<<height_col<<" width_col"<<width_col<<" data_col"<<data_col<<std::endl;
     if(ret!=CL_SUCCESS){
         fprintf(stderr,"Failed to Set Args\n");
     }

From e95fd84c2fe11683efb747f0736b43a80cd6f008 Mon Sep 17 00:00:00 2001
From: Junli Gu <junli.gu@amd.com>
Date: Mon, 3 Aug 2015 17:52:24 -0700
Subject: [PATCH 023/124] conf opt backward interfaces

---
 include/caffe/vision_layers.hpp      | 18 +++++--
 src/caffe/layers/base_conv_layer.cpp | 66 +++++++++++++++++++++++--
 src/caffe/layers/conv_layer.cpp      | 73 ++++++++++++++++++++--------
 3 files changed, 130 insertions(+), 27 deletions(-)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 6ba4bfc5..233bf48f 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -53,15 +53,19 @@ class BaseConvolutionLayer : public Layer<Dtype> {
 #ifndef CPU_ONLY
   void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
       Dtype* output, bool skip_im2col = false);
+  void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights,
+      Dtype* output, bool skip_im2col = false);
   void forward_gpu_bias(Dtype* output, const Dtype* bias);
+  void forward_gpu_bias_opt(Dtype* output, const Dtype* bias);
   void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
       Dtype* col_output);
+  void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights,
+      Dtype* col_output);
   void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
       weights);
+  void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype*
+      weights);
   void backward_gpu_bias(Dtype* bias, const Dtype* input);
-  void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights,
-      Dtype* output, bool skip_im2col = false);
-  void forward_gpu_bias_opt(Dtype* output, const Dtype* bias);
 #endif
 
   // reverse_dimensions should return true iff we are implementing deconv, so
@@ -111,6 +115,12 @@ class BaseConvolutionLayer : public Layer<Dtype> {
 }
   inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
     transform_gpu(ocl_Kernel_transform, (Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2);
+}
+ inline void conv_transpose_gpu(const Dtype* data){
+    opttrans(opttrans_kernel, data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
+}
+  inline void ocl_memset(Dtype* data, Dtype value, int count) {
+    ocl_memset(oclmem_kernel, data, value, count);
 }
 #endif
 
@@ -219,6 +229,8 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
       const vector<Blob<Dtype>*>& top);
   virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index c6f24064..ebd65713 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -381,6 +381,38 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
   }
 }
 
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
+    const Dtype* weights, Dtype* input) {
+  //Dtype* col_buff = col_buffer_.mutable_gpu_data();
+  cl_command_queue Queue;
+  if (is_1x1_) {
+    (Dtype*)transMem = input;
+  }
+  for (int g = 0; g < group_; ++g) {
+#ifdef multiQ
+       if(g == 0) Queue = amdDevice.CommandQueue;
+       else Queue =  amdDevice.CommandQueue_helper;
+#else
+       Queue =  amdDevice.CommandQueue;
+#endif
+       caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, conv_out_channels_ / group_,
+          (Dtype)1., weights,  weight_offset_ * g,
+          (Dtype*)subTopMem, top_offset_ * g,
+          (Dtype)0., (Dtype*)transMem, col_offset_ * g);
+      }
+#ifdef multiQ
+   if(group_ ==2){
+      clFinish(amdDevice.CommandQueue);
+      clFinish(amdDevice.CommandQueue_helper);
+    }
+#endif
+
+  if (!is_1x1_) {
+      conv_col2im_gpu_opt((Dtype*)transMem, input);
+  }
+}
+
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
     const Dtype* output, Dtype* weights) {
@@ -390,16 +422,42 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
     col_buff = col_buffer_.gpu_data();
   }
   for (int g = 0; g < group_; ++g) {
-   /* caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
-        kernel_dim_ / group_, conv_out_spatial_dim_,
-        (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
-        (Dtype)1., weights + weight_offset_ * g);*/
       caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_,
         (Dtype)1., output, top_offset_,
         (Dtype*)col_buff, col_offset_ * g, (Dtype)1.,
         (Dtype*)weights, weight_offset_ * g);
  }
 }
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
+    const Dtype* output, Dtype* weights) {
+  const Dtype* col_buff = input;
+  cl_command_queue Queue;
+  if (!is_1x1_) {
+    conv_im2col_gpu_opt(input, col_buffer_.mutable_gpu_data());
+    //col_buff = col_buffer_.gpu_data();
+  }
+    conv_transpose_gpu(output);
+
+  for (int g = 0; g < group_; ++g) {
+#ifdef multiQ
+       if(g == 0) Queue = amdDevice.CommandQueue;
+       else Queue =  amdDevice.CommandQueue_helper;
+#else
+       Queue =  amdDevice.CommandQueue;
+#endif
+       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2,
+        (Dtype)1., (Dtype*)subTopMem, top_offset_ * g,
+        (Dtype*)transMem, col_offset_ * g, (Dtype)1.,
+        (Dtype*)weights, weight_offset_ * g);
+#ifdef multiQ
+     if(group_ == 2){
+       clFinish(amdDevice.CommandQueue);
+       clFinish(amdDevice.CommandQueue_helper);
+     }
+#endif
+    }
+}
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 7a763dfb..34490f68 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -77,7 +77,7 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const  vector<Blob<Dtype>*>& top) {
   if (use_packing_scheme && global_packing_N >1)
-   Forward_gpu_opt(bottom, top);
+   Forward_gpu_opt2(bottom, top);
   else
    Forward_gpu_org(bottom, top);
 }
@@ -97,22 +97,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom
   const Dtype* weight = this->blobs_[0]->gpu_data();
   this->forward_gpu_opt(bottom, weight, top);
 
-/*
-#ifdef check_gradient
-   const Dtype *cpu_bottom_data = bottom[0]->cpu_data();   Dtype *cpu_top_data = (Dtype*)(*top)[0]->cpu_data();
-
-   printf("\n\nbottom data GPU:\n");
-   for(int i=0; i<channels_*height_*width_; i+=1000){
-       printf("%f,",cpu_bottom_data[i]);
-       if(i%16==15) printf("\n");
-   }
-  printf("\n\ntop data GPU:\n");
-   for(int i=0; i<M_org*N_*num_; i+=100000){
-       printf("%f,",cpu_top_data[i]);
-      if(i%16==15) printf("\n");
-   }
-  printf("\n\n");#endif
-*/
 #ifdef Track_layer
   LOG(WARNING) << "conv fp done";
 #endif
@@ -129,9 +113,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
      //CHECK_BLOB_DATA(bottom[i],10,"bottom");
 
     Dtype* top_data = top[i]->mutable_gpu_data();
-    //int col_offset = K_ * N_;
-    //int top_offset = M_ * N_;
-    //int weight_offset = M_ * K_;
     this->opt_num2 = global_packing_N;
 
     for (int n = 0; n < this->num_; n += this->opt_num2) {
@@ -183,11 +164,63 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
   // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
   CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
+
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
       this->backward_gpu_opt(top, propagate_down, bottom);
 }
+
+
+template <typename Dtype>
+void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+    
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+      this->ocl_memset(bias_diff, 0., this->blobs_[1]->count());
+      for (int n = 0; n < this->num_; ++n) {
+       //
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        this->backward_gpu_bias(bias_diff, top_diff);
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      this->weight_offset_ = this->M_ * this->K_;
+      this->opt_num2 = global_packing_N;
+      for (int n = 0; n < this->num_; ++n) {
+        this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
+        this->top_offset_n = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
+        this->top_offset_ = this->M_ * (this->N * this->opt_num2);
+        // gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          this->weight_gpu_gemm_opt(bottom_data,
+              top_diff, weight_diff);
+        }
+        // gradient w.r.t. bottom data, if necessary.
+        if (propagate_down[i]) {
+          this->backward_gpu_gemm_opt(top_diff, weight,
+              bottom_diff);
+        }
+      }
+    }
+  }
+
+  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
+  CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
+  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
+  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
+}
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

From 2fdb29af929e7023f520a2cd55be506a1d7b1b9d Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 4 Aug 2015 14:09:03 +0800
Subject: [PATCH 024/124] finished debugging for conv optimized scheme

---
 include/caffe/vision_layers.hpp      | 3 ++-
 src/caffe/layers/base_conv_layer.cpp | 3 ++-
 src/caffe/layers/conv_layer.cpp      | 4 ++--
 src/caffe/util/math_functions.cpp    | 4 ++--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 233bf48f..6f306545 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -119,7 +119,8 @@ class BaseConvolutionLayer : public Layer<Dtype> {
  inline void conv_transpose_gpu(const Dtype* data){
     opttrans(opttrans_kernel, data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
 }
-  inline void ocl_memset(Dtype* data, Dtype value, int count) {
+protected:
+  inline void gpu_memset(Dtype* data, Dtype value, int count) {
     ocl_memset(oclmem_kernel, data, value, count);
 }
 #endif
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index ebd65713..99643465 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -387,7 +387,8 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
   //Dtype* col_buff = col_buffer_.mutable_gpu_data();
   cl_command_queue Queue;
   if (is_1x1_) {
-    (Dtype*)transMem = input;
+    int count = height_ * width_ * conv_in_channels_ * opt_num2;
+    caffe_gpu_copy(count, input, (Dtype*)transMem);
   }
   for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 34490f68..2dc65ac7 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -183,7 +183,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
     // Bias gradient, if necessary.
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      this->ocl_memset(bias_diff, 0., this->blobs_[1]->count());
+      this->gpu_memset(bias_diff, 0., this->blobs_[1]->count());
       for (int n = 0; n < this->num_; ++n) {
        //
         this->top_offset_ = top[i]->offset(n);
@@ -201,7 +201,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
         this->top_offset_n = top[i]->offset(n);
         this->bottom_offset_ = bottom[i]->offset(n);
         this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
-        this->top_offset_ = this->M_ * (this->N * this->opt_num2);
+        this->top_offset_ = this->M_ * (this->N_ * this->opt_num2);
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
           this->weight_gpu_gemm_opt(bottom_data,
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 8bc16ea3..b877da50 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -241,13 +241,13 @@ void caffe_copy<double>(const int N, const double* X, double* Y) {
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
   if(X != Y)
-      CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+      CLBLAS_CHECK( clblasScopy( N * sizeof(float), (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 }
 
 template <>
 void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
   if(X != Y)
-      CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+      CLBLAS_CHECK( clblasDcopy( N * sizeof(double), (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 }
 
 template <>

From 9a416709ac27ab960be180f5eba9689789e78209 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 4 Aug 2015 14:19:12 +0800
Subject: [PATCH 025/124] minor change

---
 src/caffe/layers/conv_layer.cpp   | 2 +-
 src/caffe/util/math_functions.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 2dc65ac7..342d7842 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -77,7 +77,7 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const  vector<Blob<Dtype>*>& top) {
   if (use_packing_scheme && global_packing_N >1)
-   Forward_gpu_opt2(bottom, top);
+   Forward_gpu_opt(bottom, top);
   else
    Forward_gpu_org(bottom, top);
 }
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index b877da50..8bc16ea3 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -241,13 +241,13 @@ void caffe_copy<double>(const int N, const double* X, double* Y) {
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
   if(X != Y)
-      CLBLAS_CHECK( clblasScopy( N * sizeof(float), (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+      CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 }
 
 template <>
 void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
   if(X != Y)
-      CLBLAS_CHECK( clblasDcopy( N * sizeof(double), (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+      CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 }
 
 template <>

From 472b84a5ac9aed901cd2f396c8a89f90a897288e Mon Sep 17 00:00:00 2001
From: Junli Gu <junli.gu@amd.com>
Date: Wed, 5 Aug 2015 13:34:43 -0700
Subject: [PATCH 026/124] conv layer clean up

---
 include/caffe/vision_layers.hpp |  9 +++++----
 src/caffe/layers/conv_layer.cpp | 14 ++++++++------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 6f306545..8498cb58 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -125,6 +125,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
 }
 #endif
 
+private:
   int conv_out_channels_;
   int conv_in_channels_;
   int conv_out_spatial_dim_;
@@ -224,12 +225,12 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
       const vector<Blob<Dtype>*>& top);
   virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
+  //virtual void Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
+    //  const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  //virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
+    //  const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 342d7842..1037a8cf 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -77,7 +77,7 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const  vector<Blob<Dtype>*>& top) {
   if (use_packing_scheme && global_packing_N >1)
-   Forward_gpu_opt(bottom, top);
+   Forward_gpu_opt2(bottom, top);
   else
    Forward_gpu_org(bottom, top);
 }
@@ -86,11 +86,12 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
     if (use_packing_scheme && global_packing_N >1)
-      Backward_gpu_opt(top, propagate_down, bottom);
+      Backward_gpu_opt2(top, propagate_down, bottom);
     else
       Backward_gpu_org(top, propagate_down, bottom);
 }
 
+/*
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -101,7 +102,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom
   LOG(WARNING) << "conv fp done";
 #endif
 
-}
+}*/
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
@@ -117,7 +118,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
 
     for (int n = 0; n < this->num_; n += this->opt_num2) {
       this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
-       //two intermediate variables to pass offset
+       //intermediate variables to pass offset
       this->top_offset_ = this->M_ * this->N_ * this->opt_num2;
       this->top_offset_n = top[i]->offset(n);
       this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
@@ -133,7 +134,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
     }
   }
 
-  // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+  CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
   CHECK_BLOB_DATA(top[0],20, "top[0]");
 
 }
@@ -165,12 +166,13 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
   CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
+/*
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
       this->backward_gpu_opt(top, propagate_down, bottom);
 }
-
+*/
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,

From 649b3abe716c281aef9e6d141c9c8cf4fb8c812c Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 6 Aug 2015 04:52:32 +0800
Subject: [PATCH 027/124] fixed the bug in syncedmem set_cpu_data

---
 include/caffe/util/math_functions.hpp    |  2 +-
 src/caffe/OCL_kernel.cl                  |  1 +
 src/caffe/data_transformer.cpp           |  5 ++-
 src/caffe/layers/base_conv_layer.cpp     |  4 +--
 src/caffe/layers/base_data_layer.cpp     | 42 ++++++++++++++++--------
 src/caffe/layers/inner_product_layer.cpp |  2 +-
 src/caffe/net.cpp                        |  1 +
 src/caffe/syncedmem.cpp                  | 12 +++++--
 src/caffe/util/math_functions.cpp        |  4 +--
 9 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index c9a391ac..a5ca6470 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -55,7 +55,7 @@ void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
     Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_gemvv(const CBLAS_TRANSPOSE TransA, const int M,
+void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M,
     const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda,
     const Dtype * x, size_t offx, const Dtype beta, int incx,
     Dtype* y, size_t offy, int incy);
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 48076725..8ab1c711 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -742,6 +742,7 @@ __kernel void OCL_memset2(__global int* buffer, const int value, const int size)
                 buffer[gdx] = value;    
         }
 }
+template __attribute__ ((mangled_name(oclmem))) __kernel void OCL_memset2(__global int* buffer, const int value, const int size);
 
 template <class T>
 __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index 2a3bc645..f6d80dc2 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -7,7 +7,7 @@
 #include "caffe/util/io.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/util/rng.hpp"
-
+#include "caffe/util/benchmark.hpp"
 namespace caffe {
 
 template<typename Dtype>
@@ -24,7 +24,6 @@ DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
     ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
     data_mean_.FromProto(blob_proto);
   }
-  printf("before if\n");
   // check if we want to use mean_value
   if (param_.mean_value_size() > 0) {
     CHECK(param_.has_mean_file() == false) <<
@@ -33,7 +32,6 @@ DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
       mean_values_.push_back(param_.mean_value(c));
     }
   }
-  printf("reaches here\n");
 }
 
 template<typename Dtype>
@@ -127,6 +125,7 @@ void DataTransformer<Dtype>::Transform(const Datum& datum,
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
                                        Blob<Dtype>* transformed_blob) {
+
   // If datum is encoded, decoded and transform the cv::image.
   if (datum.encoded()) {
     CHECK(!(param_.force_color() && param_.force_gray()))
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 99643465..55046847 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -465,7 +465,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
     const Dtype* input) {
  /* caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
       input, bias_multiplier_.gpu_data(), 1., bias);*/
-      caffe_gpu_gemvv<Dtype>(CblasNoTrans, num_output_, height_out_*width_out_,
+      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_*width_out_,
           (Dtype)1., input, top_offset_, height_out_*width_out_,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
           bias, (size_t)0, 1);
@@ -553,7 +553,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
       ocl_memset(oclmem_kernel, bias_diff, (Dtype)(0.), this->blobs_[1]->count());
     for (int n = 0; n < num_; ++n) {
-      caffe_gpu_gemvv<Dtype>(CblasNoTrans, M_, N_,
+      caffe_gpu_gemv<Dtype>(CblasNoTrans, M_, N_,
           (Dtype)1., top_diff, top[i]->offset(n), N_,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
           bias_diff, (size_t)0, 1);
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 60dfde75..1b6e07fa 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -60,7 +60,13 @@ template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   // First, join the thread
+  CPUTimer forward_timer, join_prefetch_timer, create_prefetch_timer;
+  join_prefetch_timer.Start();
   JoinPrefetchThread();
+  join_prefetch_timer.Stop();
+  printf("join prefetch thread: %f\n", join_prefetch_timer.MilliSeconds());
+
+  forward_timer.Start();
   DLOG(INFO) << "Thread joined";
   // Reshape to loaded data.
   top[0]->ReshapeLike(prefetch_data_);
@@ -75,37 +81,42 @@ void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
     caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
                top[1]->mutable_cpu_data());
   }
-
-  CHECK_BLOB_DATA(top[0], 20, "top[0]");
-
+  forward_timer.Stop();
+  printf("write buffer time: %f\n", forward_timer.MilliSeconds());
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
+  create_prefetch_timer.Start();
   CreatePrefetchThread();
-
+  create_prefetch_timer.Stop();
+  printf("create prefetch time: %f\n", create_prefetch_timer.MilliSeconds() );
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const  vector<Blob<Dtype>*>& top) {
-  printf("HHHHHH Data forward time: n\n");
-  // First, join the thread
+  CPUTimer forward_timer, join_prefetch_timer, create_prefetch_timer;
+  
+  join_prefetch_timer.Start();
   JoinPrefetchThread();
-  CPUTimer forward_timer;
-  forward_timer.Start();
-
+  join_prefetch_timer.Stop();
+  printf("join prefetch thread: %f\n", join_prefetch_timer.MilliSeconds());
   // Copy the data from prefetch thread to data_layer
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) );
-   top[0]->ReshapeLike(prefetch_data_);
-    OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) );
+  
+    clFinish(amdDevice.CommandQueue);
+       forward_timer.Start();
+   top[0]->ReshapeLike(this->prefetch_data_);
+   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) );
   if (this->output_labels_) {
        // Reshape to loaded labels.
-    top[1]->ReshapeLike(prefetch_label_);
+   top[1]->ReshapeLike(prefetch_label_);
    OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) );
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) );
    }
-  clFinish(amdDevice.CommandQueue);
+  
+//  clFinish(amdDevice.CommandQueue);
   forward_timer.Stop();
-  printf("Data forward time: %f\n\n", forward_timer.MilliSeconds());
+  printf("Write buffer time: %f\n\n", forward_timer.MilliSeconds());
 
  
 #ifdef Track_data_transfer
@@ -115,7 +126,10 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bo
 
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
+  create_prefetch_timer.Start();
   CreatePrefetchThread();
+  create_prefetch_timer.Stop();
+  printf("create_prefetch time: %f\n", create_prefetch_timer.MilliSeconds());
   //return Dtype(0.);
 }
 
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 4242afa3..676650c2 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -148,7 +148,7 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bias
-    caffe_gpu_gemvv<Dtype>(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff,
+    caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff,
         (size_t)0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()),
          (size_t)0, (Dtype)0., 1,
         this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1);
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 4d20cdd7..df376ff0 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -35,6 +35,7 @@ Net<Dtype>::Net(const string& param_file, Phase phase) {
 template <typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
   // Set phase from the state.
+  amdDevice.Init();
   phase_ = in_param.state().phase();
   // Filter layers based on their include/exclude rules and
   // the current NetState.
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index e98e6847..a44641ef 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -36,7 +36,7 @@ if (cpu_ptr_ && own_cpu_data_) {
 
 void SyncedMemory::ocl_setup() {
   cl_int err=0;
-  oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
+  oclmem_kernel = clCreateKernel(amdDevice.Program, "memset", &err);
   OCL_CHECK(err);
 }
 
@@ -125,7 +125,7 @@ const void* SyncedMemory::cpu_data() {
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
-CHECK(data);
+/*CHECK(data);
   if (own_cpu_data_) {
   OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL));
   OCL_CHECK( clReleaseMemObject((cl_mem) gpu_cache_ptr_));
@@ -135,6 +135,14 @@ CHECK(data);
   cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL);
   head_ = HEAD_AT_CPU;
   own_cpu_data_ = false;
+*/
+  CHECK(data);
+  if (own_cpu_data_) {
+    CaffeFreeHost(cpu_ptr_);
+  }
+  cpu_ptr_ = data;
+  head_ = HEAD_AT_CPU;
+  own_cpu_data_ = false;
 }
 
 const void* SyncedMemory::gpu_data() {
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 8bc16ea3..f4ac6617 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -137,7 +137,7 @@ void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
 }
 
 template <>
-void caffe_gpu_gemvv<float>(const CBLAS_TRANSPOSE TransA, const int M,
+void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
     const int N, const float alpha, const float* A, size_t offA, int lda, 
     const float* x, size_t offx, const float beta, int incx, 
     float* y, size_t offy, int incy) {
@@ -150,7 +150,7 @@ void caffe_gpu_gemvv<float>(const CBLAS_TRANSPOSE TransA, const int M,
 }
 
 template <>
-void caffe_gpu_gemvv<double>(const CBLAS_TRANSPOSE TransA, const int M,
+void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
     const int N, const double alpha, const double* A, size_t offA, int lda,
     const double* x, size_t offx, const double beta, int incx,
     double* y, size_t offy, int incy) {

From b204a85ca226acee4c63bb16fb50ceb8cf324730 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 6 Aug 2015 11:13:11 +0800
Subject: [PATCH 028/124] gconv opt debug

---
 include/caffe/vision_layers.hpp      | 12 ++++++------
 src/caffe/OCL_kernel.cl              |  1 -
 src/caffe/layers/base_conv_layer.cpp | 10 +++-------
 src/caffe/layers/conv_layer.cpp      | 14 ++++----------
 src/caffe/net.cpp                    |  2 +-
 src/caffe/syncedmem.cpp              |  2 +-
 src/caffe/util/math_functions.cpp    |  2 --
 7 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 8498cb58..336127d5 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -105,11 +105,11 @@ class BaseConvolutionLayer : public Layer<Dtype> {
     col2im_gpu(col2im_gpu_kernel, col_buff, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_);
   }
-  inline void conv_im2col_gpu_opt(const Dtype* data, Dtype* col_buff) {
+  inline void conv_im2col_gpu_opt(const Dtype* data) {
      im2col_gpu_opt(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
            kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2);
   }
-  inline void conv_col2im_gpu_opt(const Dtype* col_buff, Dtype* data) {
+  inline void conv_col2im_gpu_opt( Dtype* data) {
     col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
 }
@@ -225,12 +225,12 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
       const vector<Blob<Dtype>*>& top);
   virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  //virtual void Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
-    //  const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  //virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-    //  const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
index 8ab1c711..48076725 100644
--- a/src/caffe/OCL_kernel.cl
+++ b/src/caffe/OCL_kernel.cl
@@ -742,7 +742,6 @@ __kernel void OCL_memset2(__global int* buffer, const int value, const int size)
                 buffer[gdx] = value;    
         }
 }
-template __attribute__ ((mangled_name(oclmem))) __kernel void OCL_memset2(__global int* buffer, const int value, const int size);
 
 template <class T>
 __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 55046847..8edecdc0 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -308,16 +308,14 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
     const Dtype* weight, Dtype* output, bool skip_im2col) {
-  const Dtype* col_buff = input;
   cl_command_queue Queue;
   cl_event prof_event;
   if (!is_1x1_) {
     if (!skip_im2col) {
-      conv_im2col_gpu_opt(input, col_buffer_.mutable_gpu_data());
+      conv_im2col_gpu_opt(input);
     }   
     //col_buff = col_buffer_.gpu_data();
   }
- 
 #ifdef multiQ
     for (int g = 0; g < group_; ++g) {
        if(g == 0) Queue = amdDevice.CommandQueue;
@@ -338,9 +336,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
           (Dtype)0., (Dtype*)subTopMem, top_offset_ * g);
        }
 #endif
-
    conv_transform_gpu((Dtype*)subTopMem, output);
-
 }
 
 
@@ -410,7 +406,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 #endif
 
   if (!is_1x1_) {
-      conv_col2im_gpu_opt((Dtype*)transMem, input);
+      conv_col2im_gpu_opt(input);
   }
 }
 
@@ -435,7 +431,7 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
   const Dtype* col_buff = input;
   cl_command_queue Queue;
   if (!is_1x1_) {
-    conv_im2col_gpu_opt(input, col_buffer_.mutable_gpu_data());
+    conv_im2col_gpu_opt(input);
     //col_buff = col_buffer_.gpu_data();
   }
     conv_transpose_gpu(output);
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 1037a8cf..48b7afe9 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -77,7 +77,7 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const  vector<Blob<Dtype>*>& top) {
   if (use_packing_scheme && global_packing_N >1)
-   Forward_gpu_opt2(bottom, top);
+   Forward_gpu_opt(bottom, top);
   else
    Forward_gpu_org(bottom, top);
 }
@@ -86,12 +86,11 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
     if (use_packing_scheme && global_packing_N >1)
-      Backward_gpu_opt2(top, propagate_down, bottom);
+      Backward_gpu_opt(top, propagate_down, bottom);
     else
       Backward_gpu_org(top, propagate_down, bottom);
 }
 
-/*
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -102,12 +101,11 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom
   LOG(WARNING) << "conv fp done";
 #endif
 
-}*/
+}
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-
   const Dtype* weight = this->blobs_[0]->gpu_data();
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
@@ -115,7 +113,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
 
     Dtype* top_data = top[i]->mutable_gpu_data();
     this->opt_num2 = global_packing_N;
-
     for (int n = 0; n < this->num_; n += this->opt_num2) {
       this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
        //intermediate variables to pass offset
@@ -126,12 +123,11 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
       this->weight_offset_ = this->M_ * this->K_;
       this->forward_gpu_gemm_opt(bottom_data, weight,
             top_data);
-
       if (this->bias_term_) {
         const Dtype* bias = this->blobs_[1]->gpu_data();
           this->forward_gpu_bias_opt(top_data, bias);
       }
-    }
+   }
   }
 
   CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
@@ -166,13 +162,11 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
   CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
-/*
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
       this->backward_gpu_opt(top, propagate_down, bottom);
 }
-*/
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index df376ff0..ad6bdc7e 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -35,7 +35,7 @@ Net<Dtype>::Net(const string& param_file, Phase phase) {
 template <typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
   // Set phase from the state.
-  amdDevice.Init();
+  //amdDevice.Init();
   phase_ = in_param.state().phase();
   // Filter layers based on their include/exclude rules and
   // the current NetState.
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index a44641ef..ac1187b9 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -36,7 +36,7 @@ if (cpu_ptr_ && own_cpu_data_) {
 
 void SyncedMemory::ocl_setup() {
   cl_int err=0;
-  oclmem_kernel = clCreateKernel(amdDevice.Program, "memset", &err);
+  oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
   OCL_CHECK(err);
 }
 
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index f4ac6617..54e0abdc 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -102,7 +102,6 @@ cl_event caffe_gpu_gemm<float>(cl_command_queue *queue, const CBLAS_TRANSPOSE Tr
     int lda = (TransA == CblasNoTrans) ? K : M;
     int ldb = (TransB == CblasNoTrans) ? N : K;
     int ldc = N;
-    //AMDBLAS_CHECK( clAmdBlasSgemmEx(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, NULL) );
     CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) );
     return event;
  }
@@ -117,7 +116,6 @@ cl_event caffe_gpu_gemm<double>(cl_command_queue *queue, const CBLAS_TRANSPOSE T
     int lda = (TransA == CblasNoTrans) ? K : M;
     int ldb = (TransB == CblasNoTrans) ? N : K;
     int ldc = N;
-    //AMDBLAS_CHECK( clAmdBlasSgemmEx(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, NULL) );
     CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) );
     return event;
 }

From 4c4b9d33eade5699a5c16870cd4c06ecdfd6bdd0 Mon Sep 17 00:00:00 2001
From: Yuan <Yuan.Gao@noplz.name>
Date: Sat, 8 Aug 2015 05:18:22 +0800
Subject: [PATCH 029/124] Split OpenCL kernels into different files

---
 include/caffe/device.hpp               |    4 +-
 src/caffe/OCL_kernel.cl                | 1837 ------------------------
 src/caffe/device.cpp                   |   29 +-
 src/caffe/ocl/OCL_kernel.cl            |  999 +++++++++++++
 src/caffe/ocl/dropout_layer.cl         |   18 +
 src/caffe/ocl/im2col.cl                |  298 ++++
 src/caffe/ocl/lrn_layer.cl             |  113 ++
 src/caffe/ocl/pooling_layer.cl         |  267 ++++
 src/caffe/ocl/relu_layer.cl            |   20 +
 src/caffe/ocl/softmax_layer.cl         |   48 +
 src/caffe/ocl/softmaxwithloss_layer.cl |   65 +
 11 files changed, 1852 insertions(+), 1846 deletions(-)
 delete mode 100644 src/caffe/OCL_kernel.cl
 create mode 100644 src/caffe/ocl/OCL_kernel.cl
 create mode 100644 src/caffe/ocl/dropout_layer.cl
 create mode 100644 src/caffe/ocl/im2col.cl
 create mode 100644 src/caffe/ocl/lrn_layer.cl
 create mode 100644 src/caffe/ocl/pooling_layer.cl
 create mode 100644 src/caffe/ocl/relu_layer.cl
 create mode 100644 src/caffe/ocl/softmax_layer.cl
 create mode 100644 src/caffe/ocl/softmaxwithloss_layer.cl

diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 7360dacd..0b534e57 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -26,13 +26,13 @@ class Device{
     
      
     cl_int Init(); 
-    cl_int ConvertToString(const char *pFileName,std::string &Str);
+    cl_int ConvertToString(std::string pFileName,std::string &Str);
     void DisplayPlatformInfo();
     void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
 
     void GetDeviceInfo();
     
-    cl_program BuildProgram(const char*);    
+    cl_program BuildProgram(std::string);    
 
     template <typename T>
     void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str);
diff --git a/src/caffe/OCL_kernel.cl b/src/caffe/OCL_kernel.cl
deleted file mode 100644
index 48076725..00000000
--- a/src/caffe/OCL_kernel.cl
+++ /dev/null
@@ -1,1837 +0,0 @@
-#pragma OPENCL EXTENSION cl_amd_printf : enable
-
-//beginning of the looooooong gpu_random_generator kernel 
-//we use the open sourced threefry's GPU implementation
-typedef uint uint32_t;
-
-struct r123array4x32 {	uint32_t v[4]; };
-
-enum r123_enum_threefry32x4 
-{
-	R_32x4_0_0 = 10, R_32x4_0_1 = 26,
-	R_32x4_1_0 = 11, R_32x4_1_1 = 21,
-	R_32x4_2_0 = 13, R_32x4_2_1 = 27,
-	R_32x4_3_0 = 23, R_32x4_3_1 =  5,
-	R_32x4_4_0 =  6, R_32x4_4_1 = 20,
-	R_32x4_5_0 = 17, R_32x4_5_1 = 11,
-	R_32x4_6_0 = 25, R_32x4_6_1 = 10,
-	R_32x4_7_0 = 18, R_32x4_7_1 = 20
-};
-
-inline uint32_t	RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline));
-inline uint32_t RotL_32(uint32_t x, unsigned int N)
-{
-	return (x << (N & 31)) | (x >> ((32 - N) & 31));
-}
-
-typedef struct r123array4x32 threefry4x32_ctr_t;
-typedef struct r123array4x32 threefry4x32_key_t;
-typedef struct r123array4x32 threefry4x32_ukey_t;
-
-inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline));
-inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)
-{
-	threefry4x32_ctr_t	X;
-	uint32_t			ks[4 + 1];
-	int					i;
-	ks[4] = 0x1BD11BDA;
-	/*
-	for (i = 0; i < 4; i++)
-	{
-		ks[i] = k.v[i];
-		X.v[i] = in.v[i];
-		ks[4] ^= k.v[i];
-	}*/ 
-	{
-		ks[0] = k.v[0];
-		X.v[0] = in.v[0];
-		ks[4] ^= k.v[0];
-
-		ks[1] = k.v[1];
-		X.v[1] = in.v[1];
-		ks[4] ^= k.v[1];
-
-		ks[2] = k.v[2];
-		X.v[2] = in.v[2];
-		ks[4] ^= k.v[2];
-
-		ks[3] = k.v[3];
-		X.v[3] = in.v[3];
-		ks[4] ^= k.v[3];
-	}
-	X.v[0] += ks[0];
-	X.v[1] += ks[1];
-	X.v[2] += ks[2];
-	X.v[3] += ks[3];
-	if (Nrounds > 0) 
-	{
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 1) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 2) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 3) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 3) {
-		X.v[0] += ks[1];
-		X.v[1] += ks[2];
-		X.v[2] += ks[3];
-		X.v[3] += ks[4];
-		X.v[4 - 1] += 1;
-	} if (Nrounds > 4) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 5) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 6) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 7) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 7) {
-		X.v[0] += ks[2];
-		X.v[1] += ks[3];
-		X.v[2] += ks[4];
-		X.v[3] += ks[0];
-		X.v[4 - 1] += 2;
-	} if (Nrounds > 8) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 9) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 10) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 11) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 11) {
-		X.v[0] += ks[3];
-		X.v[1] += ks[4];
-		X.v[2] += ks[0];
-		X.v[3] += ks[1];
-		X.v[4 - 1] += 3;
-	} if (Nrounds > 12) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 13) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 14) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 15) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 15) {
-		X.v[0] += ks[4];
-		X.v[1] += ks[0];
-		X.v[2] += ks[1];
-		X.v[3] += ks[2];
-		X.v[4 - 1] += 4;
-	} if (Nrounds > 16) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 17) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 18) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 19) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 19) {
-		X.v[0] += ks[0];
-		X.v[1] += ks[1];
-		X.v[2] += ks[2];
-		X.v[3] += ks[3];
-		X.v[4 - 1] += 5;
-	} if (Nrounds > 20) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 21) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 22) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 23) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 23) {
-		X.v[0] += ks[1];
-		X.v[1] += ks[2];
-		X.v[2] += ks[3];
-		X.v[3] += ks[4];
-		X.v[4 - 1] += 6;
-	} if (Nrounds > 24) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 25) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 26) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 27) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 27) {
-		X.v[0] += ks[2];
-		X.v[1] += ks[3];
-		X.v[2] += ks[4];
-		X.v[3] += ks[0];
-		X.v[4 - 1] += 7;
-	} if (Nrounds > 28) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 29) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 30) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 31) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 31) {
-		X.v[0] += ks[3];
-		X.v[1] += ks[4];
-		X.v[2] += ks[0];
-		X.v[3] += ks[1];
-		X.v[4 - 1] += 8;
-	} if (Nrounds > 32) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 33) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 34) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 35) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 35) {
-		X.v[0] += ks[4];
-		X.v[1] += ks[0];
-		X.v[2] += ks[1];
-		X.v[3] += ks[2];
-		X.v[4 - 1] += 9;
-	} if (Nrounds > 36) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 37) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 38) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 39) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 39) {
-		X.v[0] += ks[0];
-		X.v[1] += ks[1];
-		X.v[2] += ks[2];
-		X.v[3] += ks[3];
-		X.v[4 - 1] += 10;
-	} if (Nrounds > 40) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 41) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 42) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 43) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 43) {
-		X.v[0] += ks[1];
-		X.v[1] += ks[2];
-		X.v[2] += ks[3];
-		X.v[3] += ks[4];
-		X.v[4 - 1] += 11;
-	} if (Nrounds > 44) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 45) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 46) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 47) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 47) {
-		X.v[0] += ks[2];
-		X.v[1] += ks[3];
-		X.v[2] += ks[4];
-		X.v[3] += ks[0];
-		X.v[4 - 1] += 12;
-	} if (Nrounds > 48) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 49) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 50) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 51) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 51) {
-		X.v[0] += ks[3];
-		X.v[1] += ks[4];
-		X.v[2] += ks[0];
-		X.v[3] += ks[1];
-		X.v[4 - 1] += 13;
-	} if (Nrounds > 52) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 53) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 54) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 55) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 55) {
-		X.v[0] += ks[4];
-		X.v[1] += ks[0];
-		X.v[2] += ks[1];
-		X.v[3] += ks[2];
-		X.v[4 - 1] += 14;
-	} if (Nrounds > 56) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 57) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 58) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 59) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 59) {
-		X.v[0] += ks[0];
-		X.v[1] += ks[1];
-		X.v[2] += ks[2];
-		X.v[3] += ks[3];
-		X.v[4 - 1] += 15;
-	} if (Nrounds > 60) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 61) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 62) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 63) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 63) {
-		X.v[0] += ks[1];
-		X.v[1] += ks[2];
-		X.v[2] += ks[3];
-		X.v[3] += ks[4];
-		X.v[4 - 1] += 16;
-	} if (Nrounds > 64) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 65) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 66) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 67) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 67) {
-		X.v[0] += ks[2];
-		X.v[1] += ks[3];
-		X.v[2] += ks[4];
-		X.v[3] += ks[0];
-		X.v[4 - 1] += 17;
-	} if (Nrounds > 68) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 69) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 70) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 71) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 71) {
-		X.v[0] += ks[3];
-		X.v[1] += ks[4];
-		X.v[2] += ks[0];
-		X.v[3] += ks[1];
-		X.v[4 - 1] += 18;
-	} 
-	return X;
-} 
-
-template <class T>
-__kernel void PRNG_threefry4x32(
-        __global uint4 *randomnumber,
-        threefry4x32_ctr_t ctr_i,
-        T inf,
-        T sup,
-        T threshold,
-        uint nrounds,
-        uint numrandom
-){
-        size_t  gdx = get_global_id(0);
-
-        uint maxUint = 0;
-        maxUint--;
-        float r = (float)maxUint;
-
-        threefry4x32_ctr_t      ctr = ctr_i; 
-        threefry4x32_ukey_t ukey;
-
-        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
-
-        threefry4x32_ctr_t  random4;
-
-        if ( gdx < numrandom )
-        {
-                random4 = threefry4x32_R(nrounds, ctr, ukey);
-                uint4 frnd;
-				
-                frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-				
-                randomnumber[gdx] = frnd;
-        }
-}
-
-
-template __attribute__((mangled_name(RNGBernoulliFloat))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
-
-template __attribute__((mangled_name(RNGBernoulliDouble))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
-
-//end of the looooooong gpu_random_generator kernel 
-
-
-template <class T>
-__kernel void OCL_memset(__global T* buffer, const T value, const int size){
-	int gdx = get_global_id(0);
-	if(gdx < size){
-		buffer[gdx] = value;	
-	}
-}
-
-template __attribute__((mangled_name(oclmemfloat))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
-template __attribute__((mangled_name(oclmemdouble))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
-
-__kernel void OCL_memset2(__global int* buffer, const int value, const int size){
-        int gdx = get_global_id(0);
-        if(gdx < size){
-                buffer[gdx] = value;    
-        }
-}
-
-template <class T>
-__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
-     int gdx = get_global_id(0);
-     if(gdx < N){
-          Y[gdx] =((0.0<X[gdx])-(X[gdx]<0.0));
-     }
-}
-
-template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
-template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y);
-
-template <class T>
-__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){
-    int index=get_global_id(0);
-    data_im = data_im + img_offset;
-    data_col =  data_col + col_offset;
-    if(index < n){
-        int w_out=index %width_col;
-        index /= width_col;
-        int h_out=index%height_col;
-        int channel_in = index/height_col;
-        int channel_out=channel_in *ksize *ksize;
-        int h_in = h_out *stride-pad;
-        int w_in = w_out *stride-pad;
-        data_col +=(channel_out *height_col + h_out) *width_col + w_out;
-        data_im +=(channel_in * height + h_in) *width + w_in;
-        int i=0,j=0;
-        for(i=0;i<ksize;++i){
-            for(j=0;j<ksize;++j){
-                int h = h_in+i;
-                int w = w_in+j;
-                if(h >= 0 && w >= 0 && h < height && w < width)
-                    *data_col=data_im[i * width + j];
-                else *data_col=0;
-                data_col +=height_col *width_col;
-            }
-        }
-    }
-}
-
-template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); 
-template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); 
-
-template <class T>
-__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum){
-
-    int index = get_global_id(0);
-
-    data_im = data_im + img_offset;
-    data_col = data_col + col_offset;
-
-    int x_out = index % width_col;
-    int y_out = (index / width_col) % height_col;
-    int channel_in = (index / width_col / height_col) % channels;
-    int channel_out = channel_in * ksize * ksize;
-    int im_id = index / width_col / height_col / channels;
-
-    int y_in = y_out * stride - pad;
-    int x_in = x_out * stride - pad;
-    int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col;
-    int offset_im = im_id * channels * height * width + channel_in * height * width;
-
-    for(int k_h = 0; k_h < ksize; k_h++){
-        for(int k_w = 0; k_w < ksize; k_w++){
-            int x_im = x_in + k_w;
-            int y_im = y_in + k_h;
-            int index_im = y_im * width + x_im;
-            int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
-            if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
-                data_col[offset_col + index_col] = data_im[offset_im + index_im];
-            else
-                data_col[offset_col + index_col] = 0;
-        }
-    }
-}
-
-template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); 
-template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); 
-
-
-template <class T>
-__kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    const int height_col, const int width_col,
-    __global T* data_col, const int col_offset) {
-    data_im = data_im + img_offset;
-    data_col = data_col + col_offset;     
-
-    int index = get_global_id(0);
-    if(index < n) {
-        int w_out = index % width_col;
-        int h_index = index / width_col;
-        int h_out = h_index % height_col;
-        int channel_in = h_index / height_col;
-        int channel_out = channel_in * kernel_h * kernel_w;
-        int h_in = h_out * stride_h - pad_h;
-        int w_in = w_out * stride_w - pad_w;
-        __global T* data_col_ptr = data_col;
-        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
-        __global const T* data_im_ptr = data_im;
-        data_im_ptr += (channel_in * height + h_in) * width + w_in;
-        for (int i = 0; i < kernel_h; ++i) {
-            for (int j = 0; j < kernel_w; ++j) {
-                int h = h_in + i;
-                int w = w_in + j;
-                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
-                            data_im_ptr[i * width + j] : 0;
-                data_col_ptr += height_col * width_col;
-        }
-    }
-  }
-}
-
-template __attribute__((mangled_name(im2col_gpu_float_kernel))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
-           const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
-           const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-           const int height_col, const int width_col, __global float* data_col, const int col_offset);
-template __attribute__((mangled_name(im2col_gpu_double_kernel)))  void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
-           const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
-           const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-           const int height_col, const int width_col, __global double* data_col, const int col_offset);
-
-template <class T>
-__kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset,
-    const int height, const int width, const int channels,
-    const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    const int height_col, const int width_col,
-    __global T* data_im, const int img_offset) {
-    data_col = data_col + col_offset;
-    data_im = data_im + img_offset;
-   int index = get_global_id(0);
-    if(index < n) {
-        T val = 0;
-        int w = index % width + pad_w;
-        int h = (index / width) % height + pad_h;
-        int c = index / (width * height);
-        // compute the start and end of the output
-        int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-        int w_col_end = min(w / stride_w + 1, width_col);
-        int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-        int h_col_end = min(h / stride_h + 1, height_col);
-        // equivalent implementation
-        int offset =
-            (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
-        int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
-        int coeff_w_col = (1 - stride_w * height_col * width_col);
-        for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-            for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-            }
-        }
-        data_im[index] = val;
-  }
-}
-
-template __attribute__((mangled_name(col2im_gpu_float_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
-    									const int height, const int width, const int channels,
-    									const int patch_h, const int patch_w,const int pad_h, const int pad_w,
-    									const int stride_h, const int stride_w,const int height_col, const int width_col,
-    									__global float* data_im, const int img_offset);
-template __attribute__((mangled_name(col2im_gpu_double_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
-                                         const int col_offset, const int height, const int width, const int channels,
-                                         const int patch_h, const int patch_w, const int pad_h, const int pad_w,
-                                         const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
-
-template <class T>
-__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){
-    int index = get_global_id(0);
-    data_col = data_col + col_offset;
-    data_im = data_im + img_offset;
-    if(index < n){
-      T val = 0;
-      int w = index % width + pad;
-      int h = (index / width) % height + pad;
-      int c = index / (width * height);
-      // compute the start and end of the output
-      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-      int w_col_end = min(w / stride + 1, width_col);
-      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-      int h_col_end = min(h / stride + 1, height_col);
-      // equivalent implementation
-      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
-      int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
-      int coeff_w_col = (1 - stride * height_col * width_col);
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-        }
-      }
-      data_im[index] = val;
-  }
-}
-template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); 
-template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); 
-
-template <class T>
-__kernel void im2col_yuan(const int n,__global T* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index;index<n;index+=tmp){
-        int w_out=index %width_col;
-        index /= width_col;
-        int h_out=index%height_col;
-        int channel_in = index/height_col;
-        int channel_out=channel_in *ksize *ksize;
-        int h_in = h_out *stride-pad;
-        int w_in = w_out *stride-pad;
-        data_col +=(channel_out *height_col + h_out) *width_col + w_out;
-        data_im +=(channel_in * height + h_in) *width + w_in;
-        int i=0,j=0;
-        for(i=0;i<ksize;++i){
-            for(j=0;j<ksize;++j){
-                int h = h_in+i;
-                int w = w_in+j;
-                if(h >= 0 && w >= 0 && h < height && w < width)
-                    *data_col=data_im[i * width + j];
-                else *data_col=0;
-                data_col += height_col *width_col;
-            }
-        }
-    }
-}
-
-template __attribute__((mangled_name(im2colfloat_yuan))) __kernel void im2col_yuan(const int n,__global float* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col); 
-template __attribute__((mangled_name(im2coldouble_yuan))) __kernel void im2col_yuan(const int n,__global double* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col); 
-
-template <class T>
-__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){
-    int index = get_global_id(0);
-    data_col = data_col + col_offset;
-    data_im = data_im + img_offset;
-    if(index < n){
-      T val = 0;
-      int w = index % width + pad;
-      int h = (index / width) % height + pad;
-      int c = index / (width * height) % channels;
-      int im = index / width / height / channels;
-      // compute the start and end of the output
-      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-      int w_col_end = min(w / stride + 1, width_col);
-      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-      int h_col_end = min(h / stride + 1, height_col);
-      // equivalent implementation
-      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
-      int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
-      int coeff_w_col = (1 - stride * height_col * width_col * optnum);
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-        }
-      }
-      data_im[index] = val;
-  }
-}
-template __attribute__((mangled_name(col2im_optfloat))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); 
-template __attribute__((mangled_name(col2im_optdouble))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); 
-
-
-template <class T>
-__kernel void col2im_yuan(const int n,__global T* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index; index < n; index += tmp){
-      T val = 0;
-      int w = index % width + pad;
-      int h = (index / width) % height + pad;
-      int c = index / (width * height);
-      // compute the start and end of the output
-      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-      int w_col_end = min(w / stride + 1, width_col);
-      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-      int h_col_end = min(h / stride + 1, height_col);
-      // equivalent implementation
-      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
-      int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
-      int coeff_w_col = (1 - stride * height_col * width_col);
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-        }
-      }
-      data_im[index] = val;
-  }
-}
-template __attribute__((mangled_name(col2imfloat_yuan))) __kernel void col2im_yuan(const int n,__global float* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im); 
-template __attribute__((mangled_name(col2imdouble_yuan))) __kernel void col2im_yuan(const int n,__global double* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im); 
-
-template <class T>
-__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){
-
-    int index = get_global_id(0);
-    data_opt = data_opt + opt_offset;
-    data_im = data_im + im_offset;
-    if(index < n){
-      int w = index % width;
-      int h = (index / width) % height;
-      int c = index / (width * height) % channels;
-      int im = index / width / height / channels;
-
-      int opt_index = c * height * optnum * width + h * optnum * width + im * width + w;
-      data_opt[opt_index] = data_im[index];
-    }
-}
-template __attribute__((mangled_name(opttransfloat))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); 
-template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); 
-
-
-template <class T>
-__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){
-     int index = get_global_id(0);
-     int tmp = get_global_size(0);
-     for(index; index < nthreads; index += tmp){
-         int pw = index % pooled_width;
-         int ph = (index / pooled_width) % pooled_height;
-         int c = (index / pooled_width / pooled_height) % channels;
-         int n = index / pooled_width / pooled_height / channels;
-         int hstart = ph * stride_h - pad_h;
-         int wstart = pw * stride_w - pad_w;
-         const int hend = min(hstart + kernel_h, height);
-         const int wend = min(wstart + kernel_w, width);
-         hstart = max(hstart, 0);
-         wstart = max(wstart, 0);
-        T maxval = -FLT_MAX;
-        int maxidx = -1;
-        bottom_data =
-        bottom_data + (n * channels + c) * height * width;
-        for (int h = hstart; h < hend; ++h) {
-          for (int w = wstart; w < wend; ++w) {
-           if (bottom_data[h * width + w] > maxval) {
-             maxidx = h * width + w;
-             maxval = bottom_data[maxidx];
-        }
-      }
-    }
-    top_data[index] = maxval;
-    if (mask) {
-      mask[index] = maxidx;
-    } else {
-      top_mask[index] = maxidx;
-    }
-  }
-}
-template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
-template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,  const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
-
-template <class T>
-__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index; index < nthreads; index+=tmp){
-        int pw = index % pooled_width;
-        int ph = (index / pooled_width) % pooled_height;
-        int c = (index / pooled_width / pooled_height) % channels;
-        int n = index / pooled_width / pooled_height / channels;            int hstart = ph * stride_h - pad_h;            int wstart = pw * stride_w - pad_w;
-            int hend = min(hstart + kernel_h, height + pad_h);
-            int wend = min(wstart + kernel_w, width + pad_w);
-            const int pool_size = (hend - hstart) * (wend - wstart);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            hend = min(hend, height);
-            wend = min(wend, width);
-            T aveval = 0;
-            bottom_data =
-                bottom_data + (n * channels + c) * height * width;
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                aveval += bottom_data[h * width + w];
-              }
-            }
-            top_data[index] = aveval / pool_size;
-          }
-
-}
-template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
-template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data);
-
-template <class T>
-__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index; index < nthreads; index+=tmp){
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
-    T cumsum = 0.;
-    bottom_data = bottom_data + (n * channels + c) * height * width;
-    // First pass: get sum
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_data[h * width + w];
-      }
-    }
-    const float thres = rand_idx[index] * cumsum;
-    // Second pass: get value, and set index.
-    cumsum = 0;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_data[h * width + w];
-        if (cumsum >= thres) {
-          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
-          top_data[index] = bottom_data[h * width + w];
-          return;
-        }
-      }
-    }
-    }
-}
-template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
-template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
-
-template <class T>
-__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index; index < count; index+=tmp){
-    const int pw = index % pooled_width; 
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
-    // We set cumsum to be 0 to avoid divide-by-zero problems    T cumsum = FLT_MIN;
-    T cumsum = FLT_MIN;
-    T cumvalues = 0.;
-    bottom_data =        bottom_data + (n * channels + c) * height * width;
-    // First pass: get sum
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_data[h * width + w];
-        cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w];
-      }
-    }
-    top_data[index] = cumvalues / cumsum;  }
-}
-template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
-template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
-
-template <class T>
-__kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
-    __global int* mask, __global T* top_mask, const int num,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w, __global T* const bottom_diff) {
-     int index = get_global_id(0);
-     int total = get_global_size(0);
-     for(index; index < nthreads; index += total){
-    // find out the local index
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int c = (index / width / height) % channels;
-    const int n = index / width / height / channels;
-    const int phstart =
-         (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
-    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
-    const int pwstart =
-         (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
-    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
-    T gradient = 0;
-    const int offset = (n * channels + c) * pooled_height * pooled_width;
-    top_diff += offset;
-    if (mask) {
-      mask = mask + offset;
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (mask[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff[ph * pooled_width + pw];
-          }
-        }
-      }
-    } else {
-      top_mask = top_mask + offset;
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (top_mask[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff[ph * pooled_width + pw];
-          }
-        }
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
-template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
-
-template <class T>
-__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){
-     int index = get_global_id(0);
-     int total = get_global_size(0);
-     for(index; index < nthreads; index += total){
-            int w = index % width + pad_w;
-            int h = (index / width) % height + pad_h;
-            int c = (index / width / height) % channels;
-            int n = index / width / height / channels;
-            const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-            const int phend = min(h / stride_h + 1, pooled_height);
-            const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-            const int pwend = min(w / stride_w + 1, pooled_width);
-            T gradient = 0;
-            top_diff += (n * channels + c) * pooled_height * pooled_width;
-            for (int ph = phstart; ph < phend; ++ph) {
-              for (int pw = pwstart; pw < pwend; ++pw) {
-                // figure out the pooling size
-                int hstart = ph * stride_h - pad_h;
-                int wstart = pw * stride_w - pad_w;
-                int hend = min(hstart + kernel_h, height + pad_h);
-                int wend = min(wstart + kernel_w, width + pad_w);
-                int pool_size = (hend - hstart) * (wend - wstart);
-                gradient += top_diff[ph * pooled_width + pw] / pool_size;
-              }
-    }
-    bottom_diff[index] = gradient;
-   }
-}
-
-template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
-template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
-
-template <class Dtype>
-void StoPoolBackward(const int nthreads,
-    __global Dtype* rand_idx, __global Dtype* top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global Dtype* bottom_diff) {
-      int index = get_global_id(0);
-      int total = get_global_size(0);
-      for(index; index < nthreads; index += total){
-            // find out the local index
-            // find out the local offset
-            const int w = index % width;
-            const int h = (index / width) % height;
-            const int c = (index / width / height) % channels;
-            const int n = index / width / height / channels;
-            const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-            const int phend = min(h / stride_h + 1, pooled_height);
-            const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-            const int pwend = min(w / stride_w + 1, pooled_width);
-            Dtype gradient = 0;
-            rand_idx =
-                rand_idx + (n * channels + c) * pooled_height * pooled_width;
-            top_diff =
-                top_diff + (n * channels + c) * pooled_height * pooled_width;
-            for (int ph = phstart; ph < phend; ++ph) {
-              for (int pw = pwstart; pw < pwend; ++pw) {
-                gradient += top_diff[ph * pooled_width + pw] *
-                    (index == static_cast<int>(rand_idx[ph * pooled_width + pw]));
-              }
-            }
-            bottom_diff[index] = gradient;
-
-	  }
-}
-template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel  void StoPoolBackward<float>(const int nthreads,
-    __global float* rand_idx, __global float* top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global float* bottom_diff);
-template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward<double>(const int nthreads,
-    __global double* rand_idx, __global double* top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global double* bottom_diff);
-
-template <class T>
-__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
-	int index = get_global_id(0);
-	if(index < count)
-		out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
-}
-
-//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope);
-template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
-template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
-
-template <class T>
-__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
-	int index = get_global_id(0);
-        if(index < count)
-		out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope;
-}
-
-template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
-template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
-
-template <class T>
-__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
-     int index = get_global_id(0);
-     if (index < num) {
-	T maxval = -FLT_MAX;
-        for (int i = 0; i <  dim; i++)
-	maxval = max( data[index*dim + i], maxval );
-        out[index] = maxval;
-      }
-}
-
-template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out);
-template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out);
-
-template <class T>
-__kernel void exp (const int num, __global T* data, __global T* out){
-        int index = get_global_id(0);
-        if (index < num) 
-        out[index] = exp(data[index]);
-}
-
-template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out);
-template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out);
-
-template <class T>
-__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data){
-        //printf("softmax_div\n");
-        int index = get_global_id(0);
-        int total = get_global_size(0);
-        for(index; index < num*dim; index +=  total){
-        int n = index / dim;
-        data[index] /= scale[n];
-        }
-}
-
-template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data);
-template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* scale, __global double* data);
-
-template <class T>
-__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){
-    
-    int gid = get_global_id(0);
-    int size = get_global_size(0);
-    
-    resultScratch[gid] = 0.0;
-    for(int i = gid; i < num; i += size){
-    	resultScratch[gid] += -log(prob_data[i * dim + static_cast<int>(label[i])]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    
-    if(gid < 128)
-    	resultScratch[gid] += resultScratch[gid + 128];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(gid < 64)
-    	resultScratch[gid] += resultScratch[gid + 64];
-    if(gid < 32)
-    	resultScratch[gid] += resultScratch[gid + 32];
-    if(gid < 16)
-    	resultScratch[gid] += resultScratch[gid + 16];
-    if(gid < 8)
-    	resultScratch[gid] += resultScratch[gid + 8];
-    if(gid < 4)
-    	resultScratch[gid] += resultScratch[gid + 4];
-    if(gid < 2)
-    	resultScratch[gid] += resultScratch[gid + 2];
-    if(gid < 1){
-    	resultScratch[gid] += resultScratch[gid + 1];
-    	loss[0] = resultScratch[gid];
-    }
-
-}
-
-template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
-template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
-
-template <class T>
-__kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const T* data, __global T* out) {
-    int index = get_global_id(0);
-    if(index < num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    T maxval = -FLT_MAX;
-    for (int c = 0; c < channels; ++c) {
-      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
-    }
-    out[index] = maxval;
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const float* data, __global float* out);
-template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const double* data, __global  double* out);
-
-template <class T>
-__kernel void kernel_channel_subtract(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const T* channel_max, __global T* data) {
-    int index = get_global_id(0);
-    if(index < count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] -= channel_max[n * spatial_dim + s];
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data);
-template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data);
-
-template <class T>
-__kernel void kernel_exp(const int count, __global const T* data, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-    out[index] = exp(data[index]);
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out);
-template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out);
-
-template <class T>
-__kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const T* data, __global T* channel_sum) {
-  int index = get_global_id(0);
-   if(index < num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    T sum = 0;
-    for (int c = 0; c < channels; ++c) {
-      sum += data[(n * channels + c) * spatial_dim + s];
-    }
-    channel_sum[index] = sum;
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const float* data, __global float* channel_sum);
-template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const double* data, __global double* channel_sum);
-
-template <class T>
-__kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const T* channel_sum, __global T* data) {
-    int index = get_global_id(0);
-   if(index < count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] /= channel_sum[n * spatial_dim + s];
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const float* channel_sum, __global float* data);
-template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const double* channel_sum, __global double* data);
-
-template <class T>
-__kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const T* data_1, __global const T* data_2,
-    __global T* channel_dot) {
-    int index = get_global_id(0);
-    if(index < num * spatial_dim) {
-        int n = index / spatial_dim;
-        int s = index % spatial_dim;
-        T dot = 0;
-        for (int c = 0; c < channels; ++c) {
-            dot += (data_1[(n * channels + c) * spatial_dim + s]
-                 * data_2[(n * channels + c) * spatial_dim + s]);
-        }
-        channel_dot[index] = dot;
-    }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const float* data_1, __global const float* data_2,
-    __global float* channel_dot);
-template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const double* data_1, __global const double* data_2,
-    __global double* channel_dot);
-
-
-template <class T>
-__kernel void SoftmaxLossForwardGPU(const int nthreads,
-          __global T* prob_data, __global T* label,__global T* loss,
-          int num, int dim, int spatial_dim,
-          bool has_ignore_label_, int ignore_label_,
-          __global T* counts) {
-    int index = get_global_id(0);
-    if(index < nthreads) {
-        const int n = index / spatial_dim;
-        const int s = index % spatial_dim;
-        const int label_value = static_cast<int>(label[n * spatial_dim + s]);
-        if (has_ignore_label_ && label_value == ignore_label_) {
-           loss[index] = 0;
-           counts[index] = 0;
-        } else {
-           loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
-                      T(FLT_MIN)));
-        counts[index] = 1;
-    }
-  }
-}
-
-template __attribute__ ((mangled_name(softmax_loss_fp_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
-          __global float* prob_data, __global float* label,__global float* loss,
-          int num, int dim, int spatial_dim,
-          bool has_ignore_label_, int ignore_label_,
-          __global float* counts);
-template __attribute__ ((mangled_name(softmax_loss_fp_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
-          __global double* prob_data, __global double* label,__global double* loss,
-          int num, int dim, int spatial_dim,
-          bool has_ignore_label_, int ignore_label_,
-          __global double* counts);
-
-template <class T>
-__kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
-          __global T* label,__global T* bottom_diff, int num, int dim,
-          int spatial_dim, bool has_ignore_label_,
-          int ignore_label_, T* counts) {
-    const int channels = dim / spatial_dim;
-   int index  = get_global_id(0);
-   if(index <  nthreads) {
-       const int n = index / spatial_dim;
-       const int s = index % spatial_dim;
-       const int label_value = static_cast<int>(label[n * spatial_dim + s]);
-
-      if (has_ignore_label_ && label_value == ignore_label_) {
-          for (int c = 0; c < channels; ++c) {
-              bottom_diff[n * dim + c * spatial_dim + s] = 0;
-          }
-          counts[index] = 0;
-    } else {
-      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
-      counts[index] = 1;
-    }
-  }
-}
-
-
-template __attribute__ ((mangled_name(softmax_loss_bp_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
-          __global float* label,__global float* bottom_diff, int num, int dim,
-          int spatial_dim, bool has_ignore_label_,
-          int ignore_label_, float* counts);
-
-template __attribute__ ((mangled_name(softmax_loss_bp_double)))  __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
-          __global double* label,__global double* bottom_diff, int num, int dim,
-          int spatial_dim, bool has_ignore_label_,
-          int ignore_label_, double* counts);
-
-
-template <class T>
-__kernel void diff (const int num, const int dim, __global T* data, __global T* label){
-        int index = get_global_id(0);
-        int total = get_global_size(0);
-        int offset;
-	for(index; index < num; index +=  total){
-  	offset = (int) label[index];
-        data[index * dim + offset] -= 1;
-        }
-}
-
-template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label);
-template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label);
-
-template <class T>
-__kernel void scal (const int num, const T alpha, __global T* data){
-        int index = get_global_id(0);
-        int total = get_global_size(0);
-        for(index; index < num; index +=  total){
-        data[index] = data[index] * alpha;
-        }
-}
-
-template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha,  __global float* data);
-template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha,  __global double* data);
-
-template <class T>
-__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){
-	int index = get_global_id(0);
-        if (index < n)
-        y[index] = a[index] / b[index];
-}
-
-template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y);
-//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y);
-
-template <class T>
-__kernel void add_scalar (const int n, const T alpha, __global T* y){
-        int index = get_global_id(0);
-        if (index < n)
-        y[index] += alpha;
-}
-
-template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y);
-template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y);
-
-template <typename Dtype>
-__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
-        int index = get_global_id(0);
-        if (index < n)
-        y[index] = in1[index] + in2[index] ;
-}
-template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y);
-template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y);
-
-template <class T>
-__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){
-        int index = get_global_id(0);
-       if (index < n)
-        y[index] = a[index] * b[index];
-}
-
-template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y);
-template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y);
-
-
-template <class T>
-__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){
-        int index = get_global_id(0);
-        if (index < n)
-//           y[index] = a[index] + alpha;
-           y[index] = pow(a[index], alpha);
-}
-
-template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); 
-template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); 
-
-template <class T>
-__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){
-    int index = get_global_id(0);
-    if (index < n)
-        out[index] = in[index] * scale * mask[index];
-}
-template __attribute__((mangled_name(DropoutForwardfloat))) __kernel void DropoutForward(const int n, __global float* in,  __global const int* mask, const float scale, __global float* out); 
-template __attribute__((mangled_name(DropoutForwarddouble))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out);
-
-
-template <class T>
-__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff){
-    int index = get_global_id(0);
-    if (index < n)
-        out_diff[index] = in_diff[index] * scale * mask[index];
-}
-template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void DropoutBackward(const int n, __global float* in_diff,  __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); 
-template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
-
-template <class T>
-__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k,  __global T* scale) {
-  int index = get_global_id(0);
-  int tmp = get_global_size(0);
-  for(index; index < nthreads; index += tmp) {
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    in = in + offset;
-    scale = scale + offset;
-    int head = 0;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    T accum_scale = 0;
-    // fill the scale at [n, :, h, w]
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_scale += in[head * step] * in[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_scale += in[head * step] * in[head * step];
-      if (head - size >= 0) {
-        accum_scale -= in[(head - size) * step]
-                       * in[(head - size) * step];
-      }
-      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_scale -= in[(head - size) * step]
-                       * in[(head - size) * step];
-      }
-      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-  }
-}
-
-template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k,  __global float* scale);
-template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale);
-
-template <class T>
-__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
-  int index = get_global_id(0);
-  int tmp = get_global_size(0);
-  for(index; index < nthreads; index += tmp) 
-    out[index] = in[index] * pow(scale[index], negative_beta);
-}
-template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out);
-template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out);
-
-template <class T>
-__kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) {
-  int index = get_global_id(0);
-  int tmp = get_global_size(0);
-  for(index; index < nthreads; index += tmp) {
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    bottom_data += offset;
-    top_data += offset;
-    scale += offset;
-    top_diff += offset;
-    bottom_diff += offset;
-    int head = 0;
-    const int pre_pad = size - (size + 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    T accum_ratio = 0;
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_ratio += top_diff[head * step] * top_data[head * step] /
-          scale[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_ratio += top_diff[head * step] * top_data[head * step] /
-          scale[head * step];
-      if (head - size >= 0) {
-        accum_ratio -= top_diff[(head - size) * step] *
-            top_data[(head - size) * step] / scale[(head - size) * step];
-      }
-      bottom_diff[(head - post_pad) * step] =
-          top_diff[(head - post_pad) * step]
-            * pow(scale[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_ratio -= top_diff[(head - size) * step] *
-            top_data[(head - size) * step] / scale[(head - size) * step];
-      }
-      bottom_diff[(head - post_pad) * step] =
-          top_diff[(head - post_pad) * step]
-            * pow(scale[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-}
-}
-
-template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
-template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff);
-
-template <class T>
-__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){
-     int gidx = get_global_id(0);
-     int gidy = get_global_id(1);
-     int gidyy = gidy;
-     int index = gidy / height;
-     int offset = index * width * height;
-     gidy = gidy % height;
-     if( gidx < width && gidyy < height * optnum )
-         dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
-}
-template __attribute__((mangled_name(transposefloat))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); 
-template __attribute__((mangled_name(transposedouble))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
-
-template <class T>
-__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){
-     int gidx = get_global_id(0);
-     int index;
-     index = (optnum==1) ? 0: gidx % optnum;
-     dst = dst + top_offset; // now we point at (*top)[n]
-     int offset = gidx / optnum;
-     int i = 0;
-     for(i = 0 ; i < width; i++)
-         dst[(index * height + offset)* width + i] = src[gidx * width + i];
-}
-template __attribute__((mangled_name(transformfloat))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); 
-template __attribute__((mangled_name(transformdouble))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); 
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 0e98ada0..3beba234 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -125,11 +125,25 @@ cl_int Device::Init(){
         return 0;
     }
 
-    //Read our own kernel file
-    const char *pFileName = "./src/caffe/OCL_kernel.cl";
-    const char *pSource;
     std::string strSource = "";
-    ConvertToString(pFileName, strSource);
+
+    std::string pFileName[8];
+    pFileName[0] = "./src/caffe/ocl/OCL_kernel.cl";
+    pFileName[1] = "./src/caffe/ocl/lrn_layer.cl";
+    pFileName[2] = "./src/caffe/ocl/pooling_layer.cl";
+    pFileName[3] = "./src/caffe/ocl/dropout_layer.cl";
+    pFileName[4] = "./src/caffe/ocl/relu_layer.cl";
+    pFileName[5] = "./src/caffe/ocl/softmax_layer.cl";
+    pFileName[6] = "./src/caffe/ocl/softmaxwithloss_layer.cl";
+    pFileName[7] = "./src/caffe/ocl/im2col.cl";
+
+    for(int fileNum = 0; fileNum < 8; fileNum++) {
+      std::string tmpSource = "";
+      ConvertToString(pFileName[fileNum], tmpSource);
+      strSource += tmpSource;
+    }
+
+    const char *pSource;
     pSource = strSource.c_str();
     size_t uiArrSourceSize[] = {0};
     uiArrSourceSize[0] = strlen(pSource);
@@ -206,11 +220,12 @@ cl_int Device::Init(){
 
 
 //Use to read OpenCL source code
-cl_int Device::ConvertToString(const char *pFileName,std::string &Str){
+cl_int Device::ConvertToString(std::string pFileName,std::string &Str){
     size_t uiSize=0;
     size_t uiFileSize=0;
     char *pStr=NULL;
-    std::fstream fFile(pFileName,(std::fstream::in|std::fstream::binary));
+    char *tmp = (char*)pFileName.data();
+    std::fstream fFile(tmp,(std::fstream::in|std::fstream::binary));
     if(fFile.is_open()){
         fFile.seekg(0,std::fstream::end);
         uiSize=uiFileSize=(size_t)fFile.tellg();
@@ -232,7 +247,7 @@ cl_int Device::ConvertToString(const char *pFileName,std::string &Str){
     return -1;
 }
 
-cl_program Device::BuildProgram(const char *pFileName)
+cl_program Device::BuildProgram(std::string pFileName)
 {
       //Read our own kernel file
     const char *pSource;
diff --git a/src/caffe/ocl/OCL_kernel.cl b/src/caffe/ocl/OCL_kernel.cl
new file mode 100644
index 00000000..7014721b
--- /dev/null
+++ b/src/caffe/ocl/OCL_kernel.cl
@@ -0,0 +1,999 @@
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+//beginning of the looooooong gpu_random_generator kernel 
+//we use the open sourced threefry's GPU implementation
+typedef uint uint32_t;
+
+struct r123array4x32 {	uint32_t v[4]; };
+
+enum r123_enum_threefry32x4 
+{
+	R_32x4_0_0 = 10, R_32x4_0_1 = 26,
+	R_32x4_1_0 = 11, R_32x4_1_1 = 21,
+	R_32x4_2_0 = 13, R_32x4_2_1 = 27,
+	R_32x4_3_0 = 23, R_32x4_3_1 =  5,
+	R_32x4_4_0 =  6, R_32x4_4_1 = 20,
+	R_32x4_5_0 = 17, R_32x4_5_1 = 11,
+	R_32x4_6_0 = 25, R_32x4_6_1 = 10,
+	R_32x4_7_0 = 18, R_32x4_7_1 = 20
+};
+
+inline uint32_t	RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline));
+inline uint32_t RotL_32(uint32_t x, unsigned int N)
+{
+	return (x << (N & 31)) | (x >> ((32 - N) & 31));
+}
+
+typedef struct r123array4x32 threefry4x32_ctr_t;
+typedef struct r123array4x32 threefry4x32_key_t;
+typedef struct r123array4x32 threefry4x32_ukey_t;
+
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline));
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)
+{
+	threefry4x32_ctr_t	X;
+	uint32_t			ks[4 + 1];
+	int					i;
+	ks[4] = 0x1BD11BDA;
+	/*
+	for (i = 0; i < 4; i++)
+	{
+		ks[i] = k.v[i];
+		X.v[i] = in.v[i];
+		ks[4] ^= k.v[i];
+	}*/ 
+	{
+		ks[0] = k.v[0];
+		X.v[0] = in.v[0];
+		ks[4] ^= k.v[0];
+
+		ks[1] = k.v[1];
+		X.v[1] = in.v[1];
+		ks[4] ^= k.v[1];
+
+		ks[2] = k.v[2];
+		X.v[2] = in.v[2];
+		ks[4] ^= k.v[2];
+
+		ks[3] = k.v[3];
+		X.v[3] = in.v[3];
+		ks[4] ^= k.v[3];
+	}
+	X.v[0] += ks[0];
+	X.v[1] += ks[1];
+	X.v[2] += ks[2];
+	X.v[3] += ks[3];
+	if (Nrounds > 0) 
+	{
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 1) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 2) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 3) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 3) {
+		X.v[0] += ks[1];
+		X.v[1] += ks[2];
+		X.v[2] += ks[3];
+		X.v[3] += ks[4];
+		X.v[4 - 1] += 1;
+	} if (Nrounds > 4) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 5) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 6) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 7) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 7) {
+		X.v[0] += ks[2];
+		X.v[1] += ks[3];
+		X.v[2] += ks[4];
+		X.v[3] += ks[0];
+		X.v[4 - 1] += 2;
+	} if (Nrounds > 8) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 9) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 10) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 11) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 11) {
+		X.v[0] += ks[3];
+		X.v[1] += ks[4];
+		X.v[2] += ks[0];
+		X.v[3] += ks[1];
+		X.v[4 - 1] += 3;
+	} if (Nrounds > 12) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 13) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 14) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 15) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 15) {
+		X.v[0] += ks[4];
+		X.v[1] += ks[0];
+		X.v[2] += ks[1];
+		X.v[3] += ks[2];
+		X.v[4 - 1] += 4;
+	} if (Nrounds > 16) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 17) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 18) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 19) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 19) {
+		X.v[0] += ks[0];
+		X.v[1] += ks[1];
+		X.v[2] += ks[2];
+		X.v[3] += ks[3];
+		X.v[4 - 1] += 5;
+	} if (Nrounds > 20) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 21) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 22) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 23) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 23) {
+		X.v[0] += ks[1];
+		X.v[1] += ks[2];
+		X.v[2] += ks[3];
+		X.v[3] += ks[4];
+		X.v[4 - 1] += 6;
+	} if (Nrounds > 24) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 25) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 26) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 27) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 27) {
+		X.v[0] += ks[2];
+		X.v[1] += ks[3];
+		X.v[2] += ks[4];
+		X.v[3] += ks[0];
+		X.v[4 - 1] += 7;
+	} if (Nrounds > 28) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 29) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 30) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 31) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 31) {
+		X.v[0] += ks[3];
+		X.v[1] += ks[4];
+		X.v[2] += ks[0];
+		X.v[3] += ks[1];
+		X.v[4 - 1] += 8;
+	} if (Nrounds > 32) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 33) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 34) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 35) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 35) {
+		X.v[0] += ks[4];
+		X.v[1] += ks[0];
+		X.v[2] += ks[1];
+		X.v[3] += ks[2];
+		X.v[4 - 1] += 9;
+	} if (Nrounds > 36) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 37) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 38) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 39) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 39) {
+		X.v[0] += ks[0];
+		X.v[1] += ks[1];
+		X.v[2] += ks[2];
+		X.v[3] += ks[3];
+		X.v[4 - 1] += 10;
+	} if (Nrounds > 40) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 41) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 42) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 43) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 43) {
+		X.v[0] += ks[1];
+		X.v[1] += ks[2];
+		X.v[2] += ks[3];
+		X.v[3] += ks[4];
+		X.v[4 - 1] += 11;
+	} if (Nrounds > 44) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 45) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 46) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 47) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 47) {
+		X.v[0] += ks[2];
+		X.v[1] += ks[3];
+		X.v[2] += ks[4];
+		X.v[3] += ks[0];
+		X.v[4 - 1] += 12;
+	} if (Nrounds > 48) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 49) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 50) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 51) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 51) {
+		X.v[0] += ks[3];
+		X.v[1] += ks[4];
+		X.v[2] += ks[0];
+		X.v[3] += ks[1];
+		X.v[4 - 1] += 13;
+	} if (Nrounds > 52) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 53) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 54) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 55) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 55) {
+		X.v[0] += ks[4];
+		X.v[1] += ks[0];
+		X.v[2] += ks[1];
+		X.v[3] += ks[2];
+		X.v[4 - 1] += 14;
+	} if (Nrounds > 56) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 57) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 58) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 59) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 59) {
+		X.v[0] += ks[0];
+		X.v[1] += ks[1];
+		X.v[2] += ks[2];
+		X.v[3] += ks[3];
+		X.v[4 - 1] += 15;
+	} if (Nrounds > 60) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 61) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 62) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 63) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 63) {
+		X.v[0] += ks[1];
+		X.v[1] += ks[2];
+		X.v[2] += ks[3];
+		X.v[3] += ks[4];
+		X.v[4 - 1] += 16;
+	} if (Nrounds > 64) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 65) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 66) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 67) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 67) {
+		X.v[0] += ks[2];
+		X.v[1] += ks[3];
+		X.v[2] += ks[4];
+		X.v[3] += ks[0];
+		X.v[4 - 1] += 17;
+	} if (Nrounds > 68) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 69) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 70) {
+		X.v[0] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+		X.v[1] ^= X.v[0];
+		X.v[2] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+		X.v[3] ^= X.v[2];
+	} if (Nrounds > 71) {
+		X.v[0] += X.v[3];
+		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+		X.v[3] ^= X.v[0];
+		X.v[2] += X.v[1];
+		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+		X.v[1] ^= X.v[2];
+	} if (Nrounds > 71) {
+		X.v[0] += ks[3];
+		X.v[1] += ks[4];
+		X.v[2] += ks[0];
+		X.v[3] += ks[1];
+		X.v[4 - 1] += 18;
+	} 
+	return X;
+} 
+
+template <class T>
+__kernel void PRNG_threefry4x32(
+        __global uint4 *randomnumber,
+        threefry4x32_ctr_t ctr_i,
+        T inf,
+        T sup,
+        T threshold,
+        uint nrounds,
+        uint numrandom
+){
+        size_t  gdx = get_global_id(0);
+
+        uint maxUint = 0;
+        maxUint--;
+        float r = (float)maxUint;
+
+        threefry4x32_ctr_t      ctr = ctr_i; 
+        threefry4x32_ukey_t ukey;
+
+        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+        threefry4x32_ctr_t  random4;
+
+        if ( gdx < numrandom )
+        {
+                random4 = threefry4x32_R(nrounds, ctr, ukey);
+                uint4 frnd;
+				
+                frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+				
+                randomnumber[gdx] = frnd;
+        }
+}
+
+
+template __attribute__((mangled_name(RNGBernoulliFloat))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
+
+template __attribute__((mangled_name(RNGBernoulliDouble))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
+
+//end of the looooooong gpu_random_generator kernel 
+
+
+template <class T>
+__kernel void OCL_memset(__global T* buffer, const T value, const int size){
+	int gdx = get_global_id(0);
+	if(gdx < size){
+		buffer[gdx] = value;	
+	}
+}
+
+template __attribute__((mangled_name(oclmemfloat))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
+template __attribute__((mangled_name(oclmemdouble))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
+
+__kernel void OCL_memset2(__global int* buffer, const int value, const int size){
+        int gdx = get_global_id(0);
+        if(gdx < size){
+                buffer[gdx] = value;    
+        }
+}
+
+template <class T>
+__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
+     int gdx = get_global_id(0);
+     if(gdx < N){
+          Y[gdx] =((0.0<X[gdx])-(X[gdx]<0.0));
+     }
+}
+
+template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
+template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y);
+
+
+template <class T>
+__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
+     int index = get_global_id(0);
+     if (index < num) {
+	T maxval = -FLT_MAX;
+        for (int i = 0; i <  dim; i++)
+	maxval = max( data[index*dim + i], maxval );
+        out[index] = maxval;
+      }
+}
+
+template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out);
+template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out);
+
+template <class T>
+__kernel void exp (const int num, __global T* data, __global T* out){
+        int index = get_global_id(0);
+        if (index < num) 
+        out[index] = exp(data[index]);
+}
+
+template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out);
+template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out);
+
+
+
+template <class T>
+__kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const T* data, __global T* out) {
+    int index = get_global_id(0);
+    if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T maxval = -FLT_MAX;
+    for (int c = 0; c < channels; ++c) {
+      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
+    }
+    out[index] = maxval;
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const float* data, __global float* out);
+template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const double* data, __global  double* out);
+
+template <class T>
+__kernel void kernel_channel_subtract(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_max, __global T* data) {
+    int index = get_global_id(0);
+    if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] -= channel_max[n * spatial_dim + s];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data);
+template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data);
+
+template <class T>
+__kernel void kernel_exp(const int count, __global const T* data, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+    out[index] = exp(data[index]);
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out);
+template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out);
+
+template <class T>
+__kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const T* data, __global T* channel_sum) {
+  int index = get_global_id(0);
+   if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T sum = 0;
+    for (int c = 0; c < channels; ++c) {
+      sum += data[(n * channels + c) * spatial_dim + s];
+    }
+    channel_sum[index] = sum;
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const float* data, __global float* channel_sum);
+template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const double* data, __global double* channel_sum);
+
+template <class T>
+__kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_sum, __global T* data) {
+    int index = get_global_id(0);
+   if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] /= channel_sum[n * spatial_dim + s];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const float* channel_sum, __global float* data);
+template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const double* channel_sum, __global double* data);
+
+template <class T>
+__kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const T* data_1, __global const T* data_2,
+    __global T* channel_dot) {
+    int index = get_global_id(0);
+    if(index < num * spatial_dim) {
+        int n = index / spatial_dim;
+        int s = index % spatial_dim;
+        T dot = 0;
+        for (int c = 0; c < channels; ++c) {
+            dot += (data_1[(n * channels + c) * spatial_dim + s]
+                 * data_2[(n * channels + c) * spatial_dim + s]);
+        }
+        channel_dot[index] = dot;
+    }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const float* data_1, __global const float* data_2,
+    __global float* channel_dot);
+template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const double* data_1, __global const double* data_2,
+    __global double* channel_dot);
+
+
+
+template <class T>
+__kernel void diff (const int num, const int dim, __global T* data, __global T* label){
+        int index = get_global_id(0);
+        int total = get_global_size(0);
+        int offset;
+	for(index; index < num; index +=  total){
+  	offset = (int) label[index];
+        data[index * dim + offset] -= 1;
+        }
+}
+
+template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label);
+template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label);
+
+template <class T>
+__kernel void scal (const int num, const T alpha, __global T* data){
+        int index = get_global_id(0);
+        int total = get_global_size(0);
+        for(index; index < num; index +=  total){
+        data[index] = data[index] * alpha;
+        }
+}
+
+template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha,  __global float* data);
+template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha,  __global double* data);
+
+template <class T>
+__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){
+	int index = get_global_id(0);
+        if (index < n)
+        y[index] = a[index] / b[index];
+}
+
+template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y);
+//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y);
+
+template <class T>
+__kernel void add_scalar (const int n, const T alpha, __global T* y){
+        int index = get_global_id(0);
+        if (index < n)
+        y[index] += alpha;
+}
+
+template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y);
+template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y);
+
+template <typename Dtype>
+__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
+        int index = get_global_id(0);
+        if (index < n)
+        y[index] = in1[index] + in2[index] ;
+}
+template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y);
+template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y);
+
+template <class T>
+__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){
+        int index = get_global_id(0);
+       if (index < n)
+        y[index] = a[index] * b[index];
+}
+
+template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y);
+template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y);
+
+
+template <class T>
+__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){
+        int index = get_global_id(0);
+        if (index < n)
+//           y[index] = a[index] + alpha;
+           y[index] = pow(a[index], alpha);
+}
+
+template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); 
+template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); 
+
+
+template <class T>
+__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){
+     int gidx = get_global_id(0);
+     int gidy = get_global_id(1);
+     int gidyy = gidy;
+     int index = gidy / height;
+     int offset = index * width * height;
+     gidy = gidy % height;
+     if( gidx < width && gidyy < height * optnum )
+         dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
+}
+template __attribute__((mangled_name(transposefloat))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); 
+template __attribute__((mangled_name(transposedouble))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
+
+template <class T>
+__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){
+     int gidx = get_global_id(0);
+     int index;
+     index = (optnum==1) ? 0: gidx % optnum;
+     dst = dst + top_offset; // now we point at (*top)[n]
+     int offset = gidx / optnum;
+     int i = 0;
+     for(i = 0 ; i < width; i++)
+         dst[(index * height + offset)* width + i] = src[gidx * width + i];
+}
+template __attribute__((mangled_name(transformfloat))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); 
+template __attribute__((mangled_name(transformdouble))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); 
diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl
new file mode 100644
index 00000000..3b1c479b
--- /dev/null
+++ b/src/caffe/ocl/dropout_layer.cl
@@ -0,0 +1,18 @@
+template <class T>
+__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){
+    int index = get_global_id(0);
+    if (index < n)
+        out[index] = in[index] * scale * mask[index];
+}
+template __attribute__((mangled_name(DropoutForwardfloat))) __kernel void DropoutForward(const int n, __global float* in,  __global const int* mask, const float scale, __global float* out); 
+template __attribute__((mangled_name(DropoutForwarddouble))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out);
+
+
+template <class T>
+__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff){
+    int index = get_global_id(0);
+    if (index < n)
+        out_diff[index] = in_diff[index] * scale * mask[index];
+}
+template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void DropoutBackward(const int n, __global float* in_diff,  __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); 
+template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
new file mode 100644
index 00000000..577dd58f
--- /dev/null
+++ b/src/caffe/ocl/im2col.cl
@@ -0,0 +1,298 @@
+template <class T>
+__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){
+    int index=get_global_id(0);
+    data_im = data_im + img_offset;
+    data_col =  data_col + col_offset;
+    if(index < n){
+        int w_out=index %width_col;
+        index /= width_col;
+        int h_out=index%height_col;
+        int channel_in = index/height_col;
+        int channel_out=channel_in *ksize *ksize;
+        int h_in = h_out *stride-pad;
+        int w_in = w_out *stride-pad;
+        data_col +=(channel_out *height_col + h_out) *width_col + w_out;
+        data_im +=(channel_in * height + h_in) *width + w_in;
+        int i=0,j=0;
+        for(i=0;i<ksize;++i){
+            for(j=0;j<ksize;++j){
+                int h = h_in+i;
+                int w = w_in+j;
+                if(h >= 0 && w >= 0 && h < height && w < width)
+                    *data_col=data_im[i * width + j];
+                else *data_col=0;
+                data_col +=height_col *width_col;
+            }
+        }
+    }
+}
+
+template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); 
+template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); 
+
+template <class T>
+__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum){
+
+    int index = get_global_id(0);
+
+    data_im = data_im + img_offset;
+    data_col = data_col + col_offset;
+
+    int x_out = index % width_col;
+    int y_out = (index / width_col) % height_col;
+    int channel_in = (index / width_col / height_col) % channels;
+    int channel_out = channel_in * ksize * ksize;
+    int im_id = index / width_col / height_col / channels;
+
+    int y_in = y_out * stride - pad;
+    int x_in = x_out * stride - pad;
+    int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col;
+    int offset_im = im_id * channels * height * width + channel_in * height * width;
+
+    for(int k_h = 0; k_h < ksize; k_h++){
+        for(int k_w = 0; k_w < ksize; k_w++){
+            int x_im = x_in + k_w;
+            int y_im = y_in + k_h;
+            int index_im = y_im * width + x_im;
+            int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
+            if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
+                data_col[offset_col + index_col] = data_im[offset_im + index_im];
+            else
+                data_col[offset_col + index_col] = 0;
+        }
+    }
+}
+
+template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); 
+template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); 
+
+
+template <class T>
+__kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    __global T* data_col, const int col_offset) {
+    data_im = data_im + img_offset;
+    data_col = data_col + col_offset;     
+
+    int index = get_global_id(0);
+    if(index < n) {
+        int w_out = index % width_col;
+        int h_index = index / width_col;
+        int h_out = h_index % height_col;
+        int channel_in = h_index / height_col;
+        int channel_out = channel_in * kernel_h * kernel_w;
+        int h_in = h_out * stride_h - pad_h;
+        int w_in = w_out * stride_w - pad_w;
+        __global T* data_col_ptr = data_col;
+        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+        __global const T* data_im_ptr = data_im;
+        data_im_ptr += (channel_in * height + h_in) * width + w_in;
+        for (int i = 0; i < kernel_h; ++i) {
+            for (int j = 0; j < kernel_w; ++j) {
+                int h = h_in + i;
+                int w = w_in + j;
+                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+                            data_im_ptr[i * width + j] : 0;
+                data_col_ptr += height_col * width_col;
+        }
+    }
+  }
+}
+
+template __attribute__((mangled_name(im2col_gpu_float_kernel))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
+           const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+           const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+           const int height_col, const int width_col, __global float* data_col, const int col_offset);
+template __attribute__((mangled_name(im2col_gpu_double_kernel)))  void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
+           const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+           const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+           const int height_col, const int width_col, __global double* data_col, const int col_offset);
+
+template <class T>
+__kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset,
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    __global T* data_im, const int img_offset) {
+    data_col = data_col + col_offset;
+    data_im = data_im + img_offset;
+   int index = get_global_id(0);
+    if(index < n) {
+        T val = 0;
+        int w = index % width + pad_w;
+        int h = (index / width) % height + pad_h;
+        int c = index / (width * height);
+        // compute the start and end of the output
+        int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+        int w_col_end = min(w / stride_w + 1, width_col);
+        int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+        int h_col_end = min(h / stride_h + 1, height_col);
+        // equivalent implementation
+        int offset =
+            (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+        int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+        int coeff_w_col = (1 - stride_w * height_col * width_col);
+        for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+            for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+            }
+        }
+        data_im[index] = val;
+  }
+}
+
+template __attribute__((mangled_name(col2im_gpu_float_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
+    									const int height, const int width, const int channels,
+    									const int patch_h, const int patch_w,const int pad_h, const int pad_w,
+    									const int stride_h, const int stride_w,const int height_col, const int width_col,
+    									__global float* data_im, const int img_offset);
+template __attribute__((mangled_name(col2im_gpu_double_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
+                                         const int col_offset, const int height, const int width, const int channels,
+                                         const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+                                         const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
+
+template <class T>
+__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){
+    int index = get_global_id(0);
+    data_col = data_col + col_offset;
+    data_im = data_im + img_offset;
+    if(index < n){
+      T val = 0;
+      int w = index % width + pad;
+      int h = (index / width) % height + pad;
+      int c = index / (width * height);
+      // compute the start and end of the output
+      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+      int w_col_end = min(w / stride + 1, width_col);
+      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+      int h_col_end = min(h / stride + 1, height_col);
+      // equivalent implementation
+      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
+      int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
+      int coeff_w_col = (1 - stride * height_col * width_col);
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+      data_im[index] = val;
+  }
+}
+template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); 
+template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); 
+
+template <class T>
+__kernel void im2col_yuan(const int n,__global T* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index;index<n;index+=tmp){
+        int w_out=index %width_col;
+        index /= width_col;
+        int h_out=index%height_col;
+        int channel_in = index/height_col;
+        int channel_out=channel_in *ksize *ksize;
+        int h_in = h_out *stride-pad;
+        int w_in = w_out *stride-pad;
+        data_col +=(channel_out *height_col + h_out) *width_col + w_out;
+        data_im +=(channel_in * height + h_in) *width + w_in;
+        int i=0,j=0;
+        for(i=0;i<ksize;++i){
+            for(j=0;j<ksize;++j){
+                int h = h_in+i;
+                int w = w_in+j;
+                if(h >= 0 && w >= 0 && h < height && w < width)
+                    *data_col=data_im[i * width + j];
+                else *data_col=0;
+                data_col += height_col *width_col;
+            }
+        }
+    }
+}
+
+template __attribute__((mangled_name(im2colfloat_yuan))) __kernel void im2col_yuan(const int n,__global float* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col); 
+template __attribute__((mangled_name(im2coldouble_yuan))) __kernel void im2col_yuan(const int n,__global double* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col); 
+
+template <class T>
+__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){
+    int index = get_global_id(0);
+    data_col = data_col + col_offset;
+    data_im = data_im + img_offset;
+    if(index < n){
+      T val = 0;
+      int w = index % width + pad;
+      int h = (index / width) % height + pad;
+      int c = index / (width * height) % channels;
+      int im = index / width / height / channels;
+      // compute the start and end of the output
+      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+      int w_col_end = min(w / stride + 1, width_col);
+      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+      int h_col_end = min(h / stride + 1, height_col);
+      // equivalent implementation
+      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
+      int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
+      int coeff_w_col = (1 - stride * height_col * width_col * optnum);
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+      data_im[index] = val;
+  }
+}
+template __attribute__((mangled_name(col2im_optfloat))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); 
+template __attribute__((mangled_name(col2im_optdouble))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); 
+
+
+template <class T>
+__kernel void col2im_yuan(const int n,__global T* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index; index < n; index += tmp){
+      T val = 0;
+      int w = index % width + pad;
+      int h = (index / width) % height + pad;
+      int c = index / (width * height);
+      // compute the start and end of the output
+      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+      int w_col_end = min(w / stride + 1, width_col);
+      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+      int h_col_end = min(h / stride + 1, height_col);
+      // equivalent implementation
+      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
+      int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
+      int coeff_w_col = (1 - stride * height_col * width_col);
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+      data_im[index] = val;
+  }
+}
+template __attribute__((mangled_name(col2imfloat_yuan))) __kernel void col2im_yuan(const int n,__global float* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im); 
+template __attribute__((mangled_name(col2imdouble_yuan))) __kernel void col2im_yuan(const int n,__global double* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im); 
+
+template <class T>
+__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){
+
+    int index = get_global_id(0);
+    data_opt = data_opt + opt_offset;
+    data_im = data_im + im_offset;
+    if(index < n){
+      int w = index % width;
+      int h = (index / width) % height;
+      int c = index / (width * height) % channels;
+      int im = index / width / height / channels;
+
+      int opt_index = c * height * optnum * width + h * optnum * width + im * width + w;
+      data_opt[opt_index] = data_im[index];
+    }
+}
+template __attribute__((mangled_name(opttransfloat))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); 
+template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); 
diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl
new file mode 100644
index 00000000..901b5b13
--- /dev/null
+++ b/src/caffe/ocl/lrn_layer.cl
@@ -0,0 +1,113 @@
+template <class T>
+__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) 
+    out[index] = in[index] * pow(scale[index], negative_beta);
+}
+template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out);
+template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out);
+
+template <class T>
+__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k,  __global T* scale) {
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
+    in = in + offset;
+    scale = scale + offset;
+    int head = 0;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+    T accum_scale = 0;
+    // fill the scale at [n, :, h, w]
+    // accumulate values
+    while (head < post_pad && head < channels) {
+      accum_scale += in[head * step] * in[head * step];
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_scale += in[head * step] * in[head * step];
+      if (head - size >= 0) {
+        accum_scale -= in[(head - size) * step]
+                       * in[(head - size) * step];
+      }
+      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      if (head - size >= 0) {
+        accum_scale -= in[(head - size) * step]
+                       * in[(head - size) * step];
+      }
+      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+      ++head;
+    }
+  }
+}
+template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k,  __global float* scale);
+template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale);
+
+template <class T>
+__kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) {
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
+    bottom_data += offset;
+    top_data += offset;
+    scale += offset;
+    top_diff += offset;
+    bottom_diff += offset;
+    int head = 0;
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+    T accum_ratio = 0;
+    // accumulate values
+    while (head < post_pad && head < channels) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+          scale[head * step];
+      if (head - size >= 0) {
+        accum_ratio -= top_diff[(head - size) * step] *
+            top_data[(head - size) * step] / scale[(head - size) * step];
+      }
+      bottom_diff[(head - post_pad) * step] =
+          top_diff[(head - post_pad) * step]
+            * pow(scale[(head - post_pad) * step], negative_beta)
+          - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      if (head - size >= 0) {
+        accum_ratio -= top_diff[(head - size) * step] *
+            top_data[(head - size) * step] / scale[(head - size) * step];
+      }
+      bottom_diff[(head - post_pad) * step] =
+          top_diff[(head - post_pad) * step]
+            * pow(scale[(head - post_pad) * step], negative_beta)
+          - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+}
+}
+
+template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
+template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff);
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
new file mode 100644
index 00000000..5ac4bd52
--- /dev/null
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -0,0 +1,267 @@
+template <class T>
+__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){
+     int index = get_global_id(0);
+     int tmp = get_global_size(0);
+     for(index; index < nthreads; index += tmp){
+         int pw = index % pooled_width;
+         int ph = (index / pooled_width) % pooled_height;
+         int c = (index / pooled_width / pooled_height) % channels;
+         int n = index / pooled_width / pooled_height / channels;
+         int hstart = ph * stride_h - pad_h;
+         int wstart = pw * stride_w - pad_w;
+         const int hend = min(hstart + kernel_h, height);
+         const int wend = min(wstart + kernel_w, width);
+         hstart = max(hstart, 0);
+         wstart = max(wstart, 0);
+        T maxval = -FLT_MAX;
+        int maxidx = -1;
+        bottom_data =
+        bottom_data + (n * channels + c) * height * width;
+        for (int h = hstart; h < hend; ++h) {
+          for (int w = wstart; w < wend; ++w) {
+           if (bottom_data[h * width + w] > maxval) {
+             maxidx = h * width + w;
+             maxval = bottom_data[maxidx];
+        }
+      }
+    }
+    top_data[index] = maxval;
+    if (mask) {
+      mask[index] = maxidx;
+    } else {
+      top_mask[index] = maxidx;
+    }
+  }
+}
+template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
+template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,  const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
+
+template <class T>
+__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index; index < nthreads; index+=tmp){
+        int pw = index % pooled_width;
+        int ph = (index / pooled_width) % pooled_height;
+        int c = (index / pooled_width / pooled_height) % channels;
+        int n = index / pooled_width / pooled_height / channels;            int hstart = ph * stride_h - pad_h;            int wstart = pw * stride_w - pad_w;
+            int hend = min(hstart + kernel_h, height + pad_h);
+            int wend = min(wstart + kernel_w, width + pad_w);
+            const int pool_size = (hend - hstart) * (wend - wstart);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            hend = min(hend, height);
+            wend = min(wend, width);
+            T aveval = 0;
+            bottom_data =
+                bottom_data + (n * channels + c) * height * width;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                aveval += bottom_data[h * width + w];
+              }
+            }
+            top_data[index] = aveval / pool_size;
+          }
+
+}
+template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
+template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data);
+
+template <class T>
+__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index; index < nthreads; index+=tmp){
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * stride_h;
+    const int hend = min(hstart + kernel_h, height);
+    const int wstart = pw * stride_w;
+    const int wend = min(wstart + kernel_w, width);
+    T cumsum = 0.;
+    bottom_data = bottom_data + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_data[h * width + w];
+      }
+    }
+    const float thres = rand_idx[index] * cumsum;
+    // Second pass: get value, and set index.
+    cumsum = 0;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_data[h * width + w];
+        if (cumsum >= thres) {
+          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
+          top_data[index] = bottom_data[h * width + w];
+          return;
+        }
+      }
+    }
+    }
+}
+template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
+
+template <class T>
+__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){
+    int index = get_global_id(0);
+    int tmp = get_global_size(0);
+    for(index; index < count; index+=tmp){
+    const int pw = index % pooled_width; 
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * stride_h;
+    const int hend = min(hstart + kernel_h, height);
+    const int wstart = pw * stride_w;
+    const int wend = min(wstart + kernel_w, width);
+    // We set cumsum to be 0 to avoid divide-by-zero problems    T cumsum = FLT_MIN;
+    T cumsum = FLT_MIN;
+    T cumvalues = 0.;
+    bottom_data =        bottom_data + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_data[h * width + w];
+        cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w];
+      }
+    }
+    top_data[index] = cumvalues / cumsum;  }
+}
+template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
+
+template <class T>
+__kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
+    __global int* mask, __global T* top_mask, const int num,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, __global T* const bottom_diff) {
+     int index = get_global_id(0);
+     int total = get_global_size(0);
+     for(index; index < nthreads; index += total){
+    // find out the local index
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int c = (index / width / height) % channels;
+    const int n = index / width / height / channels;
+    const int phstart =
+         (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
+    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
+    const int pwstart =
+         (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
+    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
+    T gradient = 0;
+    const int offset = (n * channels + c) * pooled_height * pooled_width;
+    top_diff += offset;
+    if (mask) {
+      mask = mask + offset;
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (mask[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff[ph * pooled_width + pw];
+          }
+        }
+      }
+    } else {
+      top_mask = top_mask + offset;
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (top_mask[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff[ph * pooled_width + pw];
+          }
+        }
+      }
+    }
+    bottom_diff[index] = gradient;
+  }
+}
+template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
+template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
+
+template <class T>
+__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){
+     int index = get_global_id(0);
+     int total = get_global_size(0);
+     for(index; index < nthreads; index += total){
+            int w = index % width + pad_w;
+            int h = (index / width) % height + pad_h;
+            int c = (index / width / height) % channels;
+            int n = index / width / height / channels;
+            const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            const int phend = min(h / stride_h + 1, pooled_height);
+            const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            const int pwend = min(w / stride_w + 1, pooled_width);
+            T gradient = 0;
+            top_diff += (n * channels + c) * pooled_height * pooled_width;
+            for (int ph = phstart; ph < phend; ++ph) {
+              for (int pw = pwstart; pw < pwend; ++pw) {
+                // figure out the pooling size
+                int hstart = ph * stride_h - pad_h;
+                int wstart = pw * stride_w - pad_w;
+                int hend = min(hstart + kernel_h, height + pad_h);
+                int wend = min(wstart + kernel_w, width + pad_w);
+                int pool_size = (hend - hstart) * (wend - wstart);
+                gradient += top_diff[ph * pooled_width + pw] / pool_size;
+              }
+    }
+    bottom_diff[index] = gradient;
+   }
+}
+
+template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
+template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
+
+template <class Dtype>
+void StoPoolBackward(const int nthreads,
+    __global Dtype* rand_idx, __global Dtype* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global Dtype* bottom_diff) {
+      int index = get_global_id(0);
+      int total = get_global_size(0);
+      for(index; index < nthreads; index += total){
+            // find out the local index
+            // find out the local offset
+            const int w = index % width;
+            const int h = (index / width) % height;
+            const int c = (index / width / height) % channels;
+            const int n = index / width / height / channels;
+            const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+            const int phend = min(h / stride_h + 1, pooled_height);
+            const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+            const int pwend = min(w / stride_w + 1, pooled_width);
+            Dtype gradient = 0;
+            rand_idx =
+                rand_idx + (n * channels + c) * pooled_height * pooled_width;
+            top_diff =
+                top_diff + (n * channels + c) * pooled_height * pooled_width;
+            for (int ph = phstart; ph < phend; ++ph) {
+              for (int pw = pwstart; pw < pwend; ++pw) {
+                gradient += top_diff[ph * pooled_width + pw] *
+                    (index == static_cast<int>(rand_idx[ph * pooled_width + pw]));
+              }
+            }
+            bottom_diff[index] = gradient;
+
+	  }
+}
+template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel  void StoPoolBackward<float>(const int nthreads,
+    __global float* rand_idx, __global float* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global float* bottom_diff);
+template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward<double>(const int nthreads,
+    __global double* rand_idx, __global double* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global double* bottom_diff);
diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl
new file mode 100644
index 00000000..0d8d3b4e
--- /dev/null
+++ b/src/caffe/ocl/relu_layer.cl
@@ -0,0 +1,20 @@
+template <class T>
+__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
+	int index = get_global_id(0);
+	if(index < count)
+		out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
+}
+
+//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
+
+template <class T>
+__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
+	int index = get_global_id(0);
+        if(index < count)
+		out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope;
+}
+
+template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
+template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl
new file mode 100644
index 00000000..711e4334
--- /dev/null
+++ b/src/caffe/ocl/softmax_layer.cl
@@ -0,0 +1,48 @@
+template <class T>
+__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){
+    
+    int gid = get_global_id(0);
+    int size = get_global_size(0);
+    
+    resultScratch[gid] = 0.0;
+    for(int i = gid; i < num; i += size){
+    	resultScratch[gid] += -log(prob_data[i * dim + static_cast<int>(label[i])]);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    if(gid < 128)
+    	resultScratch[gid] += resultScratch[gid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(gid < 64)
+    	resultScratch[gid] += resultScratch[gid + 64];
+    if(gid < 32)
+    	resultScratch[gid] += resultScratch[gid + 32];
+    if(gid < 16)
+    	resultScratch[gid] += resultScratch[gid + 16];
+    if(gid < 8)
+    	resultScratch[gid] += resultScratch[gid + 8];
+    if(gid < 4)
+    	resultScratch[gid] += resultScratch[gid + 4];
+    if(gid < 2)
+    	resultScratch[gid] += resultScratch[gid + 2];
+    if(gid < 1){
+    	resultScratch[gid] += resultScratch[gid + 1];
+    	loss[0] = resultScratch[gid];
+    }
+}
+template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
+template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
+
+template <class T>
+__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data){
+        //printf("softmax_div\n");
+        int index = get_global_id(0);
+        int total = get_global_size(0);
+        for(index; index < num*dim; index +=  total){
+        int n = index / dim;
+        data[index] /= scale[n];
+        }
+}
+
+template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data);
+template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* scale, __global double* data);
diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl
new file mode 100644
index 00000000..6d6e4f0b
--- /dev/null
+++ b/src/caffe/ocl/softmaxwithloss_layer.cl
@@ -0,0 +1,65 @@
+template <class T>
+__kernel void SoftmaxLossForwardGPU(const int nthreads,
+          __global T* prob_data, __global T* label,__global T* loss,
+          int num, int dim, int spatial_dim,
+          bool has_ignore_label_, int ignore_label_,
+          __global T* counts) {
+    int index = get_global_id(0);
+    if(index < nthreads) {
+        const int n = index / spatial_dim;
+        const int s = index % spatial_dim;
+        const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+        if (has_ignore_label_ && label_value == ignore_label_) {
+           loss[index] = 0;
+           counts[index] = 0;
+        } else {
+           loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
+                      T(FLT_MIN)));
+        counts[index] = 1;
+    }
+  }
+}
+
+template __attribute__ ((mangled_name(softmax_loss_fp_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
+          __global float* prob_data, __global float* label,__global float* loss,
+          int num, int dim, int spatial_dim,
+          bool has_ignore_label_, int ignore_label_,
+          __global float* counts);
+template __attribute__ ((mangled_name(softmax_loss_fp_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
+          __global double* prob_data, __global double* label,__global double* loss,
+          int num, int dim, int spatial_dim,
+          bool has_ignore_label_, int ignore_label_,
+          __global double* counts);
+
+template <class T>
+__kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
+          __global T* label,__global T* bottom_diff, int num, int dim,
+          int spatial_dim, bool has_ignore_label_,
+          int ignore_label_, T* counts) {
+    const int channels = dim / spatial_dim;
+   int index  = get_global_id(0);
+   if(index <  nthreads) {
+       const int n = index / spatial_dim;
+       const int s = index % spatial_dim;
+       const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+
+      if (has_ignore_label_ && label_value == ignore_label_) {
+          for (int c = 0; c < channels; ++c) {
+              bottom_diff[n * dim + c * spatial_dim + s] = 0;
+          }
+          counts[index] = 0;
+    } else {
+      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
+      counts[index] = 1;
+    }
+  }
+}
+template __attribute__ ((mangled_name(softmax_loss_bp_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
+          __global float* label,__global float* bottom_diff, int num, int dim,
+          int spatial_dim, bool has_ignore_label_,
+          int ignore_label_, float* counts);
+
+template __attribute__ ((mangled_name(softmax_loss_bp_double)))  __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
+          __global double* label,__global double* bottom_diff, int num, int dim,
+          int spatial_dim, bool has_ignore_label_,
+          int ignore_label_, double* counts);

From 858b0828b95273b99f598d4ac1379459b172a648 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sat, 8 Aug 2015 08:12:32 +0800
Subject: [PATCH 030/124] Created global kernel map

---
 include/caffe/device.hpp           |   7 +-
 include/caffe/neuron_layers.hpp    |  18 -----
 include/caffe/util/ocl_wrapper.hpp |   4 +-
 src/caffe/device.cpp               | 122 +++++++++++++++--------------
 src/caffe/layers/relu_layer.cpp    |  20 +----
 src/caffe/ocl/relu_layer.cl        |   9 +--
 src/caffe/util/ocl_wrapper.cpp     |  21 +++--
 7 files changed, 92 insertions(+), 109 deletions(-)

diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 0b534e57..cea343e8 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -23,7 +23,7 @@ class Device{
     cl_device_id * pDevices;
     clblasOrder col;
     clblasOrder row;
-    
+    std::map<std::string, cl_kernel> Kernels;    
      
     cl_int Init(); 
     cl_int ConvertToString(std::string pFileName,std::string &Str);
@@ -32,13 +32,14 @@ class Device{
 
     void GetDeviceInfo();
     
-    cl_program BuildProgram(std::string);    
+    void BuildProgram(std::string kernel_dir);    
 
     template <typename T>
     void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str);
     template <typename T>
     void appendBitfield(T info, T value, std::string name, std::string &str);
-    
+   
+    cl_kernel GetKernel(std::string kernel_name);    
 
 };
 extern char* buildOption;
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index bcb834de..9fe415f1 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -433,9 +433,7 @@ class ReLULayer : public NeuronLayer<Dtype> {
    */
   explicit ReLULayer(const LayerParameter& param)
       : NeuronLayer<Dtype>(param) {
-        ocl_setup();
     }
-  ~ReLULayer();
   virtual inline const char* type() const { return "ReLU"; }
 
  protected:
@@ -487,22 +485,6 @@ class ReLULayer : public NeuronLayer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-//OpenCL related setup
-  void ocl_setup();
-//OpenCL wrapper
-  void ReLUForward_gpu(int count, const Dtype *bottom_data,Dtype *top_data, Dtype negative_slope)
-  { 
-      ReLUForward(ReLUForward_kernel,count,bottom_data,top_data,negative_slope);
-  }
-  void ReLUBackward_gpu(int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff, Dtype negative_slope)
-  {
-      ReLUBackward(ReLUBackward_kernel,count,top_diff,bottom_data,bottom_diff,negative_slope);
-  }
- protected:
-   cl_kernel ReLUForward_kernel;
-   cl_kernel ReLUBackward_kernel;
-
 };
 
 #ifdef USE_CUDNN
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 35ad695e..8d5a6a50 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -68,10 +68,10 @@ template <typename Dtype>
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff);
 
 template <typename Dtype>
-void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope);
+void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope);
 
 template <typename Dtype>
-void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
+void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
 
 template <typename Dtype>
 void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y);
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 3beba234..3f3fcf27 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -4,6 +4,8 @@
 #include <fstream>
 #include <iostream>
 #include <malloc.h>
+#include <dirent.h>
+
 namespace caffe {
 //delete it after test, Yibing
 cl_mem test_alloc_mem[10];
@@ -11,6 +13,7 @@ extern long long unsigned device_mem_consumption;
 
 Device amdDevice;
 char* buildOption = "-x clc++ ";
+std::string oclKernelPath="./src/caffe/ocl/";
 
 Device::~Device(){
     //clAmdBlasTeardown(); 
@@ -41,15 +44,6 @@ cl_int Device::Init(){
     }
     platformName[nameLen] = 0;
 
-    //Get OpenCL Information 
-    //res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_VERSION, 64, openclVersion, &nameLen);
-    //if(res != CL_SUCCESS) {
-    //    fprintf(stderr, "Err: Get OpenCL Info failed!\n", res);
-    //    return 0;
-    //}
-    //openclVersion[nameLen] = 0;
-    //printf("%s %s\n", platformName, openclVersion);
-  
     GetDeviceInfo();
     cl_uint uiNumDevices;
     cl_bool unified_memory = false;
@@ -124,57 +118,13 @@ cl_int Device::Init(){
         fprintf(stderr,"Err: Failed to Create Commandqueue\n");
         return 0;
     }
+   
+  
+    //BuildProgram from OpenCL kernel files
+    BuildProgram(oclKernelPath);
 
-    std::string strSource = "";
-
-    std::string pFileName[8];
-    pFileName[0] = "./src/caffe/ocl/OCL_kernel.cl";
-    pFileName[1] = "./src/caffe/ocl/lrn_layer.cl";
-    pFileName[2] = "./src/caffe/ocl/pooling_layer.cl";
-    pFileName[3] = "./src/caffe/ocl/dropout_layer.cl";
-    pFileName[4] = "./src/caffe/ocl/relu_layer.cl";
-    pFileName[5] = "./src/caffe/ocl/softmax_layer.cl";
-    pFileName[6] = "./src/caffe/ocl/softmaxwithloss_layer.cl";
-    pFileName[7] = "./src/caffe/ocl/im2col.cl";
-
-    for(int fileNum = 0; fileNum < 8; fileNum++) {
-      std::string tmpSource = "";
-      ConvertToString(pFileName[fileNum], tmpSource);
-      strSource += tmpSource;
-    }
-
-    const char *pSource;
-    pSource = strSource.c_str();
-    size_t uiArrSourceSize[] = {0};
-    uiArrSourceSize[0] = strlen(pSource);
-    Program = NULL;
-    Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL);
-    if(NULL == Program){
-        fprintf(stderr,"Err: Failed to create program\n");
-    }
-
-    //Build Program
-    cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL);
-    LOG(INFO) << "Build Program";
-    if(CL_SUCCESS != iStatus){
-        fprintf(stderr,"Err: Failed to build program\n");
-        char szBuildLog[16384];
-        clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL);
-        std::cout << szBuildLog;
-        clReleaseProgram(Program);
-    }
-
-    /*
-    //Setup AmdBlas;
-    cl_int err;
-    err = clAmdBlasSetup();
-    if(err != CL_SUCCESS){
-        printf("clAmdBlasSetup() failed with %d\n", err);
-    }
-    */
     row = clblasRowMajor;
     col = clblasColumnMajor;
-	
 	/* 
 	//delete after test the large buffer allocation, Yibing	
 	long long global_mem_size_limit = 1024*1024; //4*1024*1024*1024;
@@ -218,6 +168,50 @@ cl_int Device::Init(){
     return 0;
 }
 
+void Device::BuildProgram(std::string kernel_dir)
+{  
+  //Access opencl kernel files
+    std::string strSource = "";
+    DIR *ocl_dir;
+    struct dirent *dirp;
+    if((ocl_dir=opendir(kernel_dir.c_str())) == NULL)
+    {
+        printf("Open ocl dir failed!\n");
+    }
+    while((dirp = readdir(ocl_dir)) != NULL)
+    {  
+        //Ignore hidden files
+        if(dirp->d_name[0] == '.')
+            continue;
+        std::string ocl_kernel_full_path=kernel_dir+std::string(dirp->d_name);
+        std::string tmpSource = "";
+        ConvertToString(ocl_kernel_full_path.c_str(), tmpSource);
+        strSource += tmpSource;
+    }
+
+    const char *pSource;
+    pSource = strSource.c_str();
+    size_t uiArrSourceSize[] = {0};
+    uiArrSourceSize[0] = strlen(pSource);
+    Program = NULL;
+    Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL);
+    if(NULL == Program){
+        fprintf(stderr,"Err: Failed to create program\n");
+    }
+
+    //Build Program
+    cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL);
+    LOG(INFO) << "Build Program";
+    if(CL_SUCCESS != iStatus){
+        fprintf(stderr,"Err: Failed to build program\n");
+        char szBuildLog[16384];
+        clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL);
+        std::cout << szBuildLog;
+        clReleaseProgram(Program);
+    }
+
+  // return Program;
+}
 
 //Use to read OpenCL source code
 cl_int Device::ConvertToString(std::string pFileName,std::string &Str){
@@ -247,6 +241,7 @@ cl_int Device::ConvertToString(std::string pFileName,std::string &Str){
     return -1;
 }
 
+/*
 cl_program Device::BuildProgram(std::string pFileName)
 {
       //Read our own kernel file
@@ -275,6 +270,19 @@ cl_program Device::BuildProgram(std::string pFileName)
     }
   return program;
 }
+*/
+cl_kernel Device::GetKernel(std::string kernel_name)
+{
+    std::map<std::string, cl_kernel>::iterator it = Kernels.find(kernel_name);
+    if(it == Kernels.end())
+    {
+        cl_int _err=0;
+        cl_kernel kernel = clCreateKernel(Program,kernel_name.c_str(),&_err);
+        OCL_CHECK(_err);
+        Kernels[kernel_name] = kernel;
+    }
+    return Kernels[kernel_name];
+}
 
 void Device::DisplayPlatformInfo(){
    cl_int err;
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index 6ee3237a..c38814f1 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -5,22 +5,6 @@
 #include "caffe/vision_layers.hpp"
 
 namespace caffe {
-template <typename Dtype>
-void ReLULayer<Dtype>::ocl_setup(){
-    cl_program program = amdDevice.BuildProgram("src/caffe/layers/relu_layer.cl");
-    cl_int _err=0;
-    ReLUForward_kernel = clCreateKernel(program,"ReLUForwardfloat",&_err);
-    ReLUBackward_kernel = clCreateKernel(program,"ReLUBackwardfloat",&_err);
-}
-
-template <typename Dtype>
-ReLULayer<Dtype>::~ReLULayer(){
-  OCL_CHECK( clReleaseKernel(ReLUForward_kernel) );
-  OCL_CHECK( clReleaseKernel(ReLUBackward_kernel) );
-}
-
-
-
 template <typename Dtype>
 void ReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -68,7 +52,7 @@ void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   //     << " top_data: " << (unsigned long)top_data
   //     << " blocks: " << CAFFE_GET_BLOCKS(count)
   //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
- ReLUForward_gpu(count,bottom_data,top_data,negative_slope);
+ ReLUForward(count,bottom_data,top_data,negative_slope);
 }
 
 
@@ -86,7 +70,7 @@ void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 //    ReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
   //      count, top_diff, bottom_data, bottom_diff, negative_slope);
    // CUDA_POST_KERNEL_CHECK;
-   ReLUBackward_gpu(count,top_diff,bottom_data,bottom_diff,negative_slope);
+   ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope);
   }
 }
 
diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl
index 0d8d3b4e..c9ba4900 100644
--- a/src/caffe/ocl/relu_layer.cl
+++ b/src/caffe/ocl/relu_layer.cl
@@ -5,9 +5,8 @@ __kernel void ReLUForward(const int count, __global T* in, __global T* out, T ne
 		out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
 }
 
-//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope);
-template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
-template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
+template __attribute__ ((mangled_name(ReLUForwardFloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForwardDouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
 
 template <class T>
 __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
@@ -16,5 +15,5 @@ __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_
 		out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope;
 }
 
-template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
-template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
+template __attribute__ ((mangled_name(ReLUBackwardFloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
+template __attribute__ ((mangled_name(ReLUBackwardDouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 7b57d329..f5f7e945 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -633,7 +633,11 @@ template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const fl
 template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff);
 
 template <typename Dtype> 
-void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
+void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
+    Dtype type;
+    std::string str_type = (typeid(type).name()[0]=='f')?"Float":"Double";
+    std::string kernel_name = std::string("ReLUForward")+str_type;
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
@@ -645,11 +649,16 @@ void ReLUForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, Dt
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void ReLUForward<float>(cl_kernel Kernel, const int count, const float* bottom_data, float* top_data, float negative_slope);
-template void ReLUForward<double>(cl_kernel Kernel, const int count, const double* bottom_data, double* top_data, double negative_slope);
+template void ReLUForward<float>(const int count, const float* bottom_data, float* top_data, float negative_slope);
+template void ReLUForward<double>(const int count, const double* bottom_data, double* top_data, double negative_slope);
 
 template <typename Dtype> 
-void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){
+void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){
+    Dtype type;
+    std::string str_type = (typeid(type).name()[0]=='f')?"Float":"Double";
+    std::string kernel_name = std::string("ReLUBackward")+str_type;
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
@@ -662,8 +671,8 @@ void ReLUBackward(cl_kernel Kernel, const int count, const Dtype* top_diff, cons
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
-template void ReLUBackward<float>(cl_kernel Kernel, const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
-template void ReLUBackward<double>(cl_kernel Kernel, const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
+template void ReLUBackward<float>(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
+template void ReLUBackward<double>(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
 template <typename Dtype>
 void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels,
     const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {

From 6934793436bcd0f6960d3a21e4830ea1ee5e09d5 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 9 Aug 2015 02:26:02 +0800
Subject: [PATCH 031/124] ocl wrappers get kernel from the map
 @amdDevice.Kernels instead of passing from outside; haven't finished in
 conv_layer

---
 include/caffe/common_layers.hpp         |   6 -
 include/caffe/neuron_layers.hpp         |   4 -
 include/caffe/util/ocl_util.hpp         |   2 +-
 include/caffe/util/ocl_wrapper.hpp      |  48 ++---
 include/caffe/vision_layers.hpp         |  14 +-
 src/caffe/common.cpp                    |   4 -
 src/caffe/device.cpp                    |   3 +-
 src/caffe/layers/base_conv_layer.cpp    |   2 +-
 src/caffe/layers/base_data_layer.cpp    |  26 +--
 src/caffe/layers/dropout_layer.cpp      |  19 +-
 src/caffe/layers/pooling_layer.cpp      |  44 +----
 src/caffe/layers/power_layer.cpp        |  28 +--
 src/caffe/layers/softmax_layer.cpp      |  30 +---
 src/caffe/layers/softmax_loss_layer.cpp |  14 +-
 src/caffe/ocl/OCL_kernel.cl             |   8 +-
 src/caffe/ocl/dropout_layer.cl          |   8 +-
 src/caffe/ocl/pooling_layer.cl          |  24 +--
 src/caffe/ocl/relu_layer.cl             |   8 +-
 src/caffe/ocl/softmaxwithloss_layer.cl  |   8 +-
 src/caffe/solver.cpp                    |   8 +-
 src/caffe/util/ocl_util.cpp             |  10 +-
 src/caffe/util/ocl_wrapper.cpp          | 221 ++++++++++++++++--------
 22 files changed, 247 insertions(+), 292 deletions(-)

diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index a92bb4aa..eb77e762 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -483,7 +483,6 @@ class SoftmaxLayer : public Layer<Dtype> {
  public:
   explicit SoftmaxLayer(const LayerParameter& param)
       : Layer<Dtype>(param) {
-     ocl_setup(); 
   }
   ~SoftmaxLayer();
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -502,7 +501,6 @@ class SoftmaxLayer : public Layer<Dtype> {
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void ocl_setup();
 
   int outer_num_;
   int inner_num_;
@@ -511,10 +509,6 @@ class SoftmaxLayer : public Layer<Dtype> {
   Blob<Dtype> sum_multiplier_;
   /// scale is an intermediate Blob to hold temporary results.
   Blob<Dtype> scale_;
-  protected:
-      cl_kernel channel_max_kernel,channel_subtract_kernel,exp_kernel, channel_sum_kernel;
-      cl_kernel channel_div_kernel,channel_dot_kernel;
-  
 };
 
 #ifdef USE_CUDNN
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index 9fe415f1..cf6d645a 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -412,10 +412,6 @@ class PowerLayer : public NeuronLayer<Dtype> {
   Dtype shift_;
   /// @brief Result of @f$ \alpha \gamma @f$
   Dtype diff_scale_;
-
- protected:
- void ocl_setup();
- cl_kernel memset_kernel, scalar_kernel, div_kernel, mul_kernel, powx_kernel;
 };
 
 /**
diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
index 55695070..25747702 100644
--- a/include/caffe/util/ocl_util.hpp
+++ b/include/caffe/util/ocl_util.hpp
@@ -6,7 +6,7 @@
 namespace caffe {
 
 template <typename Dtype>
-void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count);
+void ocl_memset(Dtype* buffer, const Dtype value, const int count);
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count);
 
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 8d5a6a50..71e13b2e 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -6,8 +6,8 @@
 namespace caffe {
 
 typedef unsigned int uint32_t;
-template <typename Dtype>
-void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
+//template <typename Dtype>
+//void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
 
 template <typename Dtype>
 void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num);
@@ -38,28 +38,28 @@ template <typename Dtype>
 void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data);
 
 template <typename Dtype>
-void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask);
+void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask);
 
 template <typename Dtype>
-void MaxPoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff);
+void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff);
 
 template <typename Dtype>
-void AvePoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff);
+void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff);
 
 template <typename Dtype>
- void StoPoolBackward(cl_kernel kernel, const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff);
+ void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff);
 
 template <typename Dtype>
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data);
 
 template <typename Dtype>
-void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data);
+void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data);
 
 template <typename Dtype>
-void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data);
+void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data);
 
 template <typename Dtype>
-void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data);
+void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data);
 
 template <typename Dtype>
 void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff );
@@ -74,53 +74,53 @@ template <typename Dtype>
 void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
 
 template <typename Dtype>
-void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y);
+void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y);
 
 template <typename Dtype>
-void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data);
+void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data);
 
 template <typename Dtype>
-void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff);
+void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff);
 
 template <typename Dtype>
-void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
+void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
 
 template <typename Dtype>
 void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y );
 
 template <typename Dtype>
-void kernel_channel_max(cl_kernel Kernel, const int num, const int channels,
+void kernel_channel_max(const int num, const int channels,
     const int spatial_dim, const Dtype* data, Dtype* out);
 
 template <typename Dtype>
-void kernel_channel_subtract(cl_kernel Kernel, const int count,
+void kernel_channel_subtract(const int count,
     const int num, const int channels,
     const int spatial_dim, const Dtype* channel_max, Dtype* data);
 
 template <typename Dtype>
-void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out);
+void kernel_exp(const int count, const Dtype* data, Dtype* out);
 
 template <typename Dtype>
-void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels,
+void kernel_channel_sum(const int num, const int channels,
     const int spatial_dim, const Dtype* data, Dtype* channel_sum);
 
 template <typename Dtype>
-void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data);
+void kernel_channel_div(const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data);
 
 template <typename Dtype>
-void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels,
+void kernel_channel_dot(const int num, const int channels,
     const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
     Dtype* channel_dot);
 
 template <typename Dtype>
-void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads,
+void SoftmaxLossForwardGPU(const int nthreads,
           const Dtype* prob_data, const Dtype* label, Dtype* loss,
           const int num, const int dim, const int spatial_dim,
           const bool has_ignore_label_, const int ignore_label_,
           Dtype* counts);
 
 template <typename Dtype>
-void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* top,
+void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
           const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
           const int spatial_dim, const bool has_ignore_label_,
           const int ignore_label_, Dtype* counts);
@@ -129,7 +129,7 @@ template <typename Dtype>
 void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data);
+void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data);
 
 template <typename Dtype>
 void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in,
@@ -149,10 +149,10 @@ void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
     const int width, const int size, const Dtype negative_beta,
     const Dtype cache_ratio, Dtype* const bottom_diff);
 template <typename Dtype>
-void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y);
+void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y);
+void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y);
 }
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
   // namespace caffe
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 336127d5..a1c9577d 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -121,7 +121,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
 }
 protected:
   inline void gpu_memset(Dtype* data, Dtype value, int count) {
-    ocl_memset(oclmem_kernel, data, value, count);
+    ocl_memset(data, value, count);
 }
 #endif
 
@@ -445,12 +445,10 @@ class PoolingLayer : public Layer<Dtype> {
  public:
   explicit PoolingLayer(const LayerParameter& param)
       : Layer<Dtype>(param) {}
-  ~PoolingLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  void ocl_setup();
 
   virtual inline const char* type() const { return "Pooling"; }
   virtual inline int ExactNumBottomBlobs() const { return 1; }
@@ -482,16 +480,6 @@ class PoolingLayer : public Layer<Dtype> {
   Blob<Dtype> rand_idx_;
   Blob<int> max_idx_;
 
-//opencl related data structures
-protected:
-  cl_kernel MaxPoolForward_kernel,
-            AvePoolForward_kernel,
-            StoPoolForwardTrain_kernel,
-            StoPoolForwardTest_kernel,
-            MaxPoolBackward_kernel,
-            AvePoolBackward_kernel,
-            StoPoolBackward_kernel;
-
 };
 
 #ifdef USE_CUDNN
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 407668c9..c4fe1195 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -112,10 +112,6 @@ Caffe::Caffe()
    if(err != CL_SUCCESS){
        LOG(ERROR) << "clBLAS setup failed "<<err;
    }
-   else
-   {
-      printf("clBLAS setup succeed!\n");
-   }
 }
 
 Caffe::~Caffe() {
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 3f3fcf27..c10ddf25 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -13,6 +13,7 @@ extern long long unsigned device_mem_consumption;
 
 Device amdDevice;
 char* buildOption = "-x clc++ ";
+//char* buildOption = "-x clc++, -hsail-reg-slots=8-Wb, -hsail-reg32-pressure-limit=64-Wb, -hsail-reg64-pressure-limit=64";
 std::string oclKernelPath="./src/caffe/ocl/";
 
 Device::~Device(){
@@ -176,7 +177,7 @@ void Device::BuildProgram(std::string kernel_dir)
     struct dirent *dirp;
     if((ocl_dir=opendir(kernel_dir.c_str())) == NULL)
     {
-        printf("Open ocl dir failed!\n");
+        fprintf(stderr,"Err: Open ocl dir failed!\n");
     }
     while((dirp = readdir(ocl_dir)) != NULL)
     {  
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 8edecdc0..ea4a1658 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -547,7 +547,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
     // Bias gradient, if necessary.
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      ocl_memset(oclmem_kernel, bias_diff, (Dtype)(0.), this->blobs_[1]->count());
+      ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
     for (int n = 0; n < num_; ++n) {
       caffe_gpu_gemv<Dtype>(CblasNoTrans, M_, N_,
           (Dtype)1., top_diff, top[i]->offset(n), N_,
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 1b6e07fa..71f5c132 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -60,13 +60,8 @@ template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   // First, join the thread
-  CPUTimer forward_timer, join_prefetch_timer, create_prefetch_timer;
-  join_prefetch_timer.Start();
   JoinPrefetchThread();
-  join_prefetch_timer.Stop();
-  printf("join prefetch thread: %f\n", join_prefetch_timer.MilliSeconds());
 
-  forward_timer.Start();
   DLOG(INFO) << "Thread joined";
   // Reshape to loaded data.
   top[0]->ReshapeLike(prefetch_data_);
@@ -81,33 +76,24 @@ void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
     caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
                top[1]->mutable_cpu_data());
   }
-  forward_timer.Stop();
-  printf("write buffer time: %f\n", forward_timer.MilliSeconds());
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
-  create_prefetch_timer.Start();
   CreatePrefetchThread();
-  create_prefetch_timer.Stop();
-  printf("create prefetch time: %f\n", create_prefetch_timer.MilliSeconds() );
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const  vector<Blob<Dtype>*>& top) {
-  CPUTimer forward_timer, join_prefetch_timer, create_prefetch_timer;
   
-  join_prefetch_timer.Start();
   JoinPrefetchThread();
-  join_prefetch_timer.Stop();
-  printf("join prefetch thread: %f\n", join_prefetch_timer.MilliSeconds());
+  DLOG(INFO) << "Thread joined";
   // Copy the data from prefetch thread to data_layer
    //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) );
   
-    clFinish(amdDevice.CommandQueue);
-       forward_timer.Start();
    top[0]->ReshapeLike(this->prefetch_data_);
    OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) );
-  if (this->output_labels_) {
+    DLOG(INFO) << "Prefetch copied"; 
+ if (this->output_labels_) {
        // Reshape to loaded labels.
    top[1]->ReshapeLike(prefetch_label_);
    OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) );
@@ -115,10 +101,7 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bo
    }
   
 //  clFinish(amdDevice.CommandQueue);
-  forward_timer.Stop();
-  printf("Write buffer time: %f\n\n", forward_timer.MilliSeconds());
 
- 
 #ifdef Track_data_transfer
 #endif
   
@@ -126,10 +109,7 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bo
 
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
-  create_prefetch_timer.Start();
   CreatePrefetchThread();
-  create_prefetch_timer.Stop();
-  printf("create_prefetch time: %f\n", create_prefetch_timer.MilliSeconds());
   //return Dtype(0.);
 }
 
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 996098bc..dfd6560d 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -12,19 +12,12 @@ namespace caffe {
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::ocl_setup(int bottom_count){
-    //create OpenCL related cl_mem objects and kernels
-    ocl_Kernel_Fwd = clCreateKernel(amdDevice.Program,"DropoutForwardfloat", NULL);
-    ocl_Kernel_Bwd = clCreateKernel(amdDevice.Program,"DropoutBackwardfloat", NULL);
-    rng_kernel = clCreateKernel(amdDevice.Program,"RNGBernoulliFloat", NULL);
     MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, bottom_count*sizeof(int), NULL, NULL);
 }
 
 template <typename Dtype>
 DropoutLayer<Dtype>::~DropoutLayer(){
    OCL_CHECK( clReleaseMemObject(MaskMem) );
-   OCL_CHECK( clReleaseKernel(ocl_Kernel_Fwd) );
-   OCL_CHECK( clReleaseKernel(ocl_Kernel_Bwd) );
-   OCL_CHECK( clReleaseKernel(rng_kernel) );
 }
 
 
@@ -100,11 +93,11 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
         static_cast<unsigned int*>(rand_vec_.mutable_cpu_data()); 
     caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu);
     OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
-    DropoutForward(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+    DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #else
 //    caffe_gpu_rng_uniform(count, mask);
-     caffe_gpu_bernoulli(rng_kernel, (int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_);
-     DropoutForward(ocl_Kernel_Fwd, count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+     caffe_gpu_bernoulli((int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_);
+     DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #endif
     // set thresholds
     // NOLINT_NEXT_LINE(whitespace/operators)
@@ -112,9 +105,7 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   //      count, bottom_data, mask, uint_thres_, scale_, top_data);
    // CUDA_POST_KERNEL_CHECK;
   } else {
-    //caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data);
-     if(bottom_data != top_data)
-       OCL_CHECK( clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)bottom_data, (cl_mem)top_data, 0, 0, count*sizeof(Dtype), 0, NULL, NULL) );
+    caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data);
   }
 }
 
@@ -135,7 +126,7 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
        // CAFFE_CUDA_NUM_THREADS>>>(
          // count, top_diff, mask, uint_thres_, scale_, bottom_diff);
     //  CUDA_POST_KERNEL_CHECK;
-       DropoutBackward(ocl_Kernel_Bwd, count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff);
+       DropoutBackward(count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff);
     } else {
       caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
     }
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 3c94f0de..83b18c89 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -13,17 +13,6 @@ namespace caffe {
 using std::min;
 using std::max;
 
-template <typename Dtype>
-PoolingLayer<Dtype>::~PoolingLayer(){
-  OCL_CHECK( clReleaseKernel(MaxPoolForward_kernel) );
-  OCL_CHECK( clReleaseKernel(AvePoolForward_kernel) );
-  OCL_CHECK( clReleaseKernel(StoPoolForwardTrain_kernel) );
-  OCL_CHECK( clReleaseKernel(StoPoolForwardTest_kernel) );
-  OCL_CHECK( clReleaseKernel(MaxPoolBackward_kernel) );
-  OCL_CHECK( clReleaseKernel(AvePoolBackward_kernel) );  
-  OCL_CHECK( clReleaseKernel(StoPoolBackward_kernel) );
-}
-
 template <typename Dtype>
 void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -87,20 +76,8 @@ void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     CHECK_LT(pad_h_, kernel_h_);
     CHECK_LT(pad_w_, kernel_w_);
   }
-  //Intialize OpenCL related
-  ocl_setup();
 }
 
-template <typename Dtype>
- void PoolingLayer<Dtype>::ocl_setup(){
-  MaxPoolForward_kernel = clCreateKernel(amdDevice.Program, "MaxPoolForwardfloat", NULL);
-  AvePoolForward_kernel = clCreateKernel(amdDevice.Program, "AvePoolForwardfloat", NULL);
-  StoPoolForwardTrain_kernel = clCreateKernel(amdDevice.Program, "StoPoolForwardTrainfloat", NULL);
-  StoPoolForwardTest_kernel = clCreateKernel(amdDevice.Program, "StoPoolForwardTestfloat", NULL);
-  MaxPoolBackward_kernel = clCreateKernel(amdDevice.Program, "MaxPoolBackwardfloat", NULL);
-  AvePoolBackward_kernel = clCreateKernel(amdDevice.Program, "AvePoolBackwardfloat", NULL);
-  StoPoolBackward_kernel = clCreateKernel(amdDevice.Program, "StoPoolBackwardfloat", NULL);
-}
 
 template <typename Dtype>
 void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -352,8 +329,7 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       mask = max_idx_.mutable_gpu_data();
     }
     // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolForward(MaxPoolForward_kernel,
-        count, bottom_data, bottom[0]->num(), channels_,
+    MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_,
         height_, width_, pooled_height_, pooled_width_, kernel_h_,
         kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
         mask, top_mask);
@@ -367,8 +343,7 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     break;
  case PoolingParameter_PoolMethod_AVE:
     // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolForward(AvePoolForward_kernel,
-        count, bottom_data, bottom[0]->num(), channels_,
+    AvePoolForward(count, bottom_data, bottom[0]->num(), channels_,
         height_, width_, pooled_height_, pooled_width_, kernel_h_,
         kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
  /*
@@ -384,15 +359,13 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
                             rand_idx_.mutable_gpu_data());
       // NOLINT_NEXT_LINE(whitespace/operators)
-      StoPoolForwardTrain(StoPoolForwardTrain_kernel,
-          count, bottom_data, bottom[0]->num(), channels_,
+      StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_,
           height_, width_, pooled_height_, pooled_width_, kernel_h_,
           kernel_w_, stride_h_, stride_w_,
           rand_idx_.mutable_gpu_data(), top_data);
     } else {
       // NOLINT_NEXT_LINE(whitespace/operators)
-      StoPoolForwardTest(StoPoolForwardTest_kernel,
-          count, bottom_data, bottom[0]->num(), channels_,
+      StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_,
           height_, width_, pooled_height_, pooled_width_, kernel_h_,
           kernel_w_, stride_h_, stride_w_, top_data);
     }
@@ -425,23 +398,20 @@ void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       mask = max_idx_.gpu_data();
     }
     // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolBackward(MaxPoolBackward_kernel,
-        count, top_diff, mask, top_mask, top[0]->num(), channels_,
+    MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_,
         height_, width_, pooled_height_, pooled_width_,
         kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
         bottom_diff);
     break;
   case PoolingParameter_PoolMethod_AVE:
     // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolBackward(AvePoolBackward_kernel,
-        count, top_diff, top[0]->num(), channels_,
+    AvePoolBackward(count, top_diff, top[0]->num(), channels_,
         height_, width_, pooled_height_, pooled_width_, kernel_h_,
         kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
     break;
   case PoolingParameter_PoolMethod_STOCHASTIC:
     // NOLINT_NEXT_LINE(whitespace/operators)
-     StoPoolBackward(StoPoolBackward_kernel,
-        count, rand_idx_.gpu_data(), top_diff,
+     StoPoolBackward(count, rand_idx_.gpu_data(), top_diff,
         top[0]->num(), channels_, height_, width_, pooled_height_,
         pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
         bottom_diff);
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index 94393f73..d3c374f1 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -18,18 +18,8 @@ void PowerLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   scale_ = this->layer_param_.power_param().scale();
   shift_ = this->layer_param_.power_param().shift();
   diff_scale_ = power_  * scale_;
- //OpenCL related set up
-  ocl_setup();
 }
 
-template <typename Dtype>
-void PowerLayer<Dtype>::ocl_setup(){
-   memset_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL);
-   scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
-   div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
-   powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
-   mul_kernel = clCreateKernel(amdDevice.Program, "element_mul_float", NULL);
-}
 
 // Compute y = (shift + scale * x)^power
 template <typename Dtype>
@@ -116,7 +106,7 @@ void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   // Special case where we can ignore the input: scale or power is 0.
   if (diff_scale_ == Dtype(0)) {
     Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
-    ocl_memset(memset_kernel, top_data, value, count);
+    ocl_memset(top_data, value, count);
     return;
   }
   const Dtype* bottom_data = bottom[0]->gpu_data();
@@ -125,10 +115,10 @@ void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     caffe_gpu_scal(count, scale_, top_data);
   }
   if (shift_ != Dtype(0)) {
-    caffe_gpu_add_scalar(scalar_kernel, count, shift_, top_data);
+    caffe_gpu_add_scalar(count, shift_, top_data);
   }
   if (power_ != Dtype(1)) {
-    caffe_gpu_powx(powx_kernel, count, top_data, power_, top_data);
+    caffe_gpu_powx(count, top_data, power_, top_data);
   }
 }
 
@@ -140,7 +130,7 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const int count = bottom[0]->count();
     const Dtype* top_diff = top[0]->gpu_diff();
     if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
-      ocl_memset(memset_kernel, bottom_diff, diff_scale_,count);
+      ocl_memset(bottom_diff, diff_scale_,count);
     } else {
       const Dtype* bottom_data = bottom[0]->gpu_data();
       // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
@@ -152,7 +142,7 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
             Dtype(0), bottom_diff);
         if (shift_ != Dtype(0)) {
-          caffe_gpu_add_scalar(scalar_kernel, count, diff_scale_ * shift_, bottom_diff);
+          caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
         }
       } else if (shift_ == Dtype(0)) {
         // Special case for y = (scale * x)^power
@@ -160,7 +150,7 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         //              = scale * power * (scale * x)^power * (scale * x)^(-1)
         //              = power * y / x
         const Dtype* top_data = top[0]->gpu_data();
-        caffe_gpu_div(div_kernel, count, top_data, bottom_data, bottom_diff);
+        caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
         caffe_gpu_scal(count, power_, bottom_diff);
       } else {
         caffe_gpu_copy(count, bottom_data, bottom_diff);
@@ -168,16 +158,16 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
           caffe_gpu_scal(count, scale_, bottom_diff);
         }
         if (shift_ != Dtype(0)) {
-            caffe_gpu_add_scalar(scalar_kernel, count, shift_, bottom_diff);
+            caffe_gpu_add_scalar(count, shift_, bottom_diff);
         }
         const Dtype* top_data = top[0]->gpu_data();
-        caffe_gpu_div(div_kernel, count, top_data, bottom_diff, bottom_diff);
+        caffe_gpu_div(count, top_data, bottom_diff, bottom_diff);
         if (diff_scale_ != Dtype(1)) {
           caffe_gpu_scal(count, diff_scale_, bottom_diff);
         }
       }
     }
-    caffe_gpu_mul(mul_kernel, count, top_diff, bottom_diff, bottom_diff);
+    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
   }
 }
 
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 07c2fcfc..27c18b7b 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -24,25 +24,9 @@ void SoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   scale_.Reshape(scale_dims);
 }
 
-template <typename Dtype>
-void SoftmaxLayer<Dtype>::ocl_setup(){
-    cl_int err = 0;
-    channel_max_kernel  = clCreateKernel(amdDevice.Program, "kernel_channel_max_float", &err);
-    channel_subtract_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_subtract_float", &err);;
-    exp_kernel = clCreateKernel(amdDevice.Program, "kernel_exp_float", &err);;
-    channel_sum_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_sum_float", &err);;
-    channel_div_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_div_float", &err);;
-    channel_dot_kernel = clCreateKernel(amdDevice.Program, "kernel_channel_dot_float", &err);;
-}
 
 template <typename Dtype>
 SoftmaxLayer<Dtype>::~SoftmaxLayer(){
-  clReleaseKernel(channel_max_kernel);
-  clReleaseKernel(channel_subtract_kernel);
-  clReleaseKernel(exp_kernel);
-  clReleaseKernel(channel_sum_kernel);
-  clReleaseKernel(channel_div_kernel);
-  clReleaseKernel(channel_dot_kernel);
 }
 
 template <typename Dtype>
@@ -122,22 +106,22 @@ void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   // compute max
   // NOLINT_NEXT_LINE(whitespace/operators)
  
-  kernel_channel_max<Dtype>(channel_max_kernel, outer_num_, channels, inner_num_, top_data,
+  kernel_channel_max<Dtype>(outer_num_, channels, inner_num_, top_data,
       scale_data);
   // subtract
   // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype>(channel_subtract_kernel, count, outer_num_, channels, inner_num_,
+  kernel_channel_subtract<Dtype>(count, outer_num_, channels, inner_num_,
       scale_data, top_data);
   // exponentiate
   // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_exp<Dtype>(exp_kernel, count, top_data, top_data);
+  kernel_exp<Dtype>(count, top_data, top_data);
   // sum after exp
   // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_sum<Dtype>(channel_sum_kernel, outer_num_, channels, inner_num_, top_data,
+  kernel_channel_sum<Dtype>(outer_num_, channels, inner_num_, top_data,
       scale_data);
   // divide
   // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_div<Dtype>(channel_div_kernel, count, outer_num_, channels, inner_num_,
+  kernel_channel_div<Dtype>(count, outer_num_, channels, inner_num_,
       scale_data, top_data);
 }
 
@@ -154,10 +138,10 @@ void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
   // NOLINT_NEXT_LINE(whitespace/operators)
  
-  kernel_channel_dot<Dtype>(channel_dot_kernel, outer_num_, channels, inner_num_,
+  kernel_channel_dot<Dtype>(outer_num_, channels, inner_num_,
       top_diff, top_data, scale_data);
   // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype>(channel_subtract_kernel, count, outer_num_, channels, inner_num_,
+  kernel_channel_subtract<Dtype>(count, outer_num_, channels, inner_num_,
       scale_data, bottom_diff);
   // elementwise multiplication
   caffe_gpu_mul<Dtype>(top[0]->count(), bottom_diff, top_data, bottom_diff);
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 4b091d3a..a3cca01c 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -34,22 +34,12 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
 
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::ocl_setup(){
-   cl_int err=0;
-   scal_kernel = clCreateKernel(amdDevice.Program, "scal_float", &err);
-   diff_kernel = clCreateKernel(amdDevice.Program, "diff_float", &err);
-   softmax_kernel = clCreateKernel(amdDevice.Program, "softmax_float", &err);
    d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, sizeof(Dtype), NULL, NULL);
 
-   softmax_loss_fp_kernel = clCreateKernel(amdDevice.Program, "softmax_loss_fp_float", &err);
-   softmax_loss_bp_kernel = clCreateKernel(amdDevice.Program, "softmax_loss_bp_float", &err);
 }
 
 template <typename Dtype>
 SoftmaxWithLossLayer<Dtype>::~SoftmaxWithLossLayer(){
-  clReleaseKernel(diff_kernel);
-  clReleaseKernel(scal_kernel);
-  clReleaseKernel(softmax_loss_fp_kernel);
-  clReleaseKernel(softmax_loss_bp_kernel);
 }
 
 template <typename Dtype>
@@ -158,7 +148,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
   // to avoid having to allocate additional GPU memory.
   Dtype* counts = prob_.mutable_gpu_diff();
   // NOLINT_NEXT_LINE(whitespace/operators)
-  SoftmaxLossForwardGPU<Dtype>(softmax_loss_fp_kernel, nthreads, prob_data, label, loss_data,
+  SoftmaxLossForwardGPU<Dtype>( nthreads, prob_data, label, loss_data,
        outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
   Dtype loss;
   caffe_gpu_asum(nthreads, loss_data, &loss);
@@ -195,7 +185,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     // we use to to avoid allocating new GPU memory.
     Dtype* counts = prob_.mutable_gpu_diff();
     // NOLINT_NEXT_LINE(whitespace/operators)
-    SoftmaxLossBackwardGPU<Dtype>(softmax_loss_bp_kernel, nthreads, top_data, label, bottom_diff,
+    SoftmaxLossBackwardGPU<Dtype>(nthreads, top_data, label, bottom_diff,
            outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
     const Dtype loss_weight = top[0]->cpu_diff()[0];
     if (normalize_) {
diff --git a/src/caffe/ocl/OCL_kernel.cl b/src/caffe/ocl/OCL_kernel.cl
index 7014721b..bc5eabff 100644
--- a/src/caffe/ocl/OCL_kernel.cl
+++ b/src/caffe/ocl/OCL_kernel.cl
@@ -718,9 +718,9 @@ __kernel void PRNG_threefry4x32(
 }
 
 
-template __attribute__((mangled_name(RNGBernoulliFloat))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
+template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
 
-template __attribute__((mangled_name(RNGBernoulliDouble))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
+template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
 
 //end of the looooooong gpu_random_generator kernel 
 
@@ -733,8 +733,8 @@ __kernel void OCL_memset(__global T* buffer, const T value, const int size){
 	}
 }
 
-template __attribute__((mangled_name(oclmemfloat))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
-template __attribute__((mangled_name(oclmemdouble))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
+template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
+template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
 
 __kernel void OCL_memset2(__global int* buffer, const int value, const int size){
         int gdx = get_global_id(0);
diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl
index 3b1c479b..8d3db447 100644
--- a/src/caffe/ocl/dropout_layer.cl
+++ b/src/caffe/ocl/dropout_layer.cl
@@ -4,8 +4,8 @@ __kernel void DropoutForward(const int n, __global T *in, __global const int* ma
     if (index < n)
         out[index] = in[index] * scale * mask[index];
 }
-template __attribute__((mangled_name(DropoutForwardfloat))) __kernel void DropoutForward(const int n, __global float* in,  __global const int* mask, const float scale, __global float* out); 
-template __attribute__((mangled_name(DropoutForwarddouble))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out);
+template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in,  __global const int* mask, const float scale, __global float* out); 
+template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out);
 
 
 template <class T>
@@ -14,5 +14,5 @@ __kernel void DropoutBackward(const int n, __global T *in_diff, __global const i
     if (index < n)
         out_diff[index] = in_diff[index] * scale * mask[index];
 }
-template __attribute__((mangled_name(DropoutBackwardfloat))) __kernel void DropoutBackward(const int n, __global float* in_diff,  __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); 
-template __attribute__((mangled_name(DropoutBackwarddouble))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
+template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff,  __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); 
+template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
index 5ac4bd52..80289b68 100644
--- a/src/caffe/ocl/pooling_layer.cl
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -33,8 +33,8 @@ __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const
     }
   }
 }
-template __attribute__((mangled_name(MaxPoolForwardfloat))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
-template __attribute__((mangled_name(MaxPoolForwarddouble))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,  const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
+template __attribute__((mangled_name(MaxPoolForward_float))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
+template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,  const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
 
 template <class T>
 __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){
@@ -103,8 +103,8 @@ __kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, c
     }
     }
 }
-template __attribute__((mangled_name(StoPoolForwardTrainfloat))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
-template __attribute__((mangled_name(StoPoolForwardTrainDouble))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
+template __attribute__((mangled_name(StoPoolForwardTrain_float))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTrain_double))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
 
 template <class T>
 __kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){
@@ -132,8 +132,8 @@ __kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const
     }
     top_data[index] = cumvalues / cumsum;  }
 }
-template __attribute__((mangled_name(StoPoolForwardTestfloat))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
-template __attribute__((mangled_name(StoPoolForwardTestdouble))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
+template __attribute__((mangled_name(StoPoolForwardTest_float))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
+template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
 
 template <class T>
 __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
@@ -182,8 +182,8 @@ __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
     bottom_diff[index] = gradient;
   }
 }
-template __attribute__((mangled_name(MaxPoolBackwardfloat))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
-template __attribute__((mangled_name(MaxPoolBackwarddouble))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
+template __attribute__((mangled_name(MaxPoolBackward_float))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
+template __attribute__((mangled_name(MaxPoolBackward_double))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
 
 template <class T>
 __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){
@@ -215,8 +215,8 @@ __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const in
    }
 }
 
-template __attribute__((mangled_name(AvePoolBackwardfloat))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
-template __attribute__((mangled_name(AvePoolBackwarddouble))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
+template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
+template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
 
 template <class Dtype>
 void StoPoolBackward(const int nthreads,
@@ -253,13 +253,13 @@ void StoPoolBackward(const int nthreads,
 
 	  }
 }
-template __attribute__ ((mangled_name(StoPoolBackwardfloat))) __kernel  void StoPoolBackward<float>(const int nthreads,
+template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel  void StoPoolBackward<float>(const int nthreads,
     __global float* rand_idx, __global float* top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
     const int stride_w, __global float* bottom_diff);
-template __attribute__ ((mangled_name(StoPoolBackwarddouble))) __kernel void StoPoolBackward<double>(const int nthreads,
+template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward<double>(const int nthreads,
     __global double* rand_idx, __global double* top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl
index c9ba4900..df26d66e 100644
--- a/src/caffe/ocl/relu_layer.cl
+++ b/src/caffe/ocl/relu_layer.cl
@@ -5,8 +5,8 @@ __kernel void ReLUForward(const int count, __global T* in, __global T* out, T ne
 		out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
 }
 
-template __attribute__ ((mangled_name(ReLUForwardFloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
-template __attribute__ ((mangled_name(ReLUForwardDouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
+template __attribute__ ((mangled_name(ReLUForward_float))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
+template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
 
 template <class T>
 __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
@@ -15,5 +15,5 @@ __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_
 		out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope;
 }
 
-template __attribute__ ((mangled_name(ReLUBackwardFloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
-template __attribute__ ((mangled_name(ReLUBackwardDouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
+template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
+template __attribute__ ((mangled_name(ReLUBackward_double))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl
index 6d6e4f0b..97eb6874 100644
--- a/src/caffe/ocl/softmaxwithloss_layer.cl
+++ b/src/caffe/ocl/softmaxwithloss_layer.cl
@@ -20,12 +20,12 @@ __kernel void SoftmaxLossForwardGPU(const int nthreads,
   }
 }
 
-template __attribute__ ((mangled_name(softmax_loss_fp_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
+template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
           __global float* prob_data, __global float* label,__global float* loss,
           int num, int dim, int spatial_dim,
           bool has_ignore_label_, int ignore_label_,
           __global float* counts);
-template __attribute__ ((mangled_name(softmax_loss_fp_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
+template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
           __global double* prob_data, __global double* label,__global double* loss,
           int num, int dim, int spatial_dim,
           bool has_ignore_label_, int ignore_label_,
@@ -54,12 +54,12 @@ __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
     }
   }
 }
-template __attribute__ ((mangled_name(softmax_loss_bp_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
+template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
           __global float* label,__global float* bottom_diff, int num, int dim,
           int spatial_dim, bool has_ignore_label_,
           int ignore_label_, float* counts);
 
-template __attribute__ ((mangled_name(softmax_loss_bp_double)))  __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
+template __attribute__ ((mangled_name(SoftmaxLossBackward_double)))  __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
           __global double* label,__global double* bottom_diff, int num, int dim,
           int spatial_dim, bool has_ignore_label_,
           int ignore_label_, double* counts);
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 715297a6..63c8294c 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -766,7 +766,7 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   case Caffe::GPU: {
 #ifndef CPU_ONLY
     // compute square of gradient in update
-    caffe_gpu_powx(powx_kernel, net_params[param_id]->count(),
+    caffe_gpu_powx(net_params[param_id]->count(),
         net_params[param_id]->gpu_diff(), Dtype(2),
         this->update_[param_id]->mutable_gpu_data());
 
@@ -777,14 +777,14 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
         this->history_[param_id]->mutable_gpu_data());
 
     // prepare update
-    caffe_gpu_powx(powx_kernel, net_params[param_id]->count(),
+    caffe_gpu_powx( net_params[param_id]->count(),
               this->history_[param_id]->gpu_data(), Dtype(0.5),
               this->update_[param_id]->mutable_gpu_data());
 
-    caffe_gpu_add_scalar<Dtype>(scalar_kernel, net_params[param_id]->count(),
+    caffe_gpu_add_scalar<Dtype>(net_params[param_id]->count(),
              delta, this->update_[param_id]->mutable_gpu_data());
 
-    caffe_gpu_div(div_kernel, net_params[param_id]->count(),
+    caffe_gpu_div(net_params[param_id]->count(),
               net_params[param_id]->gpu_diff(),
               this->update_[param_id]->gpu_data(),
               this->update_[param_id]->mutable_gpu_data());
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index eef9f544..044f9e69 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -8,10 +8,12 @@
 #include "caffe/common.hpp"
 #include "caffe/util/ocl_util.hpp"
 namespace caffe {
-
+template <typename dtype> extern std::string get_dtype_suffix();
 
 template <typename Dtype>
-void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int count){
+void ocl_memset(Dtype* buffer, const Dtype value, const int count){
+    std::string kernel_name = std::string("oclmem") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int err=0;
     err=clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer);
     err|=clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&value);
@@ -25,8 +27,8 @@ void ocl_memset(cl_kernel Kernel, Dtype* buffer, const Dtype value, const int co
 }
 
 // Explicit instantiation
-template void ocl_memset<float>(cl_kernel Kernel, float* buffer, const float value, const int count);
-template void ocl_memset<double>(cl_kernel Kernel, double* buffer, const double value, const int count);
+template void ocl_memset<float>(float* buffer, const float value, const int count);
+template void ocl_memset<double>(double* buffer, const double value, const int count);
 
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index f5f7e945..a9563c14 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -11,8 +11,27 @@
 namespace caffe {
 typedef unsigned int uint32_t;
 struct array4x32 {  uint32_t v[4]; };
+
+template <typename dtype> std::string get_dtype_suffix()
+{
+    dtype x;
+    const char type = typeid(x).name()[0];
+    std::string suffix;
+    switch(type){
+        case 'i': suffix = "_int"; break;
+        case 'd': suffix = "_double"; break;
+        case 'f': 
+        default: suffix = "_float";
+    }
+    return suffix;
+}
+
 template <typename Dtype>
-void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold){
+void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold)
+{
+        std::string kernel_name = std::string("RNGBernoulli") + get_dtype_suffix<Dtype>();
+        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
         static unsigned c = 0;
         unsigned nrounds = 20;
         array4x32  rndctr4;
@@ -33,8 +52,8 @@ void caffe_gpu_bernoulli(cl_kernel ker_rand, int* a, const unsigned int n, Dtype
         size_t localws[1] = {256};
         OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
 }
-template void caffe_gpu_bernoulli<float>(cl_kernel kernel, int* a, const unsigned int n, float inf, float sup, float threshold);
-template void caffe_gpu_bernoulli<double>(cl_kernel kernel, int* a, const unsigned int n, double inf, double sup, double threshold);
+template void caffe_gpu_bernoulli<float>(int* a, const unsigned int n, float inf, float sup, float threshold);
+template void caffe_gpu_bernoulli<double>(int* a, const unsigned int n, double inf, double sup, double threshold);
 
 
 template <typename Dtype>
@@ -134,9 +153,12 @@ template float softmax_gpu<float>(cl_kernel Kernel, const int num, const int dim
 template double softmax_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss);
 
 template <typename Dtype>
-void kernel_channel_max(cl_kernel Kernel, const int num, const int channels,
+void kernel_channel_max(const int num, const int channels,
     const int spatial_dim, const Dtype* data, Dtype* out)
 {
+    std::string kernel_name = std::string("kernel_channel_max") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
@@ -148,15 +170,19 @@ void kernel_channel_max(cl_kernel Kernel, const int num, const int channels,
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-template void kernel_channel_max<float>(cl_kernel Kernel, const int num, const int channels,
+template void kernel_channel_max<float>( const int num, const int channels,
     const int spatial_dim, const float* data, float* out);
-template void kernel_channel_max<double>(cl_kernel Kernel, const int num, const int channels,
+template void kernel_channel_max<double>( const int num, const int channels,
     const int spatial_dim, const double* data, double* out);
 
 template <typename Dtype>
-void kernel_channel_subtract(cl_kernel Kernel, const int count,
+void kernel_channel_subtract( const int count,
     const int num, const int channels,
-    const int spatial_dim, const Dtype* channel_max, Dtype* data){
+    const int spatial_dim, const Dtype* channel_max, Dtype* data)
+{
+    std::string kernel_name = std::string("kernel_channel_subtract") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) );
@@ -169,16 +195,19 @@ void kernel_channel_subtract(cl_kernel Kernel, const int count,
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-template void kernel_channel_subtract<float>(cl_kernel Kernel, const int count,
+template void kernel_channel_subtract<float>( const int count,
     const int num, const int channels,
     const int spatial_dim, const float* channel_max, float* data);
-template void kernel_channel_subtract<double>(cl_kernel Kernel, const int count,
+template void kernel_channel_subtract<double>( const int count,
     const int num, const int channels,
     const int spatial_dim, const double* channel_max, double* data);
 
 template <typename Dtype>
-void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out)
+void kernel_exp(const int count, const Dtype* data, Dtype* out)
 {
+    std::string kernel_name = std::string("kernel_exp") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
@@ -188,13 +217,16 @@ void kernel_exp(cl_kernel Kernel, const int count, const Dtype* data, Dtype* out
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-template void kernel_exp<float>(cl_kernel Kernel, const int count, const float* data, float* out);
-template void kernel_exp<double>(cl_kernel Kernel, const int count, const double* data, double* out);
+template void kernel_exp<float>(const int count, const float* data, float* out);
+template void kernel_exp<double>(const int count, const double* data, double* out);
 
 template <typename Dtype>
-void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels,
+void kernel_channel_sum(const int num, const int channels,
     const int spatial_dim, const Dtype* data, Dtype* channel_sum)
 {
+    std::string kernel_name = std::string("kernel_channel_sum") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
@@ -206,13 +238,16 @@ void kernel_channel_sum(cl_kernel Kernel, const int num, const int channels,
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-template void kernel_channel_sum<float>(cl_kernel Kernel, const int num, const int channels, const int spatial_dim, const float* data, float* channel_sum);
-template void kernel_channel_sum<double>(cl_kernel Kernel, const int num, const int channels, const int spatial_dim, const double* data, double* channel_sum);
+template void kernel_channel_sum<float>(const int num, const int channels, const int spatial_dim, const float* data, float* channel_sum);
+template void kernel_channel_sum<double>(const int num, const int channels, const int spatial_dim, const double* data, double* channel_sum);
 
 template <typename Dtype>
-void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const int channels,
+void kernel_channel_div(const int count, const int num, const int channels,
     const int spatial_dim, const Dtype* channel_sum, Dtype* data)
 {
+    std::string kernel_name = std::string("kernel_channel_div") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) );
@@ -225,16 +260,19 @@ void kernel_channel_div(cl_kernel Kernel, const int count, const int num, const
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-template  void kernel_channel_div<float>(cl_kernel Kernel, const int count, const int num, const int channels,
+template  void kernel_channel_div<float>(const int count, const int num, const int channels,
     const int spatial_dim, const float* channel_sum, float* data);
-template  void kernel_channel_div<double>(cl_kernel Kernel, const int count, const int num, const int channels,
+template  void kernel_channel_div<double>(const int count, const int num, const int channels,
     const int spatial_dim, const double* channel_sum, double* data);
 
 template <typename Dtype>
-void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels,
+void kernel_channel_dot(const int num, const int channels,
     const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
     Dtype* channel_dot)
 {
+    std::string kernel_name = std::string("kernel_channel_dot") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
@@ -247,19 +285,22 @@ void kernel_channel_dot(cl_kernel Kernel, const int num, const int channels,
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-template void kernel_channel_dot<float>(cl_kernel Kernel, const int num, const int channels,
+template void kernel_channel_dot<float>(const int num, const int channels,
     const int spatial_dim, const float* data_1, const float* data_2, float* channel_dot);
-template void kernel_channel_dot<double>(cl_kernel Kernel, const int num, const int channels,
+template void kernel_channel_dot<double>(const int num, const int channels,
     const int spatial_dim, const double* data_1, const double* data_2, double* channel_dot);
 
 
 template <typename Dtype>
-void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads,
+void SoftmaxLossForwardGPU(const int nthreads,
           const Dtype* prob_data, const Dtype* label, Dtype* loss,
           const int num, const int dim, const int spatial_dim,
           const bool has_ignore_label_, const int ignore_label_,
           Dtype* counts)
 {
+    std::string kernel_name = std::string("SoftmaxLossForwardGPU") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int),  (void*)&nthreads));
     OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem),  (void*)&prob_data));
     OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem),  (void*)&label));
@@ -276,17 +317,20 @@ void SoftmaxLossForwardGPU(cl_kernel Kernel, const int nthreads,
    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void SoftmaxLossForwardGPU<float>(cl_kernel Kernel, const int nthreads, const float* prob_data, const float* label, float* loss,
+template void SoftmaxLossForwardGPU<float>(const int nthreads, const float* prob_data, const float* label, float* loss,
           const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,float* counts);
-template void SoftmaxLossForwardGPU<double>(cl_kernel Kernel, const int nthreads, const double* prob_data, const double* label, double* loss,
+template void SoftmaxLossForwardGPU<double>(const int nthreads, const double* prob_data, const double* label, double* loss,
           const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,double* counts);
 
 template <typename Dtype>
-void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* top,
+void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
           const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
           const int spatial_dim, const bool has_ignore_label_,
           const int ignore_label_, Dtype* counts)
 {
+    std::string kernel_name = std::string("SoftmaxLossBackwardGPU") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int),  (void*)&nthreads));
     OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem),  (void*)&top));
     OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem),  (void*)&label));
@@ -303,9 +347,9 @@ void SoftmaxLossBackwardGPU(cl_kernel Kernel, const int nthreads, const Dtype* t
    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void SoftmaxLossBackwardGPU<float>(cl_kernel Kernel, const int nthreads, const float* top, const float* label, float* bottom_diff, 
+template void SoftmaxLossBackwardGPU<float>(const int nthreads, const float* top, const float* label, float* bottom_diff, 
                        const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, float* counts);
-template void SoftmaxLossBackwardGPU<double>(cl_kernel Kernel, const int nthreads, const double* top, const double* label, double* bottom_diff, 
+template void SoftmaxLossBackwardGPU<double>(const int nthreads, const double* top, const double* label, double* bottom_diff, 
                        const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, double* counts);
 
 template <typename Dtype>
@@ -364,8 +408,11 @@ template  void max_pool_fp_gpu<float>(cl_kernel Kernel, const int count, const f
 template  void max_pool_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* top_data);
 
 template <typename Dtype>
-void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){
-    cl_int ret;
+void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){
+     std::string kernel_name = std::string("MaxPoolForward") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+   
+     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
@@ -390,11 +437,14 @@ void MaxPoolForward(cl_kernel Kernel, const int count, const Dtype* bottom_data,
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void MaxPoolForward<float>(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data, int* mask, float* top_mask);
-template void MaxPoolForward<double>(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data, int* mask, double* top_mask);
+template void MaxPoolForward<float>(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data, int* mask, float* top_mask);
+template void MaxPoolForward<double>(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data, int* mask, double* top_mask);
 
 template <typename Dtype>
-void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data){
+void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data)
+{
+    std::string kernel_name = std::string("StoPoolForwardTrain") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
@@ -416,11 +466,14 @@ void StoPoolForwardTrain(cl_kernel Kernel,const int count, const Dtype* bottom_d
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
-template void StoPoolForwardTrain<float>(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
-template void StoPoolForwardTrain<double>(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* idx_data, double* top_data);
+template void StoPoolForwardTrain<float>(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
+template void StoPoolForwardTrain<double>(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* idx_data, double* top_data);
 
 template <typename Dtype>
-void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){
+void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){
+    std::string kernel_name = std::string("StoPoolForwardTest") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
@@ -442,11 +495,13 @@ void StoPoolForwardTest(cl_kernel Kernel,const int count, const Dtype* bottom_da
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
-template void StoPoolForwardTest<float>(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* top_data);
-template void StoPoolForwardTest<double>(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* top_data);
+template void StoPoolForwardTest<float>(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* top_data);
+template void StoPoolForwardTest<double>(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* top_data);
 
 template <typename Dtype>
-void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){
+void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){
+        std::string kernel_name = std::string("AvePoolForward") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
@@ -469,8 +524,8 @@ void AvePoolForward(cl_kernel Kernel,const int count, const Dtype* bottom_data,
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
-template void AvePoolForward<float>(cl_kernel Kernel,const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data);
-template void AvePoolForward<double>(cl_kernel Kernel,const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data);
+template void AvePoolForward<float>(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data);
+template void AvePoolForward<double>(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data);
 
 template <typename Dtype> 
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data){
@@ -524,7 +579,9 @@ template void max_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const fl
 template void max_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const double* top_data, const double* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* bottom_diff );
 
 template <typename Dtype>
-void MaxPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){
+void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){
+        std::string kernel_name = std::string("MaxPoolBackward") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
@@ -550,12 +607,15 @@ void MaxPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const to
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
-template void MaxPoolBackward<float>(cl_kernel kernel, const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff);
-template void MaxPoolBackward<double>(cl_kernel kernel, const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff);
+template void MaxPoolBackward<float>(const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff);
+template void MaxPoolBackward<double>(const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff);
 
 template <typename Dtype>
-void AvePoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff)
+void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff)
 {
+    std::string kernel_name = std::string("AvePoolBackward") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
@@ -578,11 +638,13 @@ void AvePoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const to
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
-template void AvePoolBackward<float>(cl_kernel kernel, const int nthreads, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff);
-template void AvePoolBackward<double>(cl_kernel kernel, const int nthreads, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff);
+template void AvePoolBackward<float>(const int nthreads, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff);
+template void AvePoolBackward<double>(const int nthreads, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff);
 
 template <typename Dtype>
-void StoPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){
+void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){
+        std::string kernel_name = std::string("StoPoolBackward") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&rand_idx);
@@ -604,8 +666,8 @@ void StoPoolBackward(cl_kernel Kernel, const int nthreads, const Dtype* const ra
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
-template void StoPoolBackward<float>(cl_kernel kernel, const int nthreads, const float* const rand_idx, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, float* const bottom_diff);
-template void StoPoolBackward<double>(cl_kernel kernel, const int nthreads, const double* const rand_idx, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, double* const bottom_diff);
+template void StoPoolBackward<float>(const int nthreads, const float* const rand_idx, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, float* const bottom_diff);
+template void StoPoolBackward<double>(const int nthreads, const double* const rand_idx, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, double* const bottom_diff);
 
 template <typename Dtype> 
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff){
@@ -634,9 +696,7 @@ template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const d
 
 template <typename Dtype> 
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
-    Dtype type;
-    std::string str_type = (typeid(type).name()[0]=='f')?"Float":"Double";
-    std::string kernel_name = std::string("ReLUForward")+str_type;
+    std::string kernel_name = std::string("ReLUForward") + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
@@ -654,9 +714,7 @@ template void ReLUForward<double>(const int count, const double* bottom_data, do
 
 template <typename Dtype> 
 void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){
-    Dtype type;
-    std::string str_type = (typeid(type).name()[0]=='f')?"Float":"Double";
-    std::string kernel_name = std::string("ReLUBackward")+str_type;
+    std::string kernel_name = std::string("ReLUBackward") + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
   
     cl_int ret;
@@ -824,7 +882,9 @@ template void caffe_gpu_sign<float>(cl_kernel Kernel,const int N,  const float*
 template void caffe_gpu_sign<double>(cl_kernel Kernel,const int N,  const double* X, double* Y );
 
 template <typename Dtype>
-void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){
+void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){
+    std::string kernel_name = std::string("div") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
@@ -836,11 +896,13 @@ void caffe_gpu_div (cl_kernel Kernel, const int n, const Dtype* a, const Dtype*
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void caffe_gpu_div<float> (cl_kernel Kernel, const int n, const float* a, const float* b, float* y);
-template void caffe_gpu_div<double> (cl_kernel Kernel, const int n, const double* a, const double* b, double* y);
+template void caffe_gpu_div<float> (const int n, const float* a, const float* b, float* y);
+template void caffe_gpu_div<double> (const int n, const double* a, const double* b, double* y);
 
 template <typename Dtype>
-void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtype* top_data){
+void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){
+     std::string kernel_name = std::string("add_scalar") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_float), (void*)&alpha);
@@ -851,11 +913,14 @@ void caffe_gpu_add_scalar(cl_kernel Kernel, const int n, const Dtype alpha, Dtyp
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void caffe_gpu_add_scalar<float> (cl_kernel Kernel, const int n, const float alpha, float* top_data);
-template void caffe_gpu_add_scalar<double> (cl_kernel Kernel, const int n, const double alpha, double* top_data);
+template void caffe_gpu_add_scalar<float> (const int n, const float alpha, float* top_data);
+template void caffe_gpu_add_scalar<double> (const int n, const double alpha, double* top_data);
 
 template <typename Dtype>
-void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype* b, Dtype* y){
+void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){
+        std::string kernel_name = std::string("element_mul") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
@@ -867,11 +932,13 @@ void caffe_gpu_mul (cl_kernel Kernel, const int n, const Dtype* a, const Dtype*
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void caffe_gpu_mul<float> (cl_kernel Kernel, const int n, const float* a, const float* b, float* y);
-template void caffe_gpu_mul<double> (cl_kernel Kernel, const int n, const double* a, const double* b, double* y);
+template void caffe_gpu_mul<float> (const int n, const float* a, const float* b, float* y);
+template void caffe_gpu_mul<double> (const int n, const double* a, const double* b, double* y);
 
 template <typename Dtype>
-void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype alpha, Dtype* y){
+void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){
+       std::string kernel_name = std::string("powx") + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
@@ -883,12 +950,15 @@ void caffe_gpu_powx (cl_kernel Kernel, const int n, const Dtype* a, const Dtype
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void caffe_gpu_powx<float> (cl_kernel Kernel, const int n, const float* a, const float alpha, float* y);
-template void caffe_gpu_powx<double> (cl_kernel Kernel, const int n, const double* a, const double alpha, double* y);
+template void caffe_gpu_powx<float> (const int n, const float* a, const float alpha, float* y);
+template void caffe_gpu_powx<double> (const int n, const double* a, const double alpha, double* y);
 
 template <typename Dtype>
-void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
+void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
 {
+    std::string kernel_name = std::string("DropoutForward") + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
     cl_int ret;
     ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
     ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data);
@@ -902,12 +972,15 @@ void DropoutForward(cl_kernel kernel, const int count, const Dtype* bottom_data,
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void DropoutForward<float>(cl_kernel kernel, const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
-template void DropoutForward<double>(cl_kernel kernel, const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
+template void DropoutForward<float>(const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
+template void DropoutForward<double>(const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
 
 template <typename Dtype>
-void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
+void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
 {
+    std::string kernel_name = std::string("DropoutBackward") + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
     cl_int ret;
     ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
     ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
@@ -921,7 +994,7 @@ void DropoutBackward(cl_kernel kernel, const int count, const Dtype* top_diff, c
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
-template void DropoutBackward<float>(cl_kernel kernel, const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
-template void DropoutBackward<double>(cl_kernel kernel, const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
+template void DropoutBackward<float>(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
+template void DropoutBackward<double>(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
 }  // namespace caffe
 

From b904fcfbe9a638058263f0cd10538338f943ca02 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 9 Aug 2015 17:09:01 +0800
Subject: [PATCH 032/124] clean the ocl wrappers in conv_layer; check the type
 of files to be built in ./src/caffe/ocl

---
 include/caffe/util/im2col.hpp        |  8 ++--
 include/caffe/util/ocl_wrapper.hpp   |  4 +-
 include/caffe/vision_layers.hpp      | 16 +++----
 src/caffe/device.cpp                 | 10 +++--
 src/caffe/layers/base_conv_layer.cpp | 16 ++++---
 src/caffe/ocl/OCL_kernel.cl          |  8 ++--
 src/caffe/ocl/im2col.cl              | 20 ++++-----
 src/caffe/util/im2col.cpp            | 45 ++++++++++++-------
 src/caffe/util/ocl_wrapper.cpp       | 66 +++++++++++++++-------------
 9 files changed, 108 insertions(+), 85 deletions(-)

diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index 5eb28f9a..aec9e330 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -16,7 +16,7 @@ void col2im_cpu(const Dtype* data_col, const int channels,
     const int stride_w, Dtype* data_im);
 
 template <typename Dtype>
-void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
+void col2im_gpu(const Dtype* data_col, const int col_offset,
     const int height, const int width, const int channels,
     const int patch_h, const int patch_w,
     const int pad_h, const int pad_w,
@@ -24,7 +24,7 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
     Dtype* data_im, const int img_offset);
 
 template <typename Dtype>
-void im2col_gpu(cl_kernel Kernel,  const Dtype* data_im, const int img_offset, const int channels,
+void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
@@ -53,7 +53,7 @@ void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
     const int stride, Dtype* data_col, const int col_offset);
 
 template <typename Dtype>
-void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, Dtype* data_col, const int col_offset, int optnum);
 
@@ -63,7 +63,7 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, c
     const int stride, Dtype* data_im, const int img_offset);
 
 template <typename Dtype>
-void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
+void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, Dtype* data_im, const int img_offset, int optnum);
 
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 71e13b2e..dbe2eb49 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -10,10 +10,10 @@ typedef unsigned int uint32_t;
 //void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
 
 template <typename Dtype>
-void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num);
+void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num);
 
 template <typename Dtype>
-void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels,
+void opttrans(const Dtype* data_im, const int im_offset, const int channels,
     const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum);
 
 template <typename Dtype>
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index a1c9577d..237e9cbf 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -98,26 +98,26 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   }
 #ifndef CPU_ONLY
   inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
-     im2col_gpu(im2col_gpu_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
+     im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
            kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, 0);
   }
   inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
-    col2im_gpu(col2im_gpu_kernel, col_buff, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
+    col2im_gpu(col_buff, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_);
   }
   inline void conv_im2col_gpu_opt(const Dtype* data) {
-     im2col_gpu_opt(im2col_opt_kernel, data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
+     im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
            kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2);
   }
   inline void conv_col2im_gpu_opt( Dtype* data) {
-    col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
+    col2im_gpu_opt((Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
 }
   inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
-    transform_gpu(ocl_Kernel_transform, (Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2);
+    transform_gpu((Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2);
 }
  inline void conv_transpose_gpu(const Dtype* data){
-    opttrans(opttrans_kernel, data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
+    opttrans(data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
 }
 protected:
   inline void gpu_memset(Dtype* data, Dtype value, int count) {
@@ -142,10 +142,6 @@ class BaseConvolutionLayer : public Layer<Dtype> {
       const vector<Blob<Dtype>*>& top,  bool skip_im2col = false) ;
   void backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  cl_kernel im2col_gpu_kernel, col2im_gpu_kernel;
-  cl_kernel im2col_opt_kernel, col2im_opt_kernel, opttrans_kernel;
-  cl_kernel oclmem_kernel;
-  cl_kernel ocl_Kernel_transpose, ocl_Kernel_transform;
   int opt_num2;
   int M_, N_, K_;
   int weight_offset_;
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index c10ddf25..23c3789b 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -182,9 +182,13 @@ void Device::BuildProgram(std::string kernel_dir)
     while((dirp = readdir(ocl_dir)) != NULL)
     {  
         //Ignore hidden files
-        if(dirp->d_name[0] == '.')
-            continue;
-        std::string ocl_kernel_full_path=kernel_dir+std::string(dirp->d_name);
+        if(dirp->d_name[0] == '.') continue;
+        std::string file_name = std::string(dirp->d_name);
+        //Skip non *.cl files
+        size_t last_dot_pos = file_name.find_last_of(".");
+        if(file_name.substr(last_dot_pos+1) != "cl") continue;
+
+        std::string ocl_kernel_full_path=kernel_dir+file_name;
         std::string tmpSource = "";
         ConvertToString(ocl_kernel_full_path.c_str(), tmpSource);
         strSource += tmpSource;
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index ea4a1658..1c1379b3 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -33,7 +33,7 @@ void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size)
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::ocl_setup() {
-  im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL);
+/*  im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL);
   col2im_gpu_kernel = clCreateKernel(amdDevice.Program,"col2im_gpu_float_kernel", NULL);
   oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL);
   im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL);
@@ -41,7 +41,7 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
   opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL);
   ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL);
   ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL);
-
+*/
   M_ = conv_out_channels_ / group_;
   K_ = kernel_dim_ / group_;
   N_ =  conv_out_spatial_dim_;
@@ -56,6 +56,7 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
 
 template <typename Dtype>
  BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer(){
+ /*
   OCL_CHECK( clReleaseKernel(im2col_gpu_kernel) );
   OCL_CHECK( clReleaseKernel(col2im_gpu_kernel) );
   OCL_CHECK( clReleaseKernel(oclmem_kernel) );
@@ -63,6 +64,7 @@ template <typename Dtype>
   OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) );
   OCL_CHECK( clReleaseKernel(im2col_opt_kernel) );
   OCL_CHECK( clReleaseKernel(col2im_opt_kernel) );
+*/
 }
 
 
@@ -495,7 +497,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
     col_offset = K_ * N_ * opt_num2;
     //step1: packed im2col, col_size = (K_ * group_ ) * N_
     //this should be opt_num2 images packing together.
-    im2col_gpu_opt(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
+    im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
                        width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
 
     //step 2: sgemm: Top (subTopMem) = weight * col_data
@@ -520,7 +522,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
        }
 #endif
     //step 3: tranform
-    transform_gpu(ocl_Kernel_transform, (Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2);
+    transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2);
     //step 4: add bias
     /*note: this sgemm has to use num_output_ instead of M, because M = M /group, in setup*/
 
@@ -578,13 +580,13 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
     col_offset = K_ * (N_ * opt_num2);
     //step1: packed im2col, col_size = (K_ * group_ ) * N_
     //this should be opt_num2 images packing together.
-    im2col_gpu_opt(im2col_opt_kernel, bottom_data, bottom[i]->offset(n), channels_, height_,
+    im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
                        width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
 
     //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize
     int height_top = M_ * group_, width_top = N_;
     //if (opt_num2 >1)
-    opttrans(opttrans_kernel, top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
+    opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
 
     //step 3: sgemm: Top (subTopMem) = weight * col_data
     for(g = 0; g < group_; ++g) {
@@ -624,7 +626,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
 #endif
 
     //step5: col2im
-       col2im_gpu_opt(col2im_opt_kernel, (Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
+       col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
                   stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2);
 #ifdef Track_layer
     LOG(WARNING) << "conv bp done";
diff --git a/src/caffe/ocl/OCL_kernel.cl b/src/caffe/ocl/OCL_kernel.cl
index bc5eabff..a014a5cf 100644
--- a/src/caffe/ocl/OCL_kernel.cl
+++ b/src/caffe/ocl/OCL_kernel.cl
@@ -981,8 +981,8 @@ __kernel void transpose(__global const T *src, __global T* dst, int width, int h
      if( gidx < width && gidyy < height * optnum )
          dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
 }
-template __attribute__((mangled_name(transposefloat))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); 
-template __attribute__((mangled_name(transposedouble))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
+template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); 
+template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
 
 template <class T>
 __kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){
@@ -995,5 +995,5 @@ __kernel void transform(__global const T *src, __global T* dst, int top_offset,
      for(i = 0 ; i < width; i++)
          dst[(index * height + offset)* width + i] = src[gidx * width + i];
 }
-template __attribute__((mangled_name(transformfloat))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); 
-template __attribute__((mangled_name(transformdouble))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); 
+template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); 
+template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); 
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index 577dd58f..728f8dd3 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -63,8 +63,8 @@ __kernel void im2col_opt(const int n, __global T* data_im, const int channels, c
     }
 }
 
-template __attribute__((mangled_name(im2col_optfloat))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); 
-template __attribute__((mangled_name(im2col_optdouble))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); 
+template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); 
+template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); 
 
 
 template <class T>
@@ -102,11 +102,11 @@ __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const in
   }
 }
 
-template __attribute__((mangled_name(im2col_gpu_float_kernel))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
+template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
            const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
            const int pad_h, const int pad_w, const int stride_h, const int stride_w,
            const int height_col, const int width_col, __global float* data_col, const int col_offset);
-template __attribute__((mangled_name(im2col_gpu_double_kernel)))  void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
+template __attribute__((mangled_name(im2col_gpu_kernel_double)))  void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
            const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
            const int pad_h, const int pad_w, const int stride_h, const int stride_w,
            const int height_col, const int width_col, __global double* data_col, const int col_offset);
@@ -146,12 +146,12 @@ __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const i
   }
 }
 
-template __attribute__((mangled_name(col2im_gpu_float_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
+template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
     									const int height, const int width, const int channels,
     									const int patch_h, const int patch_w,const int pad_h, const int pad_w,
     									const int stride_h, const int stride_w,const int height_col, const int width_col,
     									__global float* data_im, const int img_offset);
-template __attribute__((mangled_name(col2im_gpu_double_kernel))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
+template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
                                          const int col_offset, const int height, const int width, const int channels,
                                          const int patch_h, const int patch_w, const int pad_h, const int pad_w,
                                          const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
@@ -245,8 +245,8 @@ __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset
       data_im[index] = val;
   }
 }
-template __attribute__((mangled_name(col2im_optfloat))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); 
-template __attribute__((mangled_name(col2im_optdouble))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); 
+template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); 
+template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); 
 
 
 template <class T>
@@ -294,5 +294,5 @@ __kernel void opttrans(const int n, __global T* data_im, const int im_offset, co
       data_opt[opt_index] = data_im[index];
     }
 }
-template __attribute__((mangled_name(opttransfloat))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); 
-template __attribute__((mangled_name(opttransdouble))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); 
+template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); 
+template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); 
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 4d28ab1e..a5eb4176 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -8,6 +8,8 @@
 
 namespace caffe {
 
+template <typename dtype> extern std::string get_dtype_suffix();
+
 template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
@@ -83,9 +85,11 @@ template void col2im_cpu<double>(const double* data_col, const int channels,
 
 
 template <typename Dtype>
-void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
+void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, Dtype* data_im, const int img_offset, int optnum){
+    std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     int height_col = (height + 2 * pad - ksize) / stride + 1;
     int width_col = (width + 2 * pad - ksize) / stride + 1;
     int num_kernels = channels * height * width;
@@ -112,21 +116,24 @@ void col2im_gpu_opt(cl_kernel Kernel, const Dtype* data_col, const int col_offse
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
 
-template void col2im_gpu_opt<float>(cl_kernel kernel, const float* data_col, const int col_offset, const int channels,
+template void col2im_gpu_opt<float>(const float* data_col, const int col_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, float* data_im, const int img_offset, int optnum);
-template void col2im_gpu_opt<double>(cl_kernel kernel, const double* data_col, const int col_offset, const int channels,
+template void col2im_gpu_opt<double>(const double* data_col, const int col_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, double* data_im, const int img_offset, int optnum);
 
 //cannot use now, need to modify kernel.
 template <typename Dtype>
-void im2col_gpu(cl_kernel Kernel,  const Dtype* data_im, const int img_offset, const int channels, 
+void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, 
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
     Dtype* data_col, const int col_offset)
 {
+    std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
     int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
     int num_kernels = channels * height_col * width_col;
@@ -156,24 +163,27 @@ void im2col_gpu(cl_kernel Kernel,  const Dtype* data_im, const int img_offset, c
 
 }
 
-template void im2col_gpu<float>(cl_kernel Kernel,  const float* data_im, const int img_offset, const int channels,       
+template void im2col_gpu<float>(const float* data_im, const int img_offset, const int channels,       
     				const int height, const int width, const int kernel_h, const int kernel_w,
     				const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     				float* data_col, const int col_offset);
-template void im2col_gpu<double>(cl_kernel Kernel,  const double* data_im, const int img_offset, const int channels,       
+template void im2col_gpu<double>(const double* data_im, const int img_offset, const int channels,       
     				const int height, const int width, const int kernel_h, const int kernel_w,
     				const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     				double* data_col, const int col_offset);
 
 //cannot use now, need to modify kernel
 template <typename Dtype>
-void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
+void col2im_gpu(const Dtype* data_col, const int col_offset,
     const int height, const int width, const int channels,
     const int patch_h, const int patch_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
     Dtype* data_im, const int img_offset)
 {
+    std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
     int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
     int num_kernels = channels * height * width;
@@ -201,11 +211,11 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
 
-template void col2im_gpu<float>(cl_kernel Kernel, const float* data_col, const int col_offset,
+template void col2im_gpu<float>(const float* data_col, const int col_offset,
     				const int height, const int width, const int channels,
     				const int patch_h, const int patch_w, const int pad_h, const int pad_w,
     				const int stride_h, const int stride_w, float* data_im, const int img_offset);
-template void col2im_gpu<double>(cl_kernel Kernel, const double* data_col, const int col_offset,
+template void col2im_gpu<double>(const double* data_col, const int col_offset,
     				const int height, const int width, const int channels,
     				const int patch_h, const int patch_w,
     				const int pad_h, const int pad_w,const int stride_h, const int stride_w,
@@ -285,10 +295,13 @@ template void im2col_16_gpu<double>(cl_kernel Kernel, const double* data_im, con
     const int stride, double* data_col, const int col_offset);
 
 template <typename Dtype>
-void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
+void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, Dtype* data_col, const int col_offset, int optnum) {
 
+    std::string kernel_name = "im2col_opt" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    
     int height_col = (height + 2 * pad - ksize) / stride + 1;
     int width_col = (width + 2 * pad - ksize) / stride + 1;
     int num_kernels = optnum * channels * height_col * width_col;
@@ -315,17 +328,19 @@ void im2col_gpu_opt(cl_kernel Kernel, const Dtype* data_im, const int img_offset
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
 
-template void im2col_gpu_opt<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
+template void im2col_gpu_opt<float>(const float* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, float* data_col, const int col_offset, int optnum);
-template void im2col_gpu_opt<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
+template void im2col_gpu_opt<double>(const double* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, double* data_col, const int col_offset,  int optnum);
 
 template <typename Dtype>
-void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
+void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, Dtype* data_im, const int img_offset) {
+    std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     int height_col = (height + 2 * pad - ksize) / stride + 1;
     int width_col = (width + 2 * pad - ksize) / stride + 1;
@@ -356,10 +371,10 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, c
 }
 
 
-template void col2im_gpu<float>(cl_kernel Kernel, const float* data_col, const int col_offset, const int channels,
+template void col2im_gpu<float>(const float* data_col, const int col_offset, const int channels,
     const int height, const int width, const int psize, const int pad,
     const int stride, float* data_im, const int img_offset);
-template void col2im_gpu<double>(cl_kernel Kernel, const double* data_col, const int col_offset, const int channels,
+template void col2im_gpu<double>(const double* data_col, const int col_offset, const int channels,
     const int height, const int width, const int psize, const int pad,
     const int stride, double* data_im, const int img_offset);
 
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index a9563c14..ac1d9958 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -29,7 +29,7 @@ template <typename dtype> std::string get_dtype_suffix()
 template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold)
 {
-        std::string kernel_name = std::string("RNGBernoulli") + get_dtype_suffix<Dtype>();
+        std::string kernel_name = "RNGBernoulli" + get_dtype_suffix<Dtype>();
         cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
 
         static unsigned c = 0;
@@ -57,7 +57,10 @@ template void caffe_gpu_bernoulli<double>(int* a, const unsigned int n, double i
 
 
 template <typename Dtype>
-void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){
+void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){
+    std::string kernel_name = "transform" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
     cl_int ret;
     ret= clSetKernelArg(Kernel,0,sizeof(cl_mem),(void*)&src);
     OCL_CHECK(ret);
@@ -77,8 +80,8 @@ void transform_gpu(cl_kernel Kernel, Dtype* src, Dtype* dst, const int top_offse
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL) );
 }
 
-template void transform_gpu<float>(cl_kernel Kernel, float* src, float* dst, const int top_offset, const int N_, const int M_, const int packing_num);
-template void transform_gpu<double>(cl_kernel Kernel, double* src, double* dst, const int top_offset, const int N_, const int M_, const int packing_num);
+template void transform_gpu<float>(float* src, float* dst, const int top_offset, const int N_, const int M_, const int packing_num);
+template void transform_gpu<double>(double* src, double* dst, const int top_offset, const int N_, const int M_, const int packing_num);
 
 template <typename Dtype>
 void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data){
@@ -156,7 +159,7 @@ template <typename Dtype>
 void kernel_channel_max(const int num, const int channels,
     const int spatial_dim, const Dtype* data, Dtype* out)
 {
-    std::string kernel_name = std::string("kernel_channel_max") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "kernel_channel_max" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
@@ -180,7 +183,7 @@ void kernel_channel_subtract( const int count,
     const int num, const int channels,
     const int spatial_dim, const Dtype* channel_max, Dtype* data)
 {
-    std::string kernel_name = std::string("kernel_channel_subtract") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "kernel_channel_subtract" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
@@ -205,7 +208,7 @@ template void kernel_channel_subtract<double>( const int count,
 template <typename Dtype>
 void kernel_exp(const int count, const Dtype* data, Dtype* out)
 {
-    std::string kernel_name = std::string("kernel_exp") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "kernel_exp" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
@@ -224,7 +227,7 @@ template <typename Dtype>
 void kernel_channel_sum(const int num, const int channels,
     const int spatial_dim, const Dtype* data, Dtype* channel_sum)
 {
-    std::string kernel_name = std::string("kernel_channel_sum") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
@@ -245,7 +248,7 @@ template <typename Dtype>
 void kernel_channel_div(const int count, const int num, const int channels,
     const int spatial_dim, const Dtype* channel_sum, Dtype* data)
 {
-    std::string kernel_name = std::string("kernel_channel_div") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "kernel_channel_div" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
@@ -270,7 +273,7 @@ void kernel_channel_dot(const int num, const int channels,
     const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
     Dtype* channel_dot)
 {
-    std::string kernel_name = std::string("kernel_channel_dot") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
@@ -298,7 +301,7 @@ void SoftmaxLossForwardGPU(const int nthreads,
           const bool has_ignore_label_, const int ignore_label_,
           Dtype* counts)
 {
-    std::string kernel_name = std::string("SoftmaxLossForwardGPU") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int),  (void*)&nthreads));
@@ -328,7 +331,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
           const int spatial_dim, const bool has_ignore_label_,
           const int ignore_label_, Dtype* counts)
 {
-    std::string kernel_name = std::string("SoftmaxLossBackwardGPU") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "SoftmaxLossBackwardGPU" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int),  (void*)&nthreads));
@@ -409,7 +412,7 @@ template  void max_pool_fp_gpu<double>(cl_kernel Kernel, const int count, const
 
 template <typename Dtype>
 void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){
-     std::string kernel_name = std::string("MaxPoolForward") + get_dtype_suffix<Dtype>();
+     std::string kernel_name = "MaxPoolForward" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
    
      cl_int ret;
@@ -443,7 +446,7 @@ template void MaxPoolForward<double>(const int count, const double* bottom_data,
 template <typename Dtype>
 void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data)
 {
-    std::string kernel_name = std::string("StoPoolForwardTrain") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
@@ -471,7 +474,7 @@ template void StoPoolForwardTrain<double>(const int count, const double* bottom_
 
 template <typename Dtype>
 void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){
-    std::string kernel_name = std::string("StoPoolForwardTest") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     cl_int ret;
@@ -500,7 +503,7 @@ template void StoPoolForwardTest<double>(const int count, const double* bottom_d
 
 template <typename Dtype>
 void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){
-        std::string kernel_name = std::string("AvePoolForward") + get_dtype_suffix<Dtype>();
+        std::string kernel_name = "AvePoolForward" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
@@ -580,7 +583,7 @@ template void max_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const d
 
 template <typename Dtype>
 void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){
-        std::string kernel_name = std::string("MaxPoolBackward") + get_dtype_suffix<Dtype>();
+        std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
@@ -613,7 +616,7 @@ template void MaxPoolBackward<double>(const int nthreads, const double* const to
 template <typename Dtype>
 void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff)
 {
-    std::string kernel_name = std::string("AvePoolBackward") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "AvePoolBackward" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     cl_int ret;
@@ -643,7 +646,7 @@ template void AvePoolBackward<double>(const int nthreads, const double* const to
 
 template <typename Dtype>
 void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){
-        std::string kernel_name = std::string("StoPoolBackward") + get_dtype_suffix<Dtype>();
+        std::string kernel_name = "StoPoolBackward" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
@@ -696,7 +699,7 @@ template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const d
 
 template <typename Dtype> 
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
-    std::string kernel_name = std::string("ReLUForward") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
@@ -714,7 +717,7 @@ template void ReLUForward<double>(const int count, const double* bottom_data, do
 
 template <typename Dtype> 
 void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){
-    std::string kernel_name = std::string("ReLUBackward") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "ReLUBackward" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
   
     cl_int ret;
@@ -731,9 +734,12 @@ void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_da
 }
 template void ReLUBackward<float>(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
 template void ReLUBackward<double>(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
+
 template <typename Dtype>
-void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const int channels,
+void opttrans(const Dtype* data_im, const int im_offset, const int channels,
     const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {
+    std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     int num_kernels = channels * height * width * optnum;
   // To avoid involving atomic operations, we will launch one kernel per
@@ -757,9 +763,9 @@ void opttrans(cl_kernel Kernel, const Dtype* data_im, const int im_offset, const
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
 
-template void opttrans<float>(cl_kernel Kernel, const float* data_im, const int im_offset, const int channels,
+template void opttrans<float>(const float* data_im, const int im_offset, const int channels,
     const int height, const int width, float* data_opt, const int opt_offset, const int optnum);
-template void opttrans<double>(cl_kernel Kernel, const double* data_im, const int im_offset, const int channels,
+template void opttrans<double>(const double* data_im, const int im_offset, const int channels,
     const int height, const int width, double* data_opt, const int opt_offset, const int optnum);
 
 template <typename Dtype>
@@ -883,7 +889,7 @@ template void caffe_gpu_sign<double>(cl_kernel Kernel,const int N,  const double
 
 template <typename Dtype>
 void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){
-    std::string kernel_name = std::string("div") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
@@ -901,7 +907,7 @@ template void caffe_gpu_div<double> (const int n, const double* a, const double*
 
 template <typename Dtype>
 void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){
-     std::string kernel_name = std::string("add_scalar") + get_dtype_suffix<Dtype>();
+     std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
@@ -918,7 +924,7 @@ template void caffe_gpu_add_scalar<double> (const int n, const double alpha, dou
 
 template <typename Dtype>
 void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){
-        std::string kernel_name = std::string("element_mul") + get_dtype_suffix<Dtype>();
+        std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     cl_int ret;
@@ -937,7 +943,7 @@ template void caffe_gpu_mul<double> (const int n, const double* a, const double*
 
 template <typename Dtype>
 void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){
-       std::string kernel_name = std::string("powx") + get_dtype_suffix<Dtype>();
+       std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
@@ -956,7 +962,7 @@ template void caffe_gpu_powx<double> (const int n, const double* a, const double
 template <typename Dtype>
 void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
 {
-    std::string kernel_name = std::string("DropoutForward") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
     cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
     cl_int ret;
@@ -978,7 +984,7 @@ template void DropoutForward<double>(const int count, const double* bottom_data,
 template <typename Dtype>
 void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
 {
-    std::string kernel_name = std::string("DropoutBackward") + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
     cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
     cl_int ret;

From 77c1824894b3cf88b0a9769de09193306f8fff9e Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Mon, 10 Aug 2015 01:40:47 +0800
Subject: [PATCH 033/124] fixed some kernel name errors and re-organized the
 kernel files

---
 include/caffe/util/ocl_wrapper.hpp         |  10 +
 src/caffe/layers/prelu_layer.cpp           |  62 ++++-
 src/caffe/ocl/im2col.cl                    |  89 +++----
 src/caffe/ocl/prelu_layer.cl               |  32 +++
 src/caffe/ocl/{OCL_kernel.cl => random.cl} | 273 ---------------------
 src/caffe/ocl/softmax_layer.cl             |  97 ++++++++
 src/caffe/ocl/softmaxwithloss_layer.cl     |  14 +-
 src/caffe/ocl/util.cl                      | 136 ++++++++++
 src/caffe/util/im2col.cpp                  | 123 +---------
 src/caffe/util/ocl_wrapper.cpp             |  57 +++++
 10 files changed, 434 insertions(+), 459 deletions(-)
 create mode 100644 src/caffe/ocl/prelu_layer.cl
 rename src/caffe/ocl/{OCL_kernel.cl => random.cl} (59%)
 create mode 100644 src/caffe/ocl/util.cl

diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index dbe2eb49..7109bfd1 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -67,6 +67,16 @@ void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data
 template <typename Dtype>
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff);
 
+
+template <typename Dtype>
+void PReLUForward(const int count, const int channels, const int dim, const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, const int div_factor);
+
+template <typename Dtype> 
+void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor);
+
+template <typename Dtype> 
+void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff);
+
 template <typename Dtype>
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope);
 
diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp
index 4db0dc7c..ed51ac5e 100644
--- a/src/caffe/layers/prelu_layer.cpp
+++ b/src/caffe/layers/prelu_layer.cpp
@@ -132,14 +132,73 @@ void PReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  const int dim = bottom[0]->count(2);
+  const int channels = bottom[0]->channels();
+  const Dtype* slope_data = this->blobs_[0]->gpu_data();
+  const int div_factor = channel_shared_ ? channels : 1;
+  
+  if (top[0] == bottom[0]) {
+    caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
+  }
+  PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, div_factor);
 }
 
 template <typename Dtype>
 void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-}
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const int count = bottom[0]->count();
+  const int dim = bottom[0]->count(2);
+  const int channels = bottom[0]->channels();
 
+  if (top[0] == bottom[0]) {
+    bottom_data = bottom_memory_.gpu_data();
+  }
 
+  // Propagate to param
+  // Since to write bottom diff will affect top diff if top and bottom blobs
+  // are identical (in-place computaion), we first compute param backward to
+  // keep top_diff unchanged.
+  if (this->param_propagate_down_[0]) {
+    Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
+    int cdim = channels * dim;
+    Dtype dsum = 0.;
+    for (int n = 0; n < bottom[0]->num(); ++n) {
+      // compute element-wise diff
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      PReLUParamBackward(
+          cdim, top_diff + top[0]->offset(n),
+          bottom_data + bottom[0]->offset(n),
+          backward_buff_.mutable_gpu_diff());
+      if (channel_shared_) {
+        Dtype d;
+        caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
+            multiplier_.gpu_data(), &d);
+        dsum += d;
+      } else {
+        caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
+            backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
+            slope_diff);
+      }
+    }
+    if (channel_shared_) {
+      caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
+    }
+  }
+  // Propagate to bottom
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const Dtype* slope_data = this->blobs_[0]->gpu_data();
+    int div_factor = channel_shared_ ? channels : 1;
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data,
+        div_factor);
+  }
+}
 
 #ifdef CPU_ONLY
 STUB_GPU(PReLULayer);
@@ -147,5 +206,4 @@ STUB_GPU(PReLULayer);
 
 INSTANTIATE_CLASS(PReLULayer);
 REGISTER_LAYER_CLASS(PReLU);
-
 }  // namespace caffe
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index 728f8dd3..77367fa6 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -186,37 +186,6 @@ __kernel void col2im(const int n, __global T* data_col, const int col_offset, co
 template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); 
 template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); 
 
-template <class T>
-__kernel void im2col_yuan(const int n,__global T* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index;index<n;index+=tmp){
-        int w_out=index %width_col;
-        index /= width_col;
-        int h_out=index%height_col;
-        int channel_in = index/height_col;
-        int channel_out=channel_in *ksize *ksize;
-        int h_in = h_out *stride-pad;
-        int w_in = w_out *stride-pad;
-        data_col +=(channel_out *height_col + h_out) *width_col + w_out;
-        data_im +=(channel_in * height + h_in) *width + w_in;
-        int i=0,j=0;
-        for(i=0;i<ksize;++i){
-            for(j=0;j<ksize;++j){
-                int h = h_in+i;
-                int w = w_in+j;
-                if(h >= 0 && w >= 0 && h < height && w < width)
-                    *data_col=data_im[i * width + j];
-                else *data_col=0;
-                data_col += height_col *width_col;
-            }
-        }
-    }
-}
-
-template __attribute__((mangled_name(im2colfloat_yuan))) __kernel void im2col_yuan(const int n,__global float* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col); 
-template __attribute__((mangled_name(im2coldouble_yuan))) __kernel void im2col_yuan(const int n,__global double* data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col); 
-
 template <class T>
 __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){
     int index = get_global_id(0);
@@ -248,36 +217,6 @@ __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset
 template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); 
 template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); 
 
-
-template <class T>
-__kernel void col2im_yuan(const int n,__global T* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index; index < n; index += tmp){
-      T val = 0;
-      int w = index % width + pad;
-      int h = (index / width) % height + pad;
-      int c = index / (width * height);
-      // compute the start and end of the output
-      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-      int w_col_end = min(w / stride + 1, width_col);
-      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-      int h_col_end = min(h / stride + 1, height_col);
-      // equivalent implementation
-      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
-      int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
-      int coeff_w_col = (1 - stride * height_col * width_col);
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-        }
-      }
-      data_im[index] = val;
-  }
-}
-template __attribute__((mangled_name(col2imfloat_yuan))) __kernel void col2im_yuan(const int n,__global float* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im); 
-template __attribute__((mangled_name(col2imdouble_yuan))) __kernel void col2im_yuan(const int n,__global double* data_col, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im); 
-
 template <class T>
 __kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){
 
@@ -296,3 +235,31 @@ __kernel void opttrans(const int n, __global T* data_im, const int im_offset, co
 }
 template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); 
 template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); 
+
+template <class T>
+__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){
+     int gidx = get_global_id(0);
+     int gidy = get_global_id(1);
+     int gidyy = gidy;
+     int index = gidy / height;
+     int offset = index * width * height;
+     gidy = gidy % height;
+     if( gidx < width && gidyy < height * optnum )
+         dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
+}
+template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); 
+template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
+
+template <class T>
+__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){
+     int gidx = get_global_id(0);
+     int index;
+     index = (optnum==1) ? 0: gidx % optnum;
+     dst = dst + top_offset; // now we point at (*top)[n]
+     int offset = gidx / optnum;
+     int i = 0;
+     for(i = 0 ; i < width; i++)
+         dst[(index * height + offset)* width + i] = src[gidx * width + i];
+}
+template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); 
+template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); 
diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl
new file mode 100644
index 00000000..83724d1a
--- /dev/null
+++ b/src/caffe/ocl/prelu_layer.cl
@@ -0,0 +1,32 @@
+template <class T>
+__kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) {
+  int index = get_global_id(0);
+  if(index < count){
+    int c = (index / dim) % channels / div_factor;
+    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
+  }
+}
+template __attribute__ ((mangled_name(PReLUForward_float))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global float* in, __global float* out, __global float* slope_data, const int div_factor);
+template __attribute__ ((mangled_name(PReLUForward_double))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global double* in, __global double* out, __global double* slope_data, const int div_factor);
+
+template <class T>
+__kernel void PReLUBackward(const int count, const int channels, const int dim, __global T* in_diff, __global T* in_data, __global T* out_diff, __global T* slope_data, const int div_factor) {
+  int index = get_global_id(0);
+  if(index < count){
+    int c = (index / dim) % channels / div_factor;
+    out_diff[index] = in_diff[index] * ((in_data[index] > 0)
+        + (in_data[index] <= 0) * slope_data[c]);
+  }
+}
+template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor);
+template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor);
+
+template <class T>
+__kernel void PReLUParamBackward(const int count, __global T* in_diff, __global T* in_data, __global T* out_diff) {
+  int index = get_global_id(0);
+  if(index < count){
+    out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
+  }
+}
+template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff);
+template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff);
diff --git a/src/caffe/ocl/OCL_kernel.cl b/src/caffe/ocl/random.cl
similarity index 59%
rename from src/caffe/ocl/OCL_kernel.cl
rename to src/caffe/ocl/random.cl
index a014a5cf..4980f8d2 100644
--- a/src/caffe/ocl/OCL_kernel.cl
+++ b/src/caffe/ocl/random.cl
@@ -724,276 +724,3 @@ template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_t
 
 //end of the looooooong gpu_random_generator kernel 
 
-
-template <class T>
-__kernel void OCL_memset(__global T* buffer, const T value, const int size){
-	int gdx = get_global_id(0);
-	if(gdx < size){
-		buffer[gdx] = value;	
-	}
-}
-
-template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
-template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
-
-__kernel void OCL_memset2(__global int* buffer, const int value, const int size){
-        int gdx = get_global_id(0);
-        if(gdx < size){
-                buffer[gdx] = value;    
-        }
-}
-
-template <class T>
-__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
-     int gdx = get_global_id(0);
-     if(gdx < N){
-          Y[gdx] =((0.0<X[gdx])-(X[gdx]<0.0));
-     }
-}
-
-template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
-template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y);
-
-
-template <class T>
-__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
-     int index = get_global_id(0);
-     if (index < num) {
-	T maxval = -FLT_MAX;
-        for (int i = 0; i <  dim; i++)
-	maxval = max( data[index*dim + i], maxval );
-        out[index] = maxval;
-      }
-}
-
-template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out);
-template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out);
-
-template <class T>
-__kernel void exp (const int num, __global T* data, __global T* out){
-        int index = get_global_id(0);
-        if (index < num) 
-        out[index] = exp(data[index]);
-}
-
-template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out);
-template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out);
-
-
-
-template <class T>
-__kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const T* data, __global T* out) {
-    int index = get_global_id(0);
-    if(index < num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    T maxval = -FLT_MAX;
-    for (int c = 0; c < channels; ++c) {
-      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
-    }
-    out[index] = maxval;
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const float* data, __global float* out);
-template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const double* data, __global  double* out);
-
-template <class T>
-__kernel void kernel_channel_subtract(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const T* channel_max, __global T* data) {
-    int index = get_global_id(0);
-    if(index < count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] -= channel_max[n * spatial_dim + s];
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data);
-template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data);
-
-template <class T>
-__kernel void kernel_exp(const int count, __global const T* data, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-    out[index] = exp(data[index]);
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out);
-template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out);
-
-template <class T>
-__kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const T* data, __global T* channel_sum) {
-  int index = get_global_id(0);
-   if(index < num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    T sum = 0;
-    for (int c = 0; c < channels; ++c) {
-      sum += data[(n * channels + c) * spatial_dim + s];
-    }
-    channel_sum[index] = sum;
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const float* data, __global float* channel_sum);
-template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const double* data, __global double* channel_sum);
-
-template <class T>
-__kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const T* channel_sum, __global T* data) {
-    int index = get_global_id(0);
-   if(index < count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] /= channel_sum[n * spatial_dim + s];
-  }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const float* channel_sum, __global float* data);
-template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const double* channel_sum, __global double* data);
-
-template <class T>
-__kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const T* data_1, __global const T* data_2,
-    __global T* channel_dot) {
-    int index = get_global_id(0);
-    if(index < num * spatial_dim) {
-        int n = index / spatial_dim;
-        int s = index % spatial_dim;
-        T dot = 0;
-        for (int c = 0; c < channels; ++c) {
-            dot += (data_1[(n * channels + c) * spatial_dim + s]
-                 * data_2[(n * channels + c) * spatial_dim + s]);
-        }
-        channel_dot[index] = dot;
-    }
-}
-
-template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const float* data_1, __global const float* data_2,
-    __global float* channel_dot);
-template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const double* data_1, __global const double* data_2,
-    __global double* channel_dot);
-
-
-
-template <class T>
-__kernel void diff (const int num, const int dim, __global T* data, __global T* label){
-        int index = get_global_id(0);
-        int total = get_global_size(0);
-        int offset;
-	for(index; index < num; index +=  total){
-  	offset = (int) label[index];
-        data[index * dim + offset] -= 1;
-        }
-}
-
-template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label);
-template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label);
-
-template <class T>
-__kernel void scal (const int num, const T alpha, __global T* data){
-        int index = get_global_id(0);
-        int total = get_global_size(0);
-        for(index; index < num; index +=  total){
-        data[index] = data[index] * alpha;
-        }
-}
-
-template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha,  __global float* data);
-template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha,  __global double* data);
-
-template <class T>
-__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){
-	int index = get_global_id(0);
-        if (index < n)
-        y[index] = a[index] / b[index];
-}
-
-template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y);
-//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y);
-
-template <class T>
-__kernel void add_scalar (const int n, const T alpha, __global T* y){
-        int index = get_global_id(0);
-        if (index < n)
-        y[index] += alpha;
-}
-
-template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y);
-template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y);
-
-template <typename Dtype>
-__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
-        int index = get_global_id(0);
-        if (index < n)
-        y[index] = in1[index] + in2[index] ;
-}
-template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y);
-template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y);
-
-template <class T>
-__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){
-        int index = get_global_id(0);
-       if (index < n)
-        y[index] = a[index] * b[index];
-}
-
-template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y);
-template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y);
-
-
-template <class T>
-__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){
-        int index = get_global_id(0);
-        if (index < n)
-//           y[index] = a[index] + alpha;
-           y[index] = pow(a[index], alpha);
-}
-
-template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); 
-template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); 
-
-
-template <class T>
-__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){
-     int gidx = get_global_id(0);
-     int gidy = get_global_id(1);
-     int gidyy = gidy;
-     int index = gidy / height;
-     int offset = index * width * height;
-     gidy = gidy % height;
-     if( gidx < width && gidyy < height * optnum )
-         dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
-}
-template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); 
-template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
-
-template <class T>
-__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){
-     int gidx = get_global_id(0);
-     int index;
-     index = (optnum==1) ? 0: gidx % optnum;
-     dst = dst + top_offset; // now we point at (*top)[n]
-     int offset = gidx / optnum;
-     int i = 0;
-     for(i = 0 ; i < width; i++)
-         dst[(index * height + offset)* width + i] = src[gidx * width + i];
-}
-template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); 
-template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); 
diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl
index 711e4334..ef1255a4 100644
--- a/src/caffe/ocl/softmax_layer.cl
+++ b/src/caffe/ocl/softmax_layer.cl
@@ -46,3 +46,100 @@ __kernel void softmax_div (const int num, const int dim, __global T* scale, __gl
 
 template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data);
 template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* scale, __global double* data);
+
+template <class T>
+__kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const T* data, __global T* out) {
+    int index = get_global_id(0);
+    if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T maxval = -FLT_MAX;
+    for (int c = 0; c < channels; ++c) {
+      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
+    }
+    out[index] = maxval;
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const float* data, __global float* out);
+template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const double* data, __global  double* out);
+
+template <class T>
+__kernel void kernel_channel_subtract(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_max, __global T* data) {
+    int index = get_global_id(0);
+    if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] -= channel_max[n * spatial_dim + s]; 
+  }
+}
+template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data);
+template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data);
+
+template <class T>
+__kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const T* data, __global T* channel_sum) {
+  int index = get_global_id(0);
+   if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T sum = 0;
+    for (int c = 0; c < channels; ++c) {
+      sum += data[(n * channels + c) * spatial_dim + s];
+    }
+    channel_sum[index] = sum;
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const float* data, __global float* channel_sum);
+template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const double* data, __global double* channel_sum);
+
+template <class T>
+__kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_sum, __global T* data) {
+    int index = get_global_id(0);
+   if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] /= channel_sum[n * spatial_dim + s];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const float* channel_sum, __global float* data);
+template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const double* channel_sum, __global double* data);
+                                                                                         
+template <class T>
+__kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const T* data_1, __global const T* data_2,
+    __global T* channel_dot) {
+    int index = get_global_id(0);
+    if(index < num * spatial_dim) {
+        int n = index / spatial_dim;
+        int s = index % spatial_dim;
+        T dot = 0;
+        for (int c = 0; c < channels; ++c) {
+            dot += (data_1[(n * channels + c) * spatial_dim + s]
+                 * data_2[(n * channels + c) * spatial_dim + s]);
+        }   
+        channel_dot[index] = dot;
+    }   
+}
+
+template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const float* data_1, __global const float* data_2,
+    __global float* channel_dot);
+template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, __global const double* data_1, __global const double* data_2,
+    __global double* channel_dot);
diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl
index 97eb6874..cec6346b 100644
--- a/src/caffe/ocl/softmaxwithloss_layer.cl
+++ b/src/caffe/ocl/softmaxwithloss_layer.cl
@@ -59,7 +59,19 @@ template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel v
           int spatial_dim, bool has_ignore_label_,
           int ignore_label_, float* counts);
 
-template __attribute__ ((mangled_name(SoftmaxLossBackward_double)))  __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
+template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double)))  __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
           __global double* label,__global double* bottom_diff, int num, int dim,
           int spatial_dim, bool has_ignore_label_,
           int ignore_label_, double* counts);
+
+template <class T>
+__kernel void scal (const int num, const T alpha, __global T* data){
+        int index = get_global_id(0);
+        int total = get_global_size(0);
+        for(index; index < num; index +=  total){
+        data[index] = data[index] * alpha;
+        }
+}
+
+template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha,  __global float* data);
+template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha,  __global double* data);
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
new file mode 100644
index 00000000..eb01fae9
--- /dev/null
+++ b/src/caffe/ocl/util.cl
@@ -0,0 +1,136 @@
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+template <class T>
+__kernel void OCL_memset(__global T* buffer, const T value, const int size){
+	int gdx = get_global_id(0);
+	if(gdx < size){
+		buffer[gdx] = value;	
+	}
+}
+
+template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
+template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
+
+__kernel void OCL_memset2(__global int* buffer, const int value, const int size){
+        int gdx = get_global_id(0);
+        if(gdx < size){
+                buffer[gdx] = value;    
+        }
+}
+
+template <class T>
+__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
+     int gdx = get_global_id(0);
+     if(gdx < N){
+          Y[gdx] =((0.0<X[gdx])-(X[gdx]<0.0));
+     }
+}
+
+template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
+template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y);
+
+
+template <class T>
+__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
+     int index = get_global_id(0);
+     if (index < num) {
+	T maxval = -FLT_MAX;
+        for (int i = 0; i <  dim; i++)
+	maxval = max( data[index*dim + i], maxval );
+        out[index] = maxval;
+      }
+}
+
+template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out);
+template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out);
+
+template <class T>
+__kernel void exp (const int num, __global T* data, __global T* out){
+        int index = get_global_id(0);
+        if (index < num) 
+        out[index] = exp(data[index]);
+}
+
+template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out);
+template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out);
+
+
+template <class T>
+__kernel void kernel_exp(const int count, __global const T* data, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+    out[index] = exp(data[index]);
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out);
+template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out);
+
+
+template <class T>
+__kernel void diff (const int num, const int dim, __global T* data, __global T* label){
+        int index = get_global_id(0);
+        int total = get_global_size(0);
+        int offset;
+	for(index; index < num; index +=  total){
+  	offset = (int) label[index];
+        data[index * dim + offset] -= 1;
+        }
+}
+
+template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label);
+template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label);
+
+
+template <class T>
+__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){
+	int index = get_global_id(0);
+        if (index < n)
+        y[index] = a[index] / b[index];
+}
+
+template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y);
+//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y);
+
+template <class T>
+__kernel void add_scalar (const int n, const T alpha, __global T* y){
+        int index = get_global_id(0);
+        if (index < n)
+        y[index] += alpha;
+}
+
+template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y);
+template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y);
+
+template <typename Dtype>
+__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
+        int index = get_global_id(0);
+        if (index < n)
+        y[index] = in1[index] + in2[index] ;
+}
+template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y);
+template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y);
+
+template <class T>
+__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){
+        int index = get_global_id(0);
+       if (index < n)
+        y[index] = a[index] * b[index];
+}
+
+template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y);
+template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y);
+
+
+template <class T>
+__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){
+        int index = get_global_id(0);
+        if (index < n)
+//           y[index] = a[index] + alpha;
+           y[index] = pow(a[index], alpha);
+}
+
+template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); 
+template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); 
+
+
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index a5eb4176..e75c0d9a 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -257,43 +257,6 @@ template void im2col_gpu<double>(cl_kernel Kernel, const double* data_im, const
     const int height, const int width, const int ksize, const int pad,
     const int stride, double* data_col, const int col_offset);
 
-template <typename Dtype>
-void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, const int col_offset) {
-
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int num_kernels = 16 * channels * height_col * width_col;
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&img_offset);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride);
-    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col);
-    ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_col);
-    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&col_offset);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {num_kernels};
-    size_t uiLocal_Work_Size[] = {256 - 256 % width_col};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
-}
-
-template void im2col_16_gpu<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col, const int col_offset);
-template void im2col_16_gpu<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col, const int col_offset);
-
 template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
@@ -339,7 +302,7 @@ template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
     const int height, const int width, const int ksize, const int pad,
     const int stride, Dtype* data_im, const int img_offset) {
-    std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
+    std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     int height_col = (height + 2 * pad - ksize) / stride + 1;
@@ -378,89 +341,5 @@ template void col2im_gpu<double>(const double* data_col, const int col_offset, c
     const int height, const int width, const int psize, const int pad,
     const int stride, double* data_im, const int img_offset);
 
-template <typename Dtype>
-void im2col_gpu_ocl(cl_mem data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, cl_kernel Kernel) {
-
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int num_kernels = channels * height_col * width_col;
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&ksize);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&pad);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&stride);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&width_col);
-    OCL_CHECK( clSetKernelArg(Kernel,9,sizeof(cl_mem),(void*)&data_col) );
-
-    if(ret!=CL_SUCCESS){
-        fprintf(stderr,"Failed to Set Args\n");
-    }
-
-    size_t uiGlobal_Work_Size[] = {num_kernels};
-    size_t uiLocal_Work_Size[] = {64};
-    cl_int iStatus = clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL);
-    if(CL_SUCCESS!=iStatus){
-        fprintf(stderr,"Failed to enqueue kernel\n");
-    }
-}
-
-template void im2col_gpu_ocl<float>(cl_mem data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col, cl_kernel Kernel);
-template void im2col_gpu_ocl<double>(cl_mem data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col, cl_kernel Kernel);
-
-template <typename Dtype>
-void col2im_gpu_ocl(cl_mem data_col, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im, cl_kernel Kernel) {
-
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int num_kernels = channels * height * width;
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  // NOLINT_NEXT_LINE(whitespace/operatiors)
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_mem),(void*)&data_col);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&ksize);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&pad);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&stride);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&width_col);
-    ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_im);
-
-    if(ret!=CL_SUCCESS){
-        fprintf(stderr,"Failed to Set Args\n");
-    }
-
-    size_t uiGlobal_Work_Size[] = {num_kernels};
-    size_t uiLocal_Work_Size[] = {64};
-    cl_int iStatus = clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL);
-    if(CL_SUCCESS!=iStatus){
-        fprintf(stderr,"Failed to enqueue kernel\n");
-    }
-}
-
-
-template void col2im_gpu_ocl<float>(cl_mem data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, float* data_im, cl_kernel Kernel);
-template void col2im_gpu_ocl<double>(cl_mem data_col, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, double* data_im, cl_kernel Kernel);
 
 }  // namespace caffe
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index ac1d9958..1bdd4320 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -697,6 +697,63 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, c
 template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff);
 template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff);
 
+
+template <typename Dtype> 
+void PReLUForward(const int count, const int channels, const int dim, const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, const int div_factor){
+    std::string kernel_name = "PReLUForward" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&dim);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&top_data);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&slope_data);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&div_factor);
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void PReLUForward<float>(const int count, const int channels, const int dim,const float* bottom_data, float* top_data, const float* slope_data, const int div_factor);
+template void PReLUForward<double>(const int count, const int channels, const int dim,const double* bottom_data, double* top_data, const double* slope_data, const int div_factor);
+
+template <typename Dtype> 
+void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor){
+    std::string kernel_name = "PReLUBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&dim);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff);
+    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*)&slope_data);
+    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&div_factor);
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void PReLUBackward<float>(const int count, const int channels, const int dim, const float* top_diff, const float* bottom_data, float* bottom_diff, const float* slope_data, const int div_factor);
+template void PReLUBackward<double>(const int count, const int channels, const int dim, const double* top_diff, const double* bottom_data, double* bottom_diff, const double* slope_data, const int div_factor);
+
+template <typename Dtype> 
+void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff){
+    std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
+    size_t Global_Work_Size[] = {count * 1};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void PReLUParamBackward<float>(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff);
+template void PReLUParamBackward<double>(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff);
+
+
 template <typename Dtype> 
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
     std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>();

From cdd4d9debaf6377bcf9dc421f8afab6d032a164b Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Mon, 10 Aug 2015 15:38:03 +0800
Subject: [PATCH 034/124] add AMD's license

---
 include/caffe/device.hpp               | 26 ++++++++++++++++++++++++++
 include/caffe/syncedmem.hpp            | 26 ++++++++++++++++++++++++++
 include/caffe/util/im2col.hpp          | 26 ++++++++++++++++++++++++++
 include/caffe/util/math_functions.hpp  | 26 +++++++++++++++++++++++++-
 include/caffe/util/ocl_util.hpp        | 26 +++++++++++++++++++++++++-
 include/caffe/util/ocl_wrapper.hpp     | 26 +++++++++++++++++++++++++-
 src/caffe/device.cpp                   | 26 ++++++++++++++++++++++++++
 src/caffe/layers/conv_layer.cpp        |  1 +
 src/caffe/ocl/dropout_layer.cl         | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/im2col.cl                | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/lrn_layer.cl             | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/pooling_layer.cl         | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/prelu_layer.cl           | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/random.cl                | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/relu_layer.cl            | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/softmax_layer.cl         | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/softmaxwithloss_layer.cl | 26 ++++++++++++++++++++++++++
 src/caffe/ocl/util.cl                  | 26 ++++++++++++++++++++++++++
 src/caffe/syncedmem.cpp                | 26 ++++++++++++++++++++++++++
 src/caffe/util/im2col.cpp              | 26 ++++++++++++++++++++++++++
 src/caffe/util/math_functions.cpp      | 26 +++++++++++++++++++++++++-
 src/caffe/util/ocl_util.cpp            | 26 +++++++++++++++++++++++++-
 src/caffe/util/ocl_wrapper.cpp         | 26 +++++++++++++++++++++++++-
 23 files changed, 567 insertions(+), 6 deletions(-)

diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index cea343e8..6561ec48 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #ifndef CAFFE_DEVICE_HPP
 #define CAFFE_DEVICE_HPP
 #include <CL/cl.h>
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 2cb316fb..0fe6546d 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #ifndef CAFFE_SYNCEDMEM_HPP_
 #define CAFFE_SYNCEDMEM_HPP_
 
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index aec9e330..ba9c4aca 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #ifndef _CAFFE_UTIL_IM2COL_HPP_
 #define _CAFFE_UTIL_IM2COL_HPP_
 
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index a5ca6470..1dae00e0 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -1,4 +1,28 @@
-// Copyright 2014 BVLC and contributors.
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
 
 #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_
 #define CAFFE_UTIL_MATH_FUNCTIONS_H_
diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
index 25747702..2e56101e 100644
--- a/include/caffe/util/ocl_util.hpp
+++ b/include/caffe/util/ocl_util.hpp
@@ -1,4 +1,28 @@
-// Copyright 2014 AMD DNN contributors.
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
 
 #ifndef _CAFFE_UTIL_OCL_UTIL_HPP_
 #define _CAFFE_UTIL_OCL_UTIL_HPP_
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 7109bfd1..7351f8bc 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -1,4 +1,28 @@
-// Copyright 2014 AMD DNN contributors.
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
 
 #ifndef _CAFFE_UTIL_OCL_WRAPPER_HPP_
 #define _CAFFE_UTIL_OCL_WRAPPER_HPP_
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 23c3789b..3ce6cefe 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #include "caffe/common.hpp"
 #include "caffe/device.hpp"
 #include <stdio.h>
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 48b7afe9..855c00e1 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -80,6 +80,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    Forward_gpu_opt(bottom, top);
   else
    Forward_gpu_org(bottom, top);
+ CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl
index 8d3db447..4bfa39bc 100644
--- a/src/caffe/ocl/dropout_layer.cl
+++ b/src/caffe/ocl/dropout_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){
     int index = get_global_id(0);
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index 77367fa6..3e535d5f 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){
     int index=get_global_id(0);
diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl
index 901b5b13..ae1c9269 100644
--- a/src/caffe/ocl/lrn_layer.cl
+++ b/src/caffe/ocl/lrn_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
   int index = get_global_id(0);
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
index 80289b68..d94efcba 100644
--- a/src/caffe/ocl/pooling_layer.cl
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){
      int index = get_global_id(0);
diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl
index 83724d1a..be85a2e4 100644
--- a/src/caffe/ocl/prelu_layer.cl
+++ b/src/caffe/ocl/prelu_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) {
   int index = get_global_id(0);
diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl
index 4980f8d2..f5a7a4db 100644
--- a/src/caffe/ocl/random.cl
+++ b/src/caffe/ocl/random.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 
 //beginning of the looooooong gpu_random_generator kernel 
diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl
index df26d66e..d3b36a34 100644
--- a/src/caffe/ocl/relu_layer.cl
+++ b/src/caffe/ocl/relu_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
 	int index = get_global_id(0);
diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl
index ef1255a4..6b225283 100644
--- a/src/caffe/ocl/softmax_layer.cl
+++ b/src/caffe/ocl/softmax_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){
     
diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl
index cec6346b..9dbe284f 100644
--- a/src/caffe/ocl/softmaxwithloss_layer.cl
+++ b/src/caffe/ocl/softmaxwithloss_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void SoftmaxLossForwardGPU(const int nthreads,
           __global T* prob_data, __global T* label,__global T* loss,
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index eb01fae9..55026603 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 
 template <class T>
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index ac1187b9..123b0053 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #include <cstring>
 
 #include "caffe/common.hpp"
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index e75c0d9a..29c6c1f9 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 54e0abdc..677afcdf 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -1,4 +1,28 @@
-// Copyright 2014 BVLC and contributors.
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
 
 #include <boost/math/special_functions/next.hpp>
 #include <boost/random.hpp>
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 044f9e69..01c04711 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -1,4 +1,28 @@
-// Copyright 2014 AMD DNN contributors.
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
 
 #include <cmath>
 #include <cstdlib>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 1bdd4320..a9abda2e 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -1,4 +1,28 @@
-// Copyright 2014 AMD DNN contributors.
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
 
 #include <cmath>
 #include <cstdlib>

From ed958d8e77c32e71af607daed2e02a25aa61684e Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 25 Aug 2015 13:53:08 +0800
Subject: [PATCH 035/124] This is a test layer

---
 include/caffe/common.hpp                      |    7 +-
 include/caffe/device.hpp                      |    6 +-
 include/caffe/util/math_functions.hpp         |   15 +
 include/caffe/util/ocl_wrapper.hpp            |   14 +
 .../solver_without_dropout.prototxt           |   14 +
 .../train_val_without_dropout.prototxt        |  366 +++
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/caffe.dir/DependInfo.cmake     |  108 +
 src/caffe/CMakeFiles/caffe.dir/build.make     | 2542 +++++++++++++++++
 .../CMakeFiles/caffe.dir/cmake_clean.cmake    |  126 +
 src/caffe/CMakeFiles/caffe.dir/depend.make    |    2 +
 src/caffe/CMakeFiles/caffe.dir/flags.make     |    8 +
 src/caffe/CMakeFiles/caffe.dir/link.txt       |    1 +
 src/caffe/CMakeFiles/caffe.dir/progress.make  |  118 +
 ..._compile_generated_absval_layer.cu.o.cmake |  296 ++
 ...compile_generated_absval_layer.cu.o.depend |    1 +
 ...mpile_generated_base_data_layer.cu.o.cmake |  296 ++
 ...pile_generated_base_data_layer.cu.o.depend |    1 +
 ...da_compile_generated_bnll_layer.cu.o.cmake |  296 ++
 ...a_compile_generated_bnll_layer.cu.o.depend |    1 +
 ..._compile_generated_concat_layer.cu.o.cmake |  296 ++
 ...compile_generated_concat_layer.cu.o.depend |    1 +
 ...enerated_contrastive_loss_layer.cu.o.cmake |  296 ++
 ...nerated_contrastive_loss_layer.cu.o.depend |    1 +
 ...da_compile_generated_conv_layer.cu.o.cmake |  296 ++
 ...a_compile_generated_conv_layer.cu.o.depend |    1 +
 ...pile_generated_cudnn_conv_layer.cu.o.cmake |  296 ++
 ...ile_generated_cudnn_conv_layer.cu.o.depend |    1 +
 ...e_generated_cudnn_pooling_layer.cu.o.cmake |  296 ++
 ..._generated_cudnn_pooling_layer.cu.o.depend |    1 +
 ...pile_generated_cudnn_relu_layer.cu.o.cmake |  296 ++
 ...ile_generated_cudnn_relu_layer.cu.o.depend |    1 +
 ...e_generated_cudnn_sigmoid_layer.cu.o.cmake |  296 ++
 ..._generated_cudnn_sigmoid_layer.cu.o.depend |    1 +
 ...e_generated_cudnn_softmax_layer.cu.o.cmake |  296 ++
 ..._generated_cudnn_softmax_layer.cu.o.depend |    1 +
 ...pile_generated_cudnn_tanh_layer.cu.o.cmake |  296 ++
 ...ile_generated_cudnn_tanh_layer.cu.o.depend |    1 +
 ..._compile_generated_deconv_layer.cu.o.cmake |  296 ++
 ...compile_generated_deconv_layer.cu.o.depend |    1 +
 ...compile_generated_dropout_layer.cu.o.cmake |  296 ++
 ...ompile_generated_dropout_layer.cu.o.depend |    1 +
 ...compile_generated_eltwise_layer.cu.o.cmake |  296 ++
 ...ompile_generated_eltwise_layer.cu.o.depend |    1 +
 ..._generated_euclidean_loss_layer.cu.o.cmake |  296 ++
 ...generated_euclidean_loss_layer.cu.o.depend |    1 +
 ...uda_compile_generated_exp_layer.cu.o.cmake |  296 ++
 ...da_compile_generated_exp_layer.cu.o.depend |    1 +
 ..._compile_generated_filter_layer.cu.o.cmake |  296 ++
 ...compile_generated_filter_layer.cu.o.depend |    1 +
 ...mpile_generated_hdf5_data_layer.cu.o.cmake |  296 ++
 ...pile_generated_hdf5_data_layer.cu.o.depend |    1 +
 ...ile_generated_hdf5_output_layer.cu.o.cmake |  296 ++
 ...le_generated_hdf5_output_layer.cu.o.depend |    1 +
 ..._compile_generated_im2col_layer.cu.o.cmake |  296 ++
 ...compile_generated_im2col_layer.cu.o.depend |    1 +
 ...e_generated_inner_product_layer.cu.o.cmake |  296 ++
 ..._generated_inner_product_layer.cu.o.depend |    1 +
 ...uda_compile_generated_log_layer.cu.o.cmake |  296 ++
 ...da_compile_generated_log_layer.cu.o.depend |    1 +
 ...uda_compile_generated_lrn_layer.cu.o.cmake |  296 ++
 ...da_compile_generated_lrn_layer.cu.o.depend |    1 +
 ...uda_compile_generated_mvn_layer.cu.o.cmake |  296 ++
 ...da_compile_generated_mvn_layer.cu.o.depend |    1 +
 ...compile_generated_pooling_layer.cu.o.cmake |  296 ++
 ...ompile_generated_pooling_layer.cu.o.depend |    1 +
 ...a_compile_generated_power_layer.cu.o.cmake |  296 ++
 ..._compile_generated_power_layer.cu.o.depend |    1 +
 ...a_compile_generated_prelu_layer.cu.o.cmake |  296 ++
 ..._compile_generated_prelu_layer.cu.o.depend |    1 +
 ...mpile_generated_reduction_layer.cu.o.cmake |  296 ++
 ...pile_generated_reduction_layer.cu.o.depend |    1 +
 ...da_compile_generated_relu_layer.cu.o.cmake |  296 ++
 ...a_compile_generated_relu_layer.cu.o.depend |    1 +
 ...igmoid_cross_entropy_loss_layer.cu.o.cmake |  296 ++
 ...gmoid_cross_entropy_loss_layer.cu.o.depend |  470 +++
 ...compile_generated_sigmoid_layer.cu.o.cmake |  296 ++
 ...ompile_generated_sigmoid_layer.cu.o.depend |  468 +++
 ...compile_generated_silence_layer.cu.o.cmake |  296 ++
 ...ompile_generated_silence_layer.cu.o.depend |    1 +
 ...a_compile_generated_slice_layer.cu.o.cmake |  296 ++
 ..._compile_generated_slice_layer.cu.o.depend |    1 +
 ...compile_generated_softmax_layer.cu.o.cmake |  296 ++
 ...ompile_generated_softmax_layer.cu.o.depend |    1 +
 ...le_generated_softmax_loss_layer.cu.o.cmake |  296 ++
 ...e_generated_softmax_loss_layer.cu.o.depend |    1 +
 ...a_compile_generated_split_layer.cu.o.cmake |  296 ++
 ..._compile_generated_split_layer.cu.o.depend |    1 +
 ...da_compile_generated_tanh_layer.cu.o.cmake |  296 ++
 ...a_compile_generated_tanh_layer.cu.o.depend |    1 +
 ...mpile_generated_threshold_layer.cu.o.cmake |  296 ++
 ...pile_generated_threshold_layer.cu.o.depend |    1 +
 .../cuda_compile_generated_im2col.cu.o.cmake  |  296 ++
 .../cuda_compile_generated_im2col.cu.o.depend |  404 +++
 ...ompile_generated_math_functions.cu.o.cmake |  296 ++
 ...mpile_generated_math_functions.cu.o.depend |  744 +++++
 src/caffe/CMakeFiles/progress.marks           |    1 +
 .../CMakeFiles/proto.dir/CXX.includecache     |   48 +
 .../CMakeFiles/proto.dir/DependInfo.cmake     |   39 +
 src/caffe/CMakeFiles/proto.dir/build.make     |  119 +
 .../CMakeFiles/proto.dir/cmake_clean.cmake    |   13 +
 .../proto.dir/cmake_clean_target.cmake        |    3 +
 .../CMakeFiles/proto.dir/depend.internal      |    6 +
 src/caffe/CMakeFiles/proto.dir/depend.make    |    6 +
 src/caffe/CMakeFiles/proto.dir/flags.make     |    8 +
 src/caffe/CMakeFiles/proto.dir/link.txt       |    2 +
 src/caffe/CMakeFiles/proto.dir/progress.make  |    3 +
 src/caffe/Makefile                            | 2279 +++++++++++++++
 src/caffe/cmake_install.cmake                 |   79 +
 src/caffe/common.cpp                          |    1 +
 src/caffe/device.cpp                          |   45 +-
 src/caffe/layers/conv_layer.cpp               |   20 +-
 src/caffe/layers/softmax_loss_layer.cpp       |    1 +
 src/caffe/net.cpp                             |   15 +-
 src/caffe/ocl/pooling_layer.cl                |    4 +-
 src/caffe/ocl/util.cl                         |    1 +
 src/caffe/solver.cpp                          |    4 +-
 .../CMakeDirectoryInformation.cmake           |   16 +
 ...le_generated_test_im2col_kernel.cu.o.cmake |  296 ++
 ...e_generated_test_im2col_kernel.cu.o.depend |    1 +
 src/caffe/test/CMakeFiles/progress.marks      |    1 +
 .../CMakeFiles/runtest.dir/DependInfo.cmake   |   27 +
 .../test/CMakeFiles/runtest.dir/build.make    |   69 +
 .../CMakeFiles/runtest.dir/cmake_clean.cmake  |    8 +
 .../test/CMakeFiles/runtest.dir/progress.make |    1 +
 .../test.testbin.dir/DependInfo.cmake         |   92 +
 .../CMakeFiles/test.testbin.dir/build.make    | 1623 +++++++++++
 .../test.testbin.dir/cmake_clean.cmake        |   68 +
 .../CMakeFiles/test.testbin.dir/depend.make   |    2 +
 .../CMakeFiles/test.testbin.dir/flags.make    |    8 +
 .../test/CMakeFiles/test.testbin.dir/link.txt |    1 +
 .../CMakeFiles/test.testbin.dir/progress.make |   60 +
 src/caffe/test/Makefile                       | 1766 ++++++++++++
 src/caffe/test/cmake_install.cmake            |   34 +
 src/caffe/test/test_caffe_main.cpp            |   12 +-
 src/caffe/util/benchmark.cpp                  |   47 +-
 src/caffe/util/math_functions.cpp             |   26 +
 src/caffe/util/ocl_util.cpp                   |    1 +
 src/caffe/util/ocl_wrapper.cpp                |   78 +-
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/gtest.dir/DependInfo.cmake     |   32 +
 src/gtest/CMakeFiles/gtest.dir/build.make     |  106 +
 .../CMakeFiles/gtest.dir/cmake_clean.cmake    |   10 +
 .../gtest.dir/cmake_clean_target.cmake        |    3 +
 src/gtest/CMakeFiles/gtest.dir/depend.make    |    2 +
 src/gtest/CMakeFiles/gtest.dir/flags.make     |    8 +
 src/gtest/CMakeFiles/gtest.dir/link.txt       |    2 +
 src/gtest/CMakeFiles/gtest.dir/progress.make  |    2 +
 src/gtest/CMakeFiles/progress.marks           |    1 +
 src/gtest/Makefile                            |  212 ++
 src/gtest/cmake_install.cmake                 |   34 +
 151 files changed, 24881 insertions(+), 83 deletions(-)
 create mode 100644 models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt
 create mode 100644 models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt
 create mode 100644 src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake
 create mode 100644 src/caffe/CMakeFiles/caffe.dir/build.make
 create mode 100644 src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake
 create mode 100644 src/caffe/CMakeFiles/caffe.dir/depend.make
 create mode 100644 src/caffe/CMakeFiles/caffe.dir/flags.make
 create mode 100644 src/caffe/CMakeFiles/caffe.dir/link.txt
 create mode 100644 src/caffe/CMakeFiles/caffe.dir/progress.make
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
 create mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend
 create mode 100644 src/caffe/CMakeFiles/progress.marks
 create mode 100644 src/caffe/CMakeFiles/proto.dir/CXX.includecache
 create mode 100644 src/caffe/CMakeFiles/proto.dir/DependInfo.cmake
 create mode 100644 src/caffe/CMakeFiles/proto.dir/build.make
 create mode 100644 src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake
 create mode 100644 src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake
 create mode 100644 src/caffe/CMakeFiles/proto.dir/depend.internal
 create mode 100644 src/caffe/CMakeFiles/proto.dir/depend.make
 create mode 100644 src/caffe/CMakeFiles/proto.dir/flags.make
 create mode 100644 src/caffe/CMakeFiles/proto.dir/link.txt
 create mode 100644 src/caffe/CMakeFiles/proto.dir/progress.make
 create mode 100644 src/caffe/Makefile
 create mode 100644 src/caffe/cmake_install.cmake
 create mode 100644 src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake
 create mode 100644 src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend
 create mode 100644 src/caffe/test/CMakeFiles/progress.marks
 create mode 100644 src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake
 create mode 100644 src/caffe/test/CMakeFiles/runtest.dir/build.make
 create mode 100644 src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake
 create mode 100644 src/caffe/test/CMakeFiles/runtest.dir/progress.make
 create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake
 create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/build.make
 create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake
 create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/depend.make
 create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
 create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/link.txt
 create mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/progress.make
 create mode 100644 src/caffe/test/Makefile
 create mode 100644 src/caffe/test/cmake_install.cmake
 create mode 100644 src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake
 create mode 100644 src/gtest/CMakeFiles/gtest.dir/build.make
 create mode 100644 src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake
 create mode 100644 src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake
 create mode 100644 src/gtest/CMakeFiles/gtest.dir/depend.make
 create mode 100644 src/gtest/CMakeFiles/gtest.dir/flags.make
 create mode 100644 src/gtest/CMakeFiles/gtest.dir/link.txt
 create mode 100644 src/gtest/CMakeFiles/gtest.dir/progress.make
 create mode 100644 src/gtest/CMakeFiles/progress.marks
 create mode 100644 src/gtest/Makefile
 create mode 100644 src/gtest/cmake_install.cmake

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 070513b5..4cd372a6 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -84,7 +84,7 @@ private:\
 #define use_packing_scheme 1
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
-#define global_packing_N 16
+#define global_packing_N 32
 /*ifdef: use multi-command queues for groups in conv layer;
  ifndef: use single commane queue for groups*/
 //#define multiQ
@@ -231,7 +231,10 @@ class Caffe {
   // into the program since that may cause allocation of pinned memory being
   // freed in a non-pinned way, which may cause problems - I haven't verified
   // it personally but better to note it here in the header file.
-  inline static void set_mode(Brew mode) { Get().mode_ = mode; }
+  inline static void set_mode(Brew mode) { 
+    Get().mode_ = mode;
+    amdDevice.Init();
+  }
   // Sets the random seed of both boost and curand
   static void set_random_seed(const unsigned int seed);
   // Sets the device. Since we have cublas and curand stuff, set device also
diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 6561ec48..31adcb5f 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -34,7 +34,7 @@ namespace caffe {
 
 class Device{
 public:
-    Device():numPlatforms(0),numDevices(0){}
+    Device():numPlatforms(0),numDevices(0){ }
     ~Device();
     cl_uint numPlatforms;
     cl_platform_id * platformIDs;
@@ -57,7 +57,7 @@ class Device{
     void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
 
     void GetDeviceInfo();
-    
+    void DeviceQuery();    
     void BuildProgram(std::string kernel_dir);    
 
     template <typename T>
@@ -66,7 +66,7 @@ class Device{
     void appendBitfield(T info, T value, std::string name, std::string &str);
    
     cl_kernel GetKernel(std::string kernel_name);    
-
+    void ReleaseKernels();
 };
 extern char* buildOption;
 extern Device amdDevice;
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 1dae00e0..381dd8fd 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -34,6 +34,7 @@
 #include "glog/logging.h"
 
 #include "caffe/util/mkl_alternate.hpp"
+#include "caffe/util/ocl_util.hpp"
 
 namespace caffe {
 
@@ -115,6 +116,20 @@ void caffe_set(const int N, const Dtype alpha, Dtype *X);
 template <typename Dtype>
 void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X);
 
+inline void caffe_memset(const size_t N, const int alpha, void* X) {
+  memset(X, alpha, N);  // NOLINT(caffe/alt_fn)
+}
+
+inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
+#ifndef CPU_ONLY
+  ocl_memset((int*)X, alpha, N);
+#else
+  NO_GPU;
+#endif
+}
+
+void caffe_gpu_memcpy(const size_t N, const void *X, void *Y);
+
 template <typename Dtype>
 void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y);
 
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 7351f8bc..223e3278 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -33,6 +33,20 @@ typedef unsigned int uint32_t;
 //template <typename Dtype>
 //void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
 
+template <typename dtype> inline std::string get_dtype_suffix()
+{
+    dtype x;
+    const char type = typeid(x).name()[0];
+    std::string suffix;
+    switch(type){
+        case 'i': suffix = "_int"; break;
+        case 'd': suffix = "_double"; break;
+        case 'f':
+        default: suffix = "_float";
+    }
+    return suffix;
+}
+
 template <typename Dtype>
 void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num);
 
diff --git a/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt b/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt
new file mode 100644
index 00000000..37b1d0d3
--- /dev/null
+++ b/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt
@@ -0,0 +1,14 @@
+net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
+test_iter: 1
+test_interval: 1000
+base_lr: 0.01
+lr_policy: "step"
+gamma: 0.1
+stepsize: 100000
+display: 1
+max_iter: 450000
+momentum: 0.9
+weight_decay: 0.0005
+snapshot: 10000
+snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
+solver_mode: GPU
diff --git a/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt b/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt
new file mode 100644
index 00000000..f269ca0d
--- /dev/null
+++ b/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt
@@ -0,0 +1,366 @@
+name: "AlexNet"
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
+    batch_size: 256
+    backend: LMDB
+  }
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    mirror: false
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "fc8"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake b/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 00000000..7bb0014c
--- /dev/null
+++ b/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# Relative path conversion top directories.
+SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
+SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
+
+# Force unix paths in dependencies.
+SET(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake b/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake
new file mode 100644
index 00000000..1678bc46
--- /dev/null
+++ b/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake
@@ -0,0 +1,108 @@
+# The set of languages for which implicit dependencies are needed:
+SET(CMAKE_DEPENDS_LANGUAGES
+  "CXX"
+  )
+# The set of files for implicit dependencies of each language:
+SET(CMAKE_DEPENDS_CHECK_CXX
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/blob.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/common.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/device.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/net.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/solver.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o"
+  )
+SET(CMAKE_CXX_COMPILER_ID "GNU")
+
+# Preprocessor definitions for this target.
+SET(CMAKE_TARGET_DEFINITIONS
+  "GTEST_USE_OWN_TR1_TUPLE"
+  )
+
+# Targets to which this target links.
+SET(CMAKE_TARGET_LINKED_INFO_FILES
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake"
+  )
+
+# The include file search paths:
+SET(CMAKE_C_TARGET_INCLUDE_PATH
+  "src"
+  "/usr/local/include"
+  "include"
+  "/usr/local/cuda/include"
+  "/usr/local/include/opencv"
+  "/usr/include/atlas"
+  "."
+  )
+SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/caffe/CMakeFiles/caffe.dir/build.make b/src/caffe/CMakeFiles/caffe.dir/build.make
new file mode 100644
index 00000000..916913ae
--- /dev/null
+++ b/src/caffe/CMakeFiles/caffe.dir/build.make
@@ -0,0 +1,2542 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# Include any dependencies generated for this target.
+include src/caffe/CMakeFiles/caffe.dir/depend.make
+
+# Include the progress variables for this target.
+include src/caffe/CMakeFiles/caffe.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include src/caffe/CMakeFiles/caffe.dir/flags.make
+
+src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/util/math_functions.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/util/im2col.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/layers/cufiles/sigmoid_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_3)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/layers/cufiles/bnll_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_4)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/layers/cufiles/conv_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_5)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/layers/cufiles/pooling_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_6)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/layers/cufiles/log_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_7)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/layers/cufiles/reduction_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_8)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/layers/cufiles/silence_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_9)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/layers/cufiles/power_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_10)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/layers/cufiles/split_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_11)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/layers/cufiles/absval_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_12)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/layers/cufiles/hdf5_output_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_13)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/layers/cufiles/base_data_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_14)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/layers/cufiles/dropout_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_15)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/layers/cufiles/cudnn_tanh_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_16)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_17)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/layers/cufiles/relu_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_18)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/layers/cufiles/cudnn_conv_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_19)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/layers/cufiles/contrastive_loss_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_20)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/layers/cufiles/concat_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_21)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/layers/cufiles/softmax_loss_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_22)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/layers/cufiles/cudnn_softmax_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_23)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/layers/cufiles/inner_product_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_24)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/layers/cufiles/filter_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_25)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/layers/cufiles/prelu_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_26)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/layers/cufiles/im2col_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_27)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/layers/cufiles/hdf5_data_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_28)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/layers/cufiles/deconv_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_29)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/layers/cufiles/mvn_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_30)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/layers/cufiles/tanh_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_31)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/layers/cufiles/slice_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_32)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/layers/cufiles/threshold_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_33)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/layers/cufiles/lrn_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_34)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/layers/cufiles/eltwise_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_35)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/layers/cufiles/exp_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_36)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/layers/cufiles/euclidean_loss_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_37)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/layers/cufiles/cudnn_relu_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_38)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/layers/cufiles/cudnn_pooling_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_39)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/layers/cufiles/softmax_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_40)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
+src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_41)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
+
+src/caffe/CMakeFiles/caffe.dir/common.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/common.cpp.o: src/caffe/common.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_42)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/common.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/common.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp
+
+src/caffe/CMakeFiles/caffe.dir/common.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/common.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp > CMakeFiles/caffe.dir/common.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/common.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/common.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp -o CMakeFiles/caffe.dir/common.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/common.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/blob.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/blob.cpp.o: src/caffe/blob.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_43)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/blob.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/blob.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp
+
+src/caffe/CMakeFiles/caffe.dir/blob.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/blob.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp > CMakeFiles/caffe.dir/blob.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/blob.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/blob.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp -o CMakeFiles/caffe.dir/blob.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o: src/caffe/util/ocl_wrapper.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_44)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp > CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp -o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o: src/caffe/util/im2col.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_45)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/im2col.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/im2col.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp > CMakeFiles/caffe.dir/util/im2col.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/im2col.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp -o CMakeFiles/caffe.dir/util/im2col.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o: src/caffe/util/upgrade_proto.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_46)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp > CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp -o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o: src/caffe/util/db_leveldb.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_47)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db_leveldb.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db_leveldb.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp > CMakeFiles/caffe.dir/util/db_leveldb.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db_leveldb.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp -o CMakeFiles/caffe.dir/util/db_leveldb.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o: src/caffe/util/ocl_util.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_48)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/ocl_util.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/ocl_util.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp > CMakeFiles/caffe.dir/util/ocl_util.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/ocl_util.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp -o CMakeFiles/caffe.dir/util/ocl_util.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o: src/caffe/util/insert_splits.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_49)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/insert_splits.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/insert_splits.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp > CMakeFiles/caffe.dir/util/insert_splits.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/insert_splits.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp -o CMakeFiles/caffe.dir/util/insert_splits.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o: src/caffe/util/db_lmdb.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_50)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db_lmdb.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db_lmdb.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp > CMakeFiles/caffe.dir/util/db_lmdb.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db_lmdb.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp -o CMakeFiles/caffe.dir/util/db_lmdb.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o: src/caffe/util/math_functions.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_51)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/math_functions.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/math_functions.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp > CMakeFiles/caffe.dir/util/math_functions.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/math_functions.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp -o CMakeFiles/caffe.dir/util/math_functions.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o: src/caffe/util/io.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_52)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/io.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/io.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/io.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp > CMakeFiles/caffe.dir/util/io.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/io.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/io.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp -o CMakeFiles/caffe.dir/util/io.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o: src/caffe/util/cudnn.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_53)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/cudnn.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/cudnn.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp > CMakeFiles/caffe.dir/util/cudnn.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/cudnn.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp -o CMakeFiles/caffe.dir/util/cudnn.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o: src/caffe/util/db.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_54)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/db.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp > CMakeFiles/caffe.dir/util/db.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/db.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp -o CMakeFiles/caffe.dir/util/db.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o: src/caffe/util/benchmark.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_55)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/benchmark.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp
+
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/benchmark.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp > CMakeFiles/caffe.dir/util/benchmark.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/benchmark.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp -o CMakeFiles/caffe.dir/util/benchmark.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/device.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/device.cpp.o: src/caffe/device.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_56)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/device.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/device.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp
+
+src/caffe/CMakeFiles/caffe.dir/device.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/device.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp > CMakeFiles/caffe.dir/device.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/device.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/device.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp -o CMakeFiles/caffe.dir/device.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/device.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o: src/caffe/internal_thread.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_57)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/internal_thread.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp
+
+src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/internal_thread.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp > CMakeFiles/caffe.dir/internal_thread.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/internal_thread.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp -o CMakeFiles/caffe.dir/internal_thread.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o: src/caffe/data_transformer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_58)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/data_transformer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/data_transformer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp > CMakeFiles/caffe.dir/data_transformer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/data_transformer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp -o CMakeFiles/caffe.dir/data_transformer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/net.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/net.cpp.o: src/caffe/net.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_59)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/net.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/net.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp
+
+src/caffe/CMakeFiles/caffe.dir/net.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/net.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp > CMakeFiles/caffe.dir/net.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/net.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/net.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp -o CMakeFiles/caffe.dir/net.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/net.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/solver.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/solver.cpp.o: src/caffe/solver.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_60)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/solver.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp
+
+src/caffe/CMakeFiles/caffe.dir/solver.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/solver.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp > CMakeFiles/caffe.dir/solver.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/solver.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/solver.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp -o CMakeFiles/caffe.dir/solver.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o: src/caffe/layer_factory.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_61)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layer_factory.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layer_factory.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp > CMakeFiles/caffe.dir/layer_factory.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layer_factory.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp -o CMakeFiles/caffe.dir/layer_factory.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o: src/caffe/syncedmem.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_62)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/syncedmem.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp
+
+src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/syncedmem.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp > CMakeFiles/caffe.dir/syncedmem.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/syncedmem.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp -o CMakeFiles/caffe.dir/syncedmem.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o: src/caffe/layers/deconv_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_63)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp > CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp -o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o: src/caffe/layers/infogain_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_64)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp > CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o: src/caffe/layers/log_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_65)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/log_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/log_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp > CMakeFiles/caffe.dir/layers/log_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/log_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp -o CMakeFiles/caffe.dir/layers/log_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o: src/caffe/layers/base_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_66)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp > CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp -o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o: src/caffe/layers/euclidean_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_67)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp > CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o: src/caffe/layers/image_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_68)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp > CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp -o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o: src/caffe/layers/sigmoid_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_69)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp > CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp -o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o: src/caffe/layers/cudnn_softmax_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_70)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o: src/caffe/layers/cudnn_tanh_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_71)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o: src/caffe/layers/spp_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_72)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/spp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/spp_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp > CMakeFiles/caffe.dir/layers/spp_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/spp_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp -o CMakeFiles/caffe.dir/layers/spp_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o: src/caffe/layers/hdf5_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_73)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp > CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp -o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o: src/caffe/layers/exp_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_74)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/exp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/exp_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp > CMakeFiles/caffe.dir/layers/exp_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/exp_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp -o CMakeFiles/caffe.dir/layers/exp_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o: src/caffe/layers/power_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_75)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/power_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/power_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp > CMakeFiles/caffe.dir/layers/power_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/power_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp -o CMakeFiles/caffe.dir/layers/power_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o: src/caffe/layers/relu_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_76)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/relu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/relu_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp > CMakeFiles/caffe.dir/layers/relu_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/relu_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp -o CMakeFiles/caffe.dir/layers/relu_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o: src/caffe/layers/split_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_77)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/split_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/split_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp > CMakeFiles/caffe.dir/layers/split_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/split_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp -o CMakeFiles/caffe.dir/layers/split_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o: src/caffe/layers/window_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_78)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp > CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp -o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o: src/caffe/layers/dropout_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_79)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp > CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp -o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o: src/caffe/layers/cudnn_sigmoid_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_80)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o: src/caffe/layers/silence_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_81)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/silence_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/silence_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp > CMakeFiles/caffe.dir/layers/silence_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/silence_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp -o CMakeFiles/caffe.dir/layers/silence_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o: src/caffe/layers/cudnn_pooling_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_82)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o: src/caffe/layers/lrn_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_83)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp > CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp -o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o: src/caffe/layers/memory_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_84)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp > CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp -o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o: src/caffe/layers/mvn_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_85)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp > CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp -o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o: src/caffe/layers/cudnn_relu_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_86)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o: src/caffe/layers/slice_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_87)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/slice_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/slice_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp > CMakeFiles/caffe.dir/layers/slice_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/slice_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp -o CMakeFiles/caffe.dir/layers/slice_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o: src/caffe/layers/pooling_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_88)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp > CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp -o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o: src/caffe/layers/hdf5_output_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_89)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp > CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp -o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o: src/caffe/layers/inner_product_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_90)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp > CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp -o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o: src/caffe/layers/threshold_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_91)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp > CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp -o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o: src/caffe/layers/reduction_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_92)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp > CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp -o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o: src/caffe/layers/tanh_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_93)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp > CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp -o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o: src/caffe/layers/prelu_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_94)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp > CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp -o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o: src/caffe/layers/accuracy_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_95)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp > CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp -o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o: src/caffe/layers/neuron_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_96)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp > CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp -o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o: src/caffe/layers/absval_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_97)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/absval_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/absval_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp > CMakeFiles/caffe.dir/layers/absval_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/absval_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp -o CMakeFiles/caffe.dir/layers/absval_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o: src/caffe/layers/loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_98)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp > CMakeFiles/caffe.dir/layers/loss_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp -o CMakeFiles/caffe.dir/layers/loss_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o: src/caffe/layers/softmax_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_99)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp > CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp -o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o: src/caffe/layers/cudnn_conv_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_100)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_101)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp > CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o: src/caffe/layers/concat_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_102)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/concat_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/concat_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp > CMakeFiles/caffe.dir/layers/concat_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/concat_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp -o CMakeFiles/caffe.dir/layers/concat_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o: src/caffe/layers/hinge_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_103)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp > CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o: src/caffe/layers/bnll_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_104)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp > CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp -o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o: src/caffe/layers/flatten_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_105)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp > CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp -o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o: src/caffe/layers/argmax_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_106)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp > CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp -o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o: src/caffe/layers/filter_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_107)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/filter_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/filter_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp > CMakeFiles/caffe.dir/layers/filter_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/filter_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp -o CMakeFiles/caffe.dir/layers/filter_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o: src/caffe/layers/dummy_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_108)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp > CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp -o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o: src/caffe/layers/conv_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_109)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/conv_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp > CMakeFiles/caffe.dir/layers/conv_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/conv_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp -o CMakeFiles/caffe.dir/layers/conv_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o: src/caffe/layers/base_conv_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_110)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp > CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp -o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o: src/caffe/layers/data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_111)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp > CMakeFiles/caffe.dir/layers/data_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp -o CMakeFiles/caffe.dir/layers/data_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o: src/caffe/layers/softmax_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_112)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp > CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o: src/caffe/layers/eltwise_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_113)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp > CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp -o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o: src/caffe/layers/im2col_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_114)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp > CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp -o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o: src/caffe/layers/multinomial_logistic_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_115)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp > CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o: src/caffe/layers/contrastive_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_116)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp > CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o
+
+src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
+src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o: src/caffe/layers/reshape_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_117)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp
+
+src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp > CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i
+
+src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp -o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s
+
+src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires:
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires
+
+src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides.build
+.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides
+
+src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o
+
+# Object files for target caffe
+caffe_OBJECTS = \
+"CMakeFiles/caffe.dir/common.cpp.o" \
+"CMakeFiles/caffe.dir/blob.cpp.o" \
+"CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" \
+"CMakeFiles/caffe.dir/util/im2col.cpp.o" \
+"CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" \
+"CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" \
+"CMakeFiles/caffe.dir/util/ocl_util.cpp.o" \
+"CMakeFiles/caffe.dir/util/insert_splits.cpp.o" \
+"CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" \
+"CMakeFiles/caffe.dir/util/math_functions.cpp.o" \
+"CMakeFiles/caffe.dir/util/io.cpp.o" \
+"CMakeFiles/caffe.dir/util/cudnn.cpp.o" \
+"CMakeFiles/caffe.dir/util/db.cpp.o" \
+"CMakeFiles/caffe.dir/util/benchmark.cpp.o" \
+"CMakeFiles/caffe.dir/device.cpp.o" \
+"CMakeFiles/caffe.dir/internal_thread.cpp.o" \
+"CMakeFiles/caffe.dir/data_transformer.cpp.o" \
+"CMakeFiles/caffe.dir/net.cpp.o" \
+"CMakeFiles/caffe.dir/solver.cpp.o" \
+"CMakeFiles/caffe.dir/layer_factory.cpp.o" \
+"CMakeFiles/caffe.dir/syncedmem.cpp.o" \
+"CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/log_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/power_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/split_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/data_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" \
+"CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o"
+
+# External object files for target caffe
+caffe_EXTERNAL_OBJECTS = \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o" \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o"
+
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/common.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/device.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/net.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/build.make
+lib/libcaffe.so: lib/libproto.a
+lib/libcaffe.so: lib/libproto.a
+lib/libcaffe.so: /usr/local/lib/libboost_system.so
+lib/libcaffe.so: /usr/local/lib/libboost_thread.so
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libpthread.so
+lib/libcaffe.so: /usr/local/lib/libglog.so
+lib/libcaffe.so: /usr/local/lib/libgflags.a
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libprotobuf.so
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5.so
+lib/libcaffe.so: /usr/local/lib/liblmdb.so
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libleveldb.so
+lib/libcaffe.so: /usr/lib/libsnappy.so
+lib/libcaffe.so: /usr/local/cuda/lib64/libcudart.so
+lib/libcaffe.so: /usr/local/cuda/lib64/libcurand.so
+lib/libcaffe.so: /usr/local/cuda/lib64/libcublas.so
+lib/libcaffe.so: /usr/local/lib/libopencv_core.so.2.4.10
+lib/libcaffe.so: /usr/local/lib/libopencv_highgui.so.2.4.10
+lib/libcaffe.so: /usr/local/lib/libopencv_imgproc.so.2.4.10
+lib/libcaffe.so: /usr/lib/liblapack_atlas.so
+lib/libcaffe.so: /usr/lib/libcblas.so
+lib/libcaffe.so: /usr/lib/libatlas.so
+lib/libcaffe.so: /usr/local/lib/libglog.so
+lib/libcaffe.so: /usr/local/lib/libgflags.a
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libprotobuf.so
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5.so
+lib/libcaffe.so: /usr/local/lib/liblmdb.so
+lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libleveldb.so
+lib/libcaffe.so: /usr/lib/libsnappy.so
+lib/libcaffe.so: /usr/local/cuda/lib64/libcudart.so
+lib/libcaffe.so: /usr/local/cuda/lib64/libcurand.so
+lib/libcaffe.so: /usr/local/cuda/lib64/libcublas.so
+lib/libcaffe.so: /usr/lib/liblapack_atlas.so
+lib/libcaffe.so: /usr/lib/libcblas.so
+lib/libcaffe.so: /usr/lib/libatlas.so
+lib/libcaffe.so: /usr/local/lib/libopencv_core.so.2.4.10
+lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX shared library ../../lib/libcaffe.so"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/caffe.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+src/caffe/CMakeFiles/caffe.dir/build: lib/libcaffe.so
+.PHONY : src/caffe/CMakeFiles/caffe.dir/build
+
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires
+src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires
+.PHONY : src/caffe/CMakeFiles/caffe.dir/requires
+
+src/caffe/CMakeFiles/caffe.dir/clean:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/caffe.dir/cmake_clean.cmake
+.PHONY : src/caffe/CMakeFiles/caffe.dir/clean
+
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o
+src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : src/caffe/CMakeFiles/caffe.dir/depend
+
diff --git a/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake b/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake
new file mode 100644
index 00000000..344db002
--- /dev/null
+++ b/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake
@@ -0,0 +1,126 @@
+FILE(REMOVE_RECURSE
+  "CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o"
+  "CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o"
+  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o"
+  "CMakeFiles/caffe.dir/common.cpp.o"
+  "CMakeFiles/caffe.dir/blob.cpp.o"
+  "CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o"
+  "CMakeFiles/caffe.dir/util/im2col.cpp.o"
+  "CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o"
+  "CMakeFiles/caffe.dir/util/db_leveldb.cpp.o"
+  "CMakeFiles/caffe.dir/util/ocl_util.cpp.o"
+  "CMakeFiles/caffe.dir/util/insert_splits.cpp.o"
+  "CMakeFiles/caffe.dir/util/db_lmdb.cpp.o"
+  "CMakeFiles/caffe.dir/util/math_functions.cpp.o"
+  "CMakeFiles/caffe.dir/util/io.cpp.o"
+  "CMakeFiles/caffe.dir/util/cudnn.cpp.o"
+  "CMakeFiles/caffe.dir/util/db.cpp.o"
+  "CMakeFiles/caffe.dir/util/benchmark.cpp.o"
+  "CMakeFiles/caffe.dir/device.cpp.o"
+  "CMakeFiles/caffe.dir/internal_thread.cpp.o"
+  "CMakeFiles/caffe.dir/data_transformer.cpp.o"
+  "CMakeFiles/caffe.dir/net.cpp.o"
+  "CMakeFiles/caffe.dir/solver.cpp.o"
+  "CMakeFiles/caffe.dir/layer_factory.cpp.o"
+  "CMakeFiles/caffe.dir/syncedmem.cpp.o"
+  "CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/log_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/spp_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/exp_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/power_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/relu_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/split_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/silence_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/slice_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/absval_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/loss_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/concat_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/filter_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/conv_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/data_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o"
+  "CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o"
+  "../../lib/libcaffe.pdb"
+  "../../lib/libcaffe.so"
+)
+
+# Per-language clean rules from dependency scanning.
+FOREACH(lang CXX)
+  INCLUDE(CMakeFiles/caffe.dir/cmake_clean_${lang}.cmake OPTIONAL)
+ENDFOREACH(lang)
diff --git a/src/caffe/CMakeFiles/caffe.dir/depend.make b/src/caffe/CMakeFiles/caffe.dir/depend.make
new file mode 100644
index 00000000..0b20d16b
--- /dev/null
+++ b/src/caffe/CMakeFiles/caffe.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for caffe.
+# This may be replaced when dependencies are built.
diff --git a/src/caffe/CMakeFiles/caffe.dir/flags.make b/src/caffe/CMakeFiles/caffe.dir/flags.make
new file mode 100644
index 00000000..494d36e8
--- /dev/null
+++ b/src/caffe/CMakeFiles/caffe.dir/flags.make
@@ -0,0 +1,8 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# compile CXX with /usr/bin/c++
+CXX_FLAGS =  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -fPIC -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe   
+
+CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE -Dcaffe_EXPORTS
+
diff --git a/src/caffe/CMakeFiles/caffe.dir/link.txt b/src/caffe/CMakeFiles/caffe.dir/link.txt
new file mode 100644
index 00000000..603d461f
--- /dev/null
+++ b/src/caffe/CMakeFiles/caffe.dir/link.txt
@@ -0,0 +1 @@
+/usr/bin/c++  -fPIC  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG   -shared -Wl,-soname,libcaffe.so -o ../../lib/libcaffe.so CMakeFiles/caffe.dir/common.cpp.o CMakeFiles/caffe.dir/blob.cpp.o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o CMakeFiles/caffe.dir/util/im2col.cpp.o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o CMakeFiles/caffe.dir/util/db_leveldb.cpp.o CMakeFiles/caffe.dir/util/ocl_util.cpp.o CMakeFiles/caffe.dir/util/insert_splits.cpp.o CMakeFiles/caffe.dir/util/db_lmdb.cpp.o CMakeFiles/caffe.dir/util/math_functions.cpp.o CMakeFiles/caffe.dir/util/io.cpp.o CMakeFiles/caffe.dir/util/cudnn.cpp.o CMakeFiles/caffe.dir/util/db.cpp.o CMakeFiles/caffe.dir/util/benchmark.cpp.o CMakeFiles/caffe.dir/device.cpp.o CMakeFiles/caffe.dir/internal_thread.cpp.o CMakeFiles/caffe.dir/data_transformer.cpp.o CMakeFiles/caffe.dir/net.cpp.o CMakeFiles/caffe.dir/solver.cpp.o CMakeFiles/caffe.dir/layer_factory.cpp.o CMakeFiles/caffe.dir/syncedmem.cpp.o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/log_layer.cpp.o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o CMakeFiles/caffe.dir/layers/spp_layer.cpp.o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o CMakeFiles/caffe.dir/layers/exp_layer.cpp.o CMakeFiles/caffe.dir/layers/power_layer.cpp.o CMakeFiles/caffe.dir/layers/relu_layer.cpp.o CMakeFiles/caffe.dir/layers/split_layer.cpp.o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o CMakeFiles/caffe.dir/layers/silence_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o CMakeFiles/caffe.dir/layers/slice_layer.cpp.o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o CMakeFiles/caffe.dir/layers/absval_layer.cpp.o CMakeFiles/caffe.dir/layers/loss_layer.cpp.o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/concat_layer.cpp.o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o CMakeFiles/caffe.dir/layers/filter_layer.cpp.o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o CMakeFiles/caffe.dir/layers/conv_layer.cpp.o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o CMakeFiles/caffe.dir/layers/data_layer.cpp.o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o  -L/usr/local/cuda/lib64  -L/usr/local/lib ../../lib/libproto.a ../../lib/libproto.a /usr/local/lib/libboost_system.so /usr/local/lib/libboost_thread.so -lpthread -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lpthread -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so /usr/local/lib/libopencv_core.so.2.4.10 /usr/local/lib/libopencv_highgui.so.2.4.10 /usr/local/lib/libopencv_imgproc.so.2.4.10 -llapack_atlas -lcblas -latlas /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so -llapack_atlas -lcblas -latlas /usr/local/lib/libopencv_core.so.2.4.10 -Wl,-rpath,/usr/local/cuda/lib64:/usr/local/lib::::::::::::::::::::::::::::::::::::::::::::::::::::::::: 
diff --git a/src/caffe/CMakeFiles/caffe.dir/progress.make b/src/caffe/CMakeFiles/caffe.dir/progress.make
new file mode 100644
index 00000000..d53ba6a8
--- /dev/null
+++ b/src/caffe/CMakeFiles/caffe.dir/progress.make
@@ -0,0 +1,118 @@
+CMAKE_PROGRESS_1 = 
+CMAKE_PROGRESS_2 = 1
+CMAKE_PROGRESS_3 = 
+CMAKE_PROGRESS_4 = 2
+CMAKE_PROGRESS_5 = 
+CMAKE_PROGRESS_6 = 3
+CMAKE_PROGRESS_7 = 
+CMAKE_PROGRESS_8 = 4
+CMAKE_PROGRESS_9 = 
+CMAKE_PROGRESS_10 = 5
+CMAKE_PROGRESS_11 = 
+CMAKE_PROGRESS_12 = 6
+CMAKE_PROGRESS_13 = 
+CMAKE_PROGRESS_14 = 7
+CMAKE_PROGRESS_15 = 
+CMAKE_PROGRESS_16 = 8
+CMAKE_PROGRESS_17 = 
+CMAKE_PROGRESS_18 = 9
+CMAKE_PROGRESS_19 = 
+CMAKE_PROGRESS_20 = 10
+CMAKE_PROGRESS_21 = 
+CMAKE_PROGRESS_22 = 11
+CMAKE_PROGRESS_23 = 
+CMAKE_PROGRESS_24 = 12
+CMAKE_PROGRESS_25 = 
+CMAKE_PROGRESS_26 = 13
+CMAKE_PROGRESS_27 = 
+CMAKE_PROGRESS_28 = 14
+CMAKE_PROGRESS_29 = 
+CMAKE_PROGRESS_30 = 15
+CMAKE_PROGRESS_31 = 
+CMAKE_PROGRESS_32 = 16
+CMAKE_PROGRESS_33 = 
+CMAKE_PROGRESS_34 = 17
+CMAKE_PROGRESS_35 = 
+CMAKE_PROGRESS_36 = 18
+CMAKE_PROGRESS_37 = 
+CMAKE_PROGRESS_38 = 19
+CMAKE_PROGRESS_39 = 
+CMAKE_PROGRESS_40 = 20
+CMAKE_PROGRESS_41 = 
+CMAKE_PROGRESS_42 = 21
+CMAKE_PROGRESS_43 = 
+CMAKE_PROGRESS_44 = 22
+CMAKE_PROGRESS_45 = 
+CMAKE_PROGRESS_46 = 23
+CMAKE_PROGRESS_47 = 
+CMAKE_PROGRESS_48 = 24
+CMAKE_PROGRESS_49 = 
+CMAKE_PROGRESS_50 = 25
+CMAKE_PROGRESS_51 = 
+CMAKE_PROGRESS_52 = 26
+CMAKE_PROGRESS_53 = 
+CMAKE_PROGRESS_54 = 27
+CMAKE_PROGRESS_55 = 
+CMAKE_PROGRESS_56 = 28
+CMAKE_PROGRESS_57 = 
+CMAKE_PROGRESS_58 = 29
+CMAKE_PROGRESS_59 = 
+CMAKE_PROGRESS_60 = 30
+CMAKE_PROGRESS_61 = 
+CMAKE_PROGRESS_62 = 31
+CMAKE_PROGRESS_63 = 
+CMAKE_PROGRESS_64 = 32
+CMAKE_PROGRESS_65 = 
+CMAKE_PROGRESS_66 = 33
+CMAKE_PROGRESS_67 = 34
+CMAKE_PROGRESS_68 = 
+CMAKE_PROGRESS_69 = 35
+CMAKE_PROGRESS_70 = 
+CMAKE_PROGRESS_71 = 36
+CMAKE_PROGRESS_72 = 
+CMAKE_PROGRESS_73 = 37
+CMAKE_PROGRESS_74 = 
+CMAKE_PROGRESS_75 = 38
+CMAKE_PROGRESS_76 = 
+CMAKE_PROGRESS_77 = 39
+CMAKE_PROGRESS_78 = 
+CMAKE_PROGRESS_79 = 40
+CMAKE_PROGRESS_80 = 
+CMAKE_PROGRESS_81 = 41
+CMAKE_PROGRESS_82 = 
+CMAKE_PROGRESS_83 = 42
+CMAKE_PROGRESS_84 = 
+CMAKE_PROGRESS_85 = 43
+CMAKE_PROGRESS_86 = 
+CMAKE_PROGRESS_87 = 44
+CMAKE_PROGRESS_88 = 
+CMAKE_PROGRESS_89 = 45
+CMAKE_PROGRESS_90 = 
+CMAKE_PROGRESS_91 = 46
+CMAKE_PROGRESS_92 = 
+CMAKE_PROGRESS_93 = 47
+CMAKE_PROGRESS_94 = 
+CMAKE_PROGRESS_95 = 48
+CMAKE_PROGRESS_96 = 
+CMAKE_PROGRESS_97 = 49
+CMAKE_PROGRESS_98 = 
+CMAKE_PROGRESS_99 = 50
+CMAKE_PROGRESS_100 = 
+CMAKE_PROGRESS_101 = 51
+CMAKE_PROGRESS_102 = 
+CMAKE_PROGRESS_103 = 52
+CMAKE_PROGRESS_104 = 
+CMAKE_PROGRESS_105 = 53
+CMAKE_PROGRESS_106 = 
+CMAKE_PROGRESS_107 = 54
+CMAKE_PROGRESS_108 = 
+CMAKE_PROGRESS_109 = 55
+CMAKE_PROGRESS_110 = 
+CMAKE_PROGRESS_111 = 56
+CMAKE_PROGRESS_112 = 
+CMAKE_PROGRESS_113 = 57
+CMAKE_PROGRESS_114 = 
+CMAKE_PROGRESS_115 = 58
+CMAKE_PROGRESS_116 = 
+CMAKE_PROGRESS_117 = 59
+
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
new file mode 100644
index 00000000..2b3197e9
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/absval_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
new file mode 100644
index 00000000..5558d70f
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/base_data_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
new file mode 100644
index 00000000..ae71cc72
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/bnll_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
new file mode 100644
index 00000000..48e8560a
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/concat_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
new file mode 100644
index 00000000..c5f6dca9
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/contrastive_loss_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
new file mode 100644
index 00000000..311ad242
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/conv_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
new file mode 100644
index 00000000..06210cf1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_conv_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
new file mode 100644
index 00000000..8f7960d4
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_pooling_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
new file mode 100644
index 00000000..308889ee
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_relu_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
new file mode 100644
index 00000000..d65ebd00
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
new file mode 100644
index 00000000..806067ce
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_softmax_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
new file mode 100644
index 00000000..7ace65eb
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_tanh_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
new file mode 100644
index 00000000..bc67ea5b
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/deconv_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
new file mode 100644
index 00000000..5ff06e9f
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/dropout_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
new file mode 100644
index 00000000..44e91898
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/eltwise_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
new file mode 100644
index 00000000..98ee3de7
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/euclidean_loss_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
new file mode 100644
index 00000000..2402999e
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/exp_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
new file mode 100644
index 00000000..83a032df
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/filter_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
new file mode 100644
index 00000000..a88ed54d
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/hdf5_data_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
new file mode 100644
index 00000000..252b9dfd
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/hdf5_output_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
new file mode 100644
index 00000000..6bda58ec
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/im2col_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
new file mode 100644
index 00000000..eac6680c
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/inner_product_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
new file mode 100644
index 00000000..d18371a0
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/log_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
new file mode 100644
index 00000000..c3c715f8
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/lrn_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
new file mode 100644
index 00000000..663f4478
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/mvn_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
new file mode 100644
index 00000000..866d0f93
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/pooling_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
new file mode 100644
index 00000000..c6c30190
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/power_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
new file mode 100644
index 00000000..c64cff0e
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/prelu_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
new file mode 100644
index 00000000..b926deab
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/reduction_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
new file mode 100644
index 00000000..27fda108
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/relu_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
new file mode 100644
index 00000000..63d7ac68
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend
new file mode 100644
index 00000000..a7e2268a
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend
@@ -0,0 +1,470 @@
+# Generated by: make2cmake.cmake
+SET(CUDA_NVCC_DEPEND
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/blob.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_transformer.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/filler.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/internal_thread.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer_factory.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/loss_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/neuron_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/syncedmem.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/db.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/vision_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu"
+ "/opt/clBLAS-private-april8/include/clBLAS-complex.h"
+ "/opt/clBLAS-private-april8/include/clBLAS.h"
+ "/usr/include/H5ACpublic.h"
+ "/usr/include/H5Apublic.h"
+ "/usr/include/H5Cpublic.h"
+ "/usr/include/H5Dpublic.h"
+ "/usr/include/H5Epubgen.h"
+ "/usr/include/H5Epublic.h"
+ "/usr/include/H5FDcore.h"
+ "/usr/include/H5FDdirect.h"
+ "/usr/include/H5FDfamily.h"
+ "/usr/include/H5FDlog.h"
+ "/usr/include/H5FDmpi.h"
+ "/usr/include/H5FDmpio.h"
+ "/usr/include/H5FDmpiposix.h"
+ "/usr/include/H5FDmulti.h"
+ "/usr/include/H5FDpublic.h"
+ "/usr/include/H5FDsec2.h"
+ "/usr/include/H5FDstdio.h"
+ "/usr/include/H5Fpublic.h"
+ "/usr/include/H5Gpublic.h"
+ "/usr/include/H5Ipublic.h"
+ "/usr/include/H5Lpublic.h"
+ "/usr/include/H5MMpublic.h"
+ "/usr/include/H5Opublic.h"
+ "/usr/include/H5Ppublic.h"
+ "/usr/include/H5Rpublic.h"
+ "/usr/include/H5Spublic.h"
+ "/usr/include/H5Tpublic.h"
+ "/usr/include/H5Zpublic.h"
+ "/usr/include/H5api_adpt.h"
+ "/usr/include/H5pubconf.h"
+ "/usr/include/H5public.h"
+ "/usr/include/H5version.h"
+ "/usr/include/_G_config.h"
+ "/usr/include/alloca.h"
+ "/usr/include/asm-generic/errno-base.h"
+ "/usr/include/asm-generic/errno.h"
+ "/usr/include/assert.h"
+ "/usr/include/atlas/cblas.h"
+ "/usr/include/c++/4.8/algorithm"
+ "/usr/include/c++/4.8/backward/auto_ptr.h"
+ "/usr/include/c++/4.8/backward/binders.h"
+ "/usr/include/c++/4.8/bits/algorithmfwd.h"
+ "/usr/include/c++/4.8/bits/allocator.h"
+ "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h"
+ "/usr/include/c++/4.8/bits/basic_ios.h"
+ "/usr/include/c++/4.8/bits/basic_ios.tcc"
+ "/usr/include/c++/4.8/bits/basic_string.h"
+ "/usr/include/c++/4.8/bits/basic_string.tcc"
+ "/usr/include/c++/4.8/bits/char_traits.h"
+ "/usr/include/c++/4.8/bits/codecvt.h"
+ "/usr/include/c++/4.8/bits/concept_check.h"
+ "/usr/include/c++/4.8/bits/cpp_type_traits.h"
+ "/usr/include/c++/4.8/bits/cxxabi_forced.h"
+ "/usr/include/c++/4.8/bits/exception_defines.h"
+ "/usr/include/c++/4.8/bits/fstream.tcc"
+ "/usr/include/c++/4.8/bits/functexcept.h"
+ "/usr/include/c++/4.8/bits/ios_base.h"
+ "/usr/include/c++/4.8/bits/istream.tcc"
+ "/usr/include/c++/4.8/bits/locale_classes.h"
+ "/usr/include/c++/4.8/bits/locale_classes.tcc"
+ "/usr/include/c++/4.8/bits/locale_facets.h"
+ "/usr/include/c++/4.8/bits/locale_facets.tcc"
+ "/usr/include/c++/4.8/bits/localefwd.h"
+ "/usr/include/c++/4.8/bits/memoryfwd.h"
+ "/usr/include/c++/4.8/bits/move.h"
+ "/usr/include/c++/4.8/bits/ostream.tcc"
+ "/usr/include/c++/4.8/bits/ostream_insert.h"
+ "/usr/include/c++/4.8/bits/postypes.h"
+ "/usr/include/c++/4.8/bits/range_access.h"
+ "/usr/include/c++/4.8/bits/sstream.tcc"
+ "/usr/include/c++/4.8/bits/stl_algo.h"
+ "/usr/include/c++/4.8/bits/stl_algobase.h"
+ "/usr/include/c++/4.8/bits/stl_bvector.h"
+ "/usr/include/c++/4.8/bits/stl_construct.h"
+ "/usr/include/c++/4.8/bits/stl_function.h"
+ "/usr/include/c++/4.8/bits/stl_heap.h"
+ "/usr/include/c++/4.8/bits/stl_iterator.h"
+ "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h"
+ "/usr/include/c++/4.8/bits/stl_iterator_base_types.h"
+ "/usr/include/c++/4.8/bits/stl_map.h"
+ "/usr/include/c++/4.8/bits/stl_multimap.h"
+ "/usr/include/c++/4.8/bits/stl_multiset.h"
+ "/usr/include/c++/4.8/bits/stl_pair.h"
+ "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h"
+ "/usr/include/c++/4.8/bits/stl_relops.h"
+ "/usr/include/c++/4.8/bits/stl_set.h"
+ "/usr/include/c++/4.8/bits/stl_tempbuf.h"
+ "/usr/include/c++/4.8/bits/stl_tree.h"
+ "/usr/include/c++/4.8/bits/stl_uninitialized.h"
+ "/usr/include/c++/4.8/bits/stl_vector.h"
+ "/usr/include/c++/4.8/bits/stream_iterator.h"
+ "/usr/include/c++/4.8/bits/streambuf.tcc"
+ "/usr/include/c++/4.8/bits/streambuf_iterator.h"
+ "/usr/include/c++/4.8/bits/stringfwd.h"
+ "/usr/include/c++/4.8/bits/vector.tcc"
+ "/usr/include/c++/4.8/cctype"
+ "/usr/include/c++/4.8/cfloat"
+ "/usr/include/c++/4.8/climits"
+ "/usr/include/c++/4.8/clocale"
+ "/usr/include/c++/4.8/cmath"
+ "/usr/include/c++/4.8/cstddef"
+ "/usr/include/c++/4.8/cstdio"
+ "/usr/include/c++/4.8/cstdlib"
+ "/usr/include/c++/4.8/cwchar"
+ "/usr/include/c++/4.8/cwctype"
+ "/usr/include/c++/4.8/cxxabi.h"
+ "/usr/include/c++/4.8/debug/debug.h"
+ "/usr/include/c++/4.8/exception"
+ "/usr/include/c++/4.8/ext/alloc_traits.h"
+ "/usr/include/c++/4.8/ext/atomicity.h"
+ "/usr/include/c++/4.8/ext/new_allocator.h"
+ "/usr/include/c++/4.8/ext/numeric_traits.h"
+ "/usr/include/c++/4.8/ext/type_traits.h"
+ "/usr/include/c++/4.8/fstream"
+ "/usr/include/c++/4.8/functional"
+ "/usr/include/c++/4.8/ios"
+ "/usr/include/c++/4.8/iosfwd"
+ "/usr/include/c++/4.8/iostream"
+ "/usr/include/c++/4.8/istream"
+ "/usr/include/c++/4.8/iterator"
+ "/usr/include/c++/4.8/map"
+ "/usr/include/c++/4.8/memory"
+ "/usr/include/c++/4.8/new"
+ "/usr/include/c++/4.8/ostream"
+ "/usr/include/c++/4.8/set"
+ "/usr/include/c++/4.8/sstream"
+ "/usr/include/c++/4.8/streambuf"
+ "/usr/include/c++/4.8/string"
+ "/usr/include/c++/4.8/typeinfo"
+ "/usr/include/c++/4.8/utility"
+ "/usr/include/c++/4.8/vector"
+ "/usr/include/ctype.h"
+ "/usr/include/endian.h"
+ "/usr/include/errno.h"
+ "/usr/include/features.h"
+ "/usr/include/getopt.h"
+ "/usr/include/google/protobuf/descriptor.h"
+ "/usr/include/google/protobuf/extension_set.h"
+ "/usr/include/google/protobuf/generated_enum_reflection.h"
+ "/usr/include/google/protobuf/generated_message_util.h"
+ "/usr/include/google/protobuf/message.h"
+ "/usr/include/google/protobuf/message_lite.h"
+ "/usr/include/google/protobuf/repeated_field.h"
+ "/usr/include/google/protobuf/stubs/common.h"
+ "/usr/include/google/protobuf/stubs/template_util.h"
+ "/usr/include/google/protobuf/stubs/type_traits.h"
+ "/usr/include/google/protobuf/unknown_field_set.h"
+ "/usr/include/hdf5.h"
+ "/usr/include/inttypes.h"
+ "/usr/include/libio.h"
+ "/usr/include/limits.h"
+ "/usr/include/linux/errno.h"
+ "/usr/include/linux/limits.h"
+ "/usr/include/locale.h"
+ "/usr/include/math.h"
+ "/usr/include/pthread.h"
+ "/usr/include/sched.h"
+ "/usr/include/stdc-predef.h"
+ "/usr/include/stdint.h"
+ "/usr/include/stdio.h"
+ "/usr/include/stdlib.h"
+ "/usr/include/string.h"
+ "/usr/include/time.h"
+ "/usr/include/unistd.h"
+ "/usr/include/wchar.h"
+ "/usr/include/wctype.h"
+ "/usr/include/x86_64-linux-gnu/asm/errno.h"
+ "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h"
+ "/usr/include/x86_64-linux-gnu/bits/byteswap.h"
+ "/usr/include/x86_64-linux-gnu/bits/confname.h"
+ "/usr/include/x86_64-linux-gnu/bits/endian.h"
+ "/usr/include/x86_64-linux-gnu/bits/environments.h"
+ "/usr/include/x86_64-linux-gnu/bits/errno.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_val.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_valf.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_vall.h"
+ "/usr/include/x86_64-linux-gnu/bits/inf.h"
+ "/usr/include/x86_64-linux-gnu/bits/local_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/locale.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathcalls.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathdef.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathinline.h"
+ "/usr/include/x86_64-linux-gnu/bits/nan.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix_opt.h"
+ "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h"
+ "/usr/include/x86_64-linux-gnu/bits/sched.h"
+ "/usr/include/x86_64-linux-gnu/bits/select.h"
+ "/usr/include/x86_64-linux-gnu/bits/select2.h"
+ "/usr/include/x86_64-linux-gnu/bits/setjmp.h"
+ "/usr/include/x86_64-linux-gnu/bits/sigset.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio2.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib.h"
+ "/usr/include/x86_64-linux-gnu/bits/string3.h"
+ "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h"
+ "/usr/include/x86_64-linux-gnu/bits/time.h"
+ "/usr/include/x86_64-linux-gnu/bits/timex.h"
+ "/usr/include/x86_64-linux-gnu/bits/types.h"
+ "/usr/include/x86_64-linux-gnu/bits/typesizes.h"
+ "/usr/include/x86_64-linux-gnu/bits/unistd.h"
+ "/usr/include/x86_64-linux-gnu/bits/waitflags.h"
+ "/usr/include/x86_64-linux-gnu/bits/waitstatus.h"
+ "/usr/include/x86_64-linux-gnu/bits/wchar.h"
+ "/usr/include/x86_64-linux-gnu/bits/wchar2.h"
+ "/usr/include/x86_64-linux-gnu/bits/wordsize.h"
+ "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h"
+ "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h"
+ "/usr/include/x86_64-linux-gnu/gnu/stubs.h"
+ "/usr/include/x86_64-linux-gnu/sys/cdefs.h"
+ "/usr/include/x86_64-linux-gnu/sys/select.h"
+ "/usr/include/x86_64-linux-gnu/sys/sysmacros.h"
+ "/usr/include/x86_64-linux-gnu/sys/types.h"
+ "/usr/include/xlocale.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/float.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h"
+ "/usr/local/cuda-6.5/include/CL/cl.h"
+ "/usr/local/cuda-6.5/include/CL/cl_ext.h"
+ "/usr/local/cuda-6.5/include/CL/cl_platform.h"
+ "/usr/local/cuda-6.5/include/builtin_types.h"
+ "/usr/local/cuda-6.5/include/channel_descriptor.h"
+ "/usr/local/cuda-6.5/include/common_functions.h"
+ "/usr/local/cuda-6.5/include/cuComplex.h"
+ "/usr/local/cuda-6.5/include/cublas_api.h"
+ "/usr/local/cuda-6.5/include/cublas_v2.h"
+ "/usr/local/cuda-6.5/include/cuda.h"
+ "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h"
+ "/usr/local/cuda-6.5/include/cuda_runtime.h"
+ "/usr/local/cuda-6.5/include/cuda_runtime_api.h"
+ "/usr/local/cuda-6.5/include/cuda_surface_types.h"
+ "/usr/local/cuda-6.5/include/cuda_texture_types.h"
+ "/usr/local/cuda-6.5/include/curand.h"
+ "/usr/local/cuda-6.5/include/device_functions.h"
+ "/usr/local/cuda-6.5/include/device_launch_parameters.h"
+ "/usr/local/cuda-6.5/include/device_types.h"
+ "/usr/local/cuda-6.5/include/driver_functions.h"
+ "/usr/local/cuda-6.5/include/driver_types.h"
+ "/usr/local/cuda-6.5/include/host_config.h"
+ "/usr/local/cuda-6.5/include/host_defines.h"
+ "/usr/local/cuda-6.5/include/math_functions.h"
+ "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h"
+ "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_13_double_functions.h"
+ "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_20_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_30_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_32_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_35_intrinsics.h"
+ "/usr/local/cuda-6.5/include/surface_functions.h"
+ "/usr/local/cuda-6.5/include/surface_indirect_functions.h"
+ "/usr/local/cuda-6.5/include/surface_types.h"
+ "/usr/local/cuda-6.5/include/texture_fetch_functions.h"
+ "/usr/local/cuda-6.5/include/texture_indirect_functions.h"
+ "/usr/local/cuda-6.5/include/texture_types.h"
+ "/usr/local/cuda-6.5/include/vector_functions.h"
+ "/usr/local/cuda-6.5/include/vector_types.h"
+ "/usr/local/include/boost/assert.hpp"
+ "/usr/local/include/boost/checked_delete.hpp"
+ "/usr/local/include/boost/config.hpp"
+ "/usr/local/include/boost/config/compiler/gcc.hpp"
+ "/usr/local/include/boost/config/compiler/nvcc.hpp"
+ "/usr/local/include/boost/config/no_tr1/memory.hpp"
+ "/usr/local/include/boost/config/no_tr1/utility.hpp"
+ "/usr/local/include/boost/config/platform/linux.hpp"
+ "/usr/local/include/boost/config/posix_features.hpp"
+ "/usr/local/include/boost/config/select_compiler_config.hpp"
+ "/usr/local/include/boost/config/select_platform_config.hpp"
+ "/usr/local/include/boost/config/select_stdlib_config.hpp"
+ "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp"
+ "/usr/local/include/boost/config/suffix.hpp"
+ "/usr/local/include/boost/config/user.hpp"
+ "/usr/local/include/boost/core/checked_delete.hpp"
+ "/usr/local/include/boost/core/demangle.hpp"
+ "/usr/local/include/boost/core/typeinfo.hpp"
+ "/usr/local/include/boost/current_function.hpp"
+ "/usr/local/include/boost/detail/sp_typeinfo.hpp"
+ "/usr/local/include/boost/detail/workaround.hpp"
+ "/usr/local/include/boost/exception/exception.hpp"
+ "/usr/local/include/boost/predef.h"
+ "/usr/local/include/boost/predef/architecture.h"
+ "/usr/local/include/boost/predef/architecture/alpha.h"
+ "/usr/local/include/boost/predef/architecture/arm.h"
+ "/usr/local/include/boost/predef/architecture/blackfin.h"
+ "/usr/local/include/boost/predef/architecture/convex.h"
+ "/usr/local/include/boost/predef/architecture/ia64.h"
+ "/usr/local/include/boost/predef/architecture/m68k.h"
+ "/usr/local/include/boost/predef/architecture/mips.h"
+ "/usr/local/include/boost/predef/architecture/parisc.h"
+ "/usr/local/include/boost/predef/architecture/ppc.h"
+ "/usr/local/include/boost/predef/architecture/pyramid.h"
+ "/usr/local/include/boost/predef/architecture/rs6k.h"
+ "/usr/local/include/boost/predef/architecture/sparc.h"
+ "/usr/local/include/boost/predef/architecture/superh.h"
+ "/usr/local/include/boost/predef/architecture/sys370.h"
+ "/usr/local/include/boost/predef/architecture/sys390.h"
+ "/usr/local/include/boost/predef/architecture/x86.h"
+ "/usr/local/include/boost/predef/architecture/x86/32.h"
+ "/usr/local/include/boost/predef/architecture/x86/64.h"
+ "/usr/local/include/boost/predef/architecture/z.h"
+ "/usr/local/include/boost/predef/compiler.h"
+ "/usr/local/include/boost/predef/compiler/borland.h"
+ "/usr/local/include/boost/predef/compiler/clang.h"
+ "/usr/local/include/boost/predef/compiler/comeau.h"
+ "/usr/local/include/boost/predef/compiler/compaq.h"
+ "/usr/local/include/boost/predef/compiler/diab.h"
+ "/usr/local/include/boost/predef/compiler/digitalmars.h"
+ "/usr/local/include/boost/predef/compiler/dignus.h"
+ "/usr/local/include/boost/predef/compiler/edg.h"
+ "/usr/local/include/boost/predef/compiler/ekopath.h"
+ "/usr/local/include/boost/predef/compiler/gcc.h"
+ "/usr/local/include/boost/predef/compiler/gcc_xml.h"
+ "/usr/local/include/boost/predef/compiler/greenhills.h"
+ "/usr/local/include/boost/predef/compiler/hp_acc.h"
+ "/usr/local/include/boost/predef/compiler/iar.h"
+ "/usr/local/include/boost/predef/compiler/ibm.h"
+ "/usr/local/include/boost/predef/compiler/intel.h"
+ "/usr/local/include/boost/predef/compiler/kai.h"
+ "/usr/local/include/boost/predef/compiler/llvm.h"
+ "/usr/local/include/boost/predef/compiler/metaware.h"
+ "/usr/local/include/boost/predef/compiler/metrowerks.h"
+ "/usr/local/include/boost/predef/compiler/microtec.h"
+ "/usr/local/include/boost/predef/compiler/mpw.h"
+ "/usr/local/include/boost/predef/compiler/palm.h"
+ "/usr/local/include/boost/predef/compiler/pgi.h"
+ "/usr/local/include/boost/predef/compiler/sgi_mipspro.h"
+ "/usr/local/include/boost/predef/compiler/sunpro.h"
+ "/usr/local/include/boost/predef/compiler/tendra.h"
+ "/usr/local/include/boost/predef/compiler/visualc.h"
+ "/usr/local/include/boost/predef/compiler/watcom.h"
+ "/usr/local/include/boost/predef/detail/_cassert.h"
+ "/usr/local/include/boost/predef/detail/_exception.h"
+ "/usr/local/include/boost/predef/detail/comp_detected.h"
+ "/usr/local/include/boost/predef/detail/os_detected.h"
+ "/usr/local/include/boost/predef/detail/test.h"
+ "/usr/local/include/boost/predef/language.h"
+ "/usr/local/include/boost/predef/language/objc.h"
+ "/usr/local/include/boost/predef/language/stdc.h"
+ "/usr/local/include/boost/predef/language/stdcpp.h"
+ "/usr/local/include/boost/predef/library.h"
+ "/usr/local/include/boost/predef/library/c.h"
+ "/usr/local/include/boost/predef/library/c/_prefix.h"
+ "/usr/local/include/boost/predef/library/c/gnu.h"
+ "/usr/local/include/boost/predef/library/c/uc.h"
+ "/usr/local/include/boost/predef/library/c/vms.h"
+ "/usr/local/include/boost/predef/library/c/zos.h"
+ "/usr/local/include/boost/predef/library/std.h"
+ "/usr/local/include/boost/predef/library/std/_prefix.h"
+ "/usr/local/include/boost/predef/library/std/cxx.h"
+ "/usr/local/include/boost/predef/library/std/dinkumware.h"
+ "/usr/local/include/boost/predef/library/std/libcomo.h"
+ "/usr/local/include/boost/predef/library/std/modena.h"
+ "/usr/local/include/boost/predef/library/std/msl.h"
+ "/usr/local/include/boost/predef/library/std/roguewave.h"
+ "/usr/local/include/boost/predef/library/std/sgi.h"
+ "/usr/local/include/boost/predef/library/std/stdcpp3.h"
+ "/usr/local/include/boost/predef/library/std/stlport.h"
+ "/usr/local/include/boost/predef/library/std/vacpp.h"
+ "/usr/local/include/boost/predef/make.h"
+ "/usr/local/include/boost/predef/os.h"
+ "/usr/local/include/boost/predef/os/aix.h"
+ "/usr/local/include/boost/predef/os/amigaos.h"
+ "/usr/local/include/boost/predef/os/android.h"
+ "/usr/local/include/boost/predef/os/beos.h"
+ "/usr/local/include/boost/predef/os/bsd.h"
+ "/usr/local/include/boost/predef/os/bsd/bsdi.h"
+ "/usr/local/include/boost/predef/os/bsd/dragonfly.h"
+ "/usr/local/include/boost/predef/os/bsd/free.h"
+ "/usr/local/include/boost/predef/os/bsd/net.h"
+ "/usr/local/include/boost/predef/os/bsd/open.h"
+ "/usr/local/include/boost/predef/os/cygwin.h"
+ "/usr/local/include/boost/predef/os/hpux.h"
+ "/usr/local/include/boost/predef/os/ios.h"
+ "/usr/local/include/boost/predef/os/irix.h"
+ "/usr/local/include/boost/predef/os/linux.h"
+ "/usr/local/include/boost/predef/os/macos.h"
+ "/usr/local/include/boost/predef/os/os400.h"
+ "/usr/local/include/boost/predef/os/qnxnto.h"
+ "/usr/local/include/boost/predef/os/solaris.h"
+ "/usr/local/include/boost/predef/os/unix.h"
+ "/usr/local/include/boost/predef/os/vms.h"
+ "/usr/local/include/boost/predef/os/windows.h"
+ "/usr/local/include/boost/predef/other.h"
+ "/usr/local/include/boost/predef/other/endian.h"
+ "/usr/local/include/boost/predef/platform.h"
+ "/usr/local/include/boost/predef/platform/mingw.h"
+ "/usr/local/include/boost/predef/platform/windows_desktop.h"
+ "/usr/local/include/boost/predef/platform/windows_phone.h"
+ "/usr/local/include/boost/predef/platform/windows_runtime.h"
+ "/usr/local/include/boost/predef/platform/windows_store.h"
+ "/usr/local/include/boost/predef/version_number.h"
+ "/usr/local/include/boost/scoped_ptr.hpp"
+ "/usr/local/include/boost/shared_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp"
+ "/usr/local/include/boost/smart_ptr/scoped_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/shared_ptr.hpp"
+ "/usr/local/include/boost/throw_exception.hpp"
+ "/usr/local/include/gflags/gflags.h"
+ "/usr/local/include/gflags/gflags_declare.h"
+ "/usr/local/include/glog/log_severity.h"
+ "/usr/local/include/glog/logging.h"
+ "/usr/local/include/glog/vlog_is_on.h"
+)
+
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
new file mode 100644
index 00000000..d7dfae88
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend
new file mode 100644
index 00000000..f9de6105
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend
@@ -0,0 +1,468 @@
+# Generated by: make2cmake.cmake
+SET(CUDA_NVCC_DEPEND
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/blob.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_transformer.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/filler.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/internal_thread.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer_factory.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/loss_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/neuron_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/syncedmem.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/db.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/vision_layers.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_layer.cu"
+ "/opt/clBLAS-private-april8/include/clBLAS-complex.h"
+ "/opt/clBLAS-private-april8/include/clBLAS.h"
+ "/usr/include/H5ACpublic.h"
+ "/usr/include/H5Apublic.h"
+ "/usr/include/H5Cpublic.h"
+ "/usr/include/H5Dpublic.h"
+ "/usr/include/H5Epubgen.h"
+ "/usr/include/H5Epublic.h"
+ "/usr/include/H5FDcore.h"
+ "/usr/include/H5FDdirect.h"
+ "/usr/include/H5FDfamily.h"
+ "/usr/include/H5FDlog.h"
+ "/usr/include/H5FDmpi.h"
+ "/usr/include/H5FDmpio.h"
+ "/usr/include/H5FDmpiposix.h"
+ "/usr/include/H5FDmulti.h"
+ "/usr/include/H5FDpublic.h"
+ "/usr/include/H5FDsec2.h"
+ "/usr/include/H5FDstdio.h"
+ "/usr/include/H5Fpublic.h"
+ "/usr/include/H5Gpublic.h"
+ "/usr/include/H5Ipublic.h"
+ "/usr/include/H5Lpublic.h"
+ "/usr/include/H5MMpublic.h"
+ "/usr/include/H5Opublic.h"
+ "/usr/include/H5Ppublic.h"
+ "/usr/include/H5Rpublic.h"
+ "/usr/include/H5Spublic.h"
+ "/usr/include/H5Tpublic.h"
+ "/usr/include/H5Zpublic.h"
+ "/usr/include/H5api_adpt.h"
+ "/usr/include/H5pubconf.h"
+ "/usr/include/H5public.h"
+ "/usr/include/H5version.h"
+ "/usr/include/_G_config.h"
+ "/usr/include/alloca.h"
+ "/usr/include/asm-generic/errno-base.h"
+ "/usr/include/asm-generic/errno.h"
+ "/usr/include/assert.h"
+ "/usr/include/atlas/cblas.h"
+ "/usr/include/c++/4.8/algorithm"
+ "/usr/include/c++/4.8/backward/auto_ptr.h"
+ "/usr/include/c++/4.8/backward/binders.h"
+ "/usr/include/c++/4.8/bits/algorithmfwd.h"
+ "/usr/include/c++/4.8/bits/allocator.h"
+ "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h"
+ "/usr/include/c++/4.8/bits/basic_ios.h"
+ "/usr/include/c++/4.8/bits/basic_ios.tcc"
+ "/usr/include/c++/4.8/bits/basic_string.h"
+ "/usr/include/c++/4.8/bits/basic_string.tcc"
+ "/usr/include/c++/4.8/bits/char_traits.h"
+ "/usr/include/c++/4.8/bits/codecvt.h"
+ "/usr/include/c++/4.8/bits/concept_check.h"
+ "/usr/include/c++/4.8/bits/cpp_type_traits.h"
+ "/usr/include/c++/4.8/bits/cxxabi_forced.h"
+ "/usr/include/c++/4.8/bits/exception_defines.h"
+ "/usr/include/c++/4.8/bits/fstream.tcc"
+ "/usr/include/c++/4.8/bits/functexcept.h"
+ "/usr/include/c++/4.8/bits/ios_base.h"
+ "/usr/include/c++/4.8/bits/istream.tcc"
+ "/usr/include/c++/4.8/bits/locale_classes.h"
+ "/usr/include/c++/4.8/bits/locale_classes.tcc"
+ "/usr/include/c++/4.8/bits/locale_facets.h"
+ "/usr/include/c++/4.8/bits/locale_facets.tcc"
+ "/usr/include/c++/4.8/bits/localefwd.h"
+ "/usr/include/c++/4.8/bits/memoryfwd.h"
+ "/usr/include/c++/4.8/bits/move.h"
+ "/usr/include/c++/4.8/bits/ostream.tcc"
+ "/usr/include/c++/4.8/bits/ostream_insert.h"
+ "/usr/include/c++/4.8/bits/postypes.h"
+ "/usr/include/c++/4.8/bits/range_access.h"
+ "/usr/include/c++/4.8/bits/sstream.tcc"
+ "/usr/include/c++/4.8/bits/stl_algo.h"
+ "/usr/include/c++/4.8/bits/stl_algobase.h"
+ "/usr/include/c++/4.8/bits/stl_bvector.h"
+ "/usr/include/c++/4.8/bits/stl_construct.h"
+ "/usr/include/c++/4.8/bits/stl_function.h"
+ "/usr/include/c++/4.8/bits/stl_heap.h"
+ "/usr/include/c++/4.8/bits/stl_iterator.h"
+ "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h"
+ "/usr/include/c++/4.8/bits/stl_iterator_base_types.h"
+ "/usr/include/c++/4.8/bits/stl_map.h"
+ "/usr/include/c++/4.8/bits/stl_multimap.h"
+ "/usr/include/c++/4.8/bits/stl_multiset.h"
+ "/usr/include/c++/4.8/bits/stl_pair.h"
+ "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h"
+ "/usr/include/c++/4.8/bits/stl_relops.h"
+ "/usr/include/c++/4.8/bits/stl_set.h"
+ "/usr/include/c++/4.8/bits/stl_tempbuf.h"
+ "/usr/include/c++/4.8/bits/stl_tree.h"
+ "/usr/include/c++/4.8/bits/stl_uninitialized.h"
+ "/usr/include/c++/4.8/bits/stl_vector.h"
+ "/usr/include/c++/4.8/bits/stream_iterator.h"
+ "/usr/include/c++/4.8/bits/streambuf.tcc"
+ "/usr/include/c++/4.8/bits/streambuf_iterator.h"
+ "/usr/include/c++/4.8/bits/stringfwd.h"
+ "/usr/include/c++/4.8/bits/vector.tcc"
+ "/usr/include/c++/4.8/cctype"
+ "/usr/include/c++/4.8/climits"
+ "/usr/include/c++/4.8/clocale"
+ "/usr/include/c++/4.8/cmath"
+ "/usr/include/c++/4.8/cstddef"
+ "/usr/include/c++/4.8/cstdio"
+ "/usr/include/c++/4.8/cstdlib"
+ "/usr/include/c++/4.8/cwchar"
+ "/usr/include/c++/4.8/cwctype"
+ "/usr/include/c++/4.8/cxxabi.h"
+ "/usr/include/c++/4.8/debug/debug.h"
+ "/usr/include/c++/4.8/exception"
+ "/usr/include/c++/4.8/ext/alloc_traits.h"
+ "/usr/include/c++/4.8/ext/atomicity.h"
+ "/usr/include/c++/4.8/ext/new_allocator.h"
+ "/usr/include/c++/4.8/ext/numeric_traits.h"
+ "/usr/include/c++/4.8/ext/type_traits.h"
+ "/usr/include/c++/4.8/fstream"
+ "/usr/include/c++/4.8/functional"
+ "/usr/include/c++/4.8/ios"
+ "/usr/include/c++/4.8/iosfwd"
+ "/usr/include/c++/4.8/iostream"
+ "/usr/include/c++/4.8/istream"
+ "/usr/include/c++/4.8/iterator"
+ "/usr/include/c++/4.8/map"
+ "/usr/include/c++/4.8/memory"
+ "/usr/include/c++/4.8/new"
+ "/usr/include/c++/4.8/ostream"
+ "/usr/include/c++/4.8/set"
+ "/usr/include/c++/4.8/sstream"
+ "/usr/include/c++/4.8/streambuf"
+ "/usr/include/c++/4.8/string"
+ "/usr/include/c++/4.8/typeinfo"
+ "/usr/include/c++/4.8/utility"
+ "/usr/include/c++/4.8/vector"
+ "/usr/include/ctype.h"
+ "/usr/include/endian.h"
+ "/usr/include/errno.h"
+ "/usr/include/features.h"
+ "/usr/include/getopt.h"
+ "/usr/include/google/protobuf/descriptor.h"
+ "/usr/include/google/protobuf/extension_set.h"
+ "/usr/include/google/protobuf/generated_enum_reflection.h"
+ "/usr/include/google/protobuf/generated_message_util.h"
+ "/usr/include/google/protobuf/message.h"
+ "/usr/include/google/protobuf/message_lite.h"
+ "/usr/include/google/protobuf/repeated_field.h"
+ "/usr/include/google/protobuf/stubs/common.h"
+ "/usr/include/google/protobuf/stubs/template_util.h"
+ "/usr/include/google/protobuf/stubs/type_traits.h"
+ "/usr/include/google/protobuf/unknown_field_set.h"
+ "/usr/include/hdf5.h"
+ "/usr/include/inttypes.h"
+ "/usr/include/libio.h"
+ "/usr/include/limits.h"
+ "/usr/include/linux/errno.h"
+ "/usr/include/linux/limits.h"
+ "/usr/include/locale.h"
+ "/usr/include/math.h"
+ "/usr/include/pthread.h"
+ "/usr/include/sched.h"
+ "/usr/include/stdc-predef.h"
+ "/usr/include/stdint.h"
+ "/usr/include/stdio.h"
+ "/usr/include/stdlib.h"
+ "/usr/include/string.h"
+ "/usr/include/time.h"
+ "/usr/include/unistd.h"
+ "/usr/include/wchar.h"
+ "/usr/include/wctype.h"
+ "/usr/include/x86_64-linux-gnu/asm/errno.h"
+ "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h"
+ "/usr/include/x86_64-linux-gnu/bits/byteswap.h"
+ "/usr/include/x86_64-linux-gnu/bits/confname.h"
+ "/usr/include/x86_64-linux-gnu/bits/endian.h"
+ "/usr/include/x86_64-linux-gnu/bits/environments.h"
+ "/usr/include/x86_64-linux-gnu/bits/errno.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_val.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_valf.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_vall.h"
+ "/usr/include/x86_64-linux-gnu/bits/inf.h"
+ "/usr/include/x86_64-linux-gnu/bits/local_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/locale.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathcalls.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathdef.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathinline.h"
+ "/usr/include/x86_64-linux-gnu/bits/nan.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix_opt.h"
+ "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h"
+ "/usr/include/x86_64-linux-gnu/bits/sched.h"
+ "/usr/include/x86_64-linux-gnu/bits/select.h"
+ "/usr/include/x86_64-linux-gnu/bits/select2.h"
+ "/usr/include/x86_64-linux-gnu/bits/setjmp.h"
+ "/usr/include/x86_64-linux-gnu/bits/sigset.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio2.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib.h"
+ "/usr/include/x86_64-linux-gnu/bits/string3.h"
+ "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h"
+ "/usr/include/x86_64-linux-gnu/bits/time.h"
+ "/usr/include/x86_64-linux-gnu/bits/timex.h"
+ "/usr/include/x86_64-linux-gnu/bits/types.h"
+ "/usr/include/x86_64-linux-gnu/bits/typesizes.h"
+ "/usr/include/x86_64-linux-gnu/bits/unistd.h"
+ "/usr/include/x86_64-linux-gnu/bits/waitflags.h"
+ "/usr/include/x86_64-linux-gnu/bits/waitstatus.h"
+ "/usr/include/x86_64-linux-gnu/bits/wchar.h"
+ "/usr/include/x86_64-linux-gnu/bits/wchar2.h"
+ "/usr/include/x86_64-linux-gnu/bits/wordsize.h"
+ "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h"
+ "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h"
+ "/usr/include/x86_64-linux-gnu/gnu/stubs.h"
+ "/usr/include/x86_64-linux-gnu/sys/cdefs.h"
+ "/usr/include/x86_64-linux-gnu/sys/select.h"
+ "/usr/include/x86_64-linux-gnu/sys/sysmacros.h"
+ "/usr/include/x86_64-linux-gnu/sys/types.h"
+ "/usr/include/xlocale.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h"
+ "/usr/local/cuda-6.5/include/CL/cl.h"
+ "/usr/local/cuda-6.5/include/CL/cl_ext.h"
+ "/usr/local/cuda-6.5/include/CL/cl_platform.h"
+ "/usr/local/cuda-6.5/include/builtin_types.h"
+ "/usr/local/cuda-6.5/include/channel_descriptor.h"
+ "/usr/local/cuda-6.5/include/common_functions.h"
+ "/usr/local/cuda-6.5/include/cuComplex.h"
+ "/usr/local/cuda-6.5/include/cublas_api.h"
+ "/usr/local/cuda-6.5/include/cublas_v2.h"
+ "/usr/local/cuda-6.5/include/cuda.h"
+ "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h"
+ "/usr/local/cuda-6.5/include/cuda_runtime.h"
+ "/usr/local/cuda-6.5/include/cuda_runtime_api.h"
+ "/usr/local/cuda-6.5/include/cuda_surface_types.h"
+ "/usr/local/cuda-6.5/include/cuda_texture_types.h"
+ "/usr/local/cuda-6.5/include/curand.h"
+ "/usr/local/cuda-6.5/include/device_functions.h"
+ "/usr/local/cuda-6.5/include/device_launch_parameters.h"
+ "/usr/local/cuda-6.5/include/device_types.h"
+ "/usr/local/cuda-6.5/include/driver_functions.h"
+ "/usr/local/cuda-6.5/include/driver_types.h"
+ "/usr/local/cuda-6.5/include/host_config.h"
+ "/usr/local/cuda-6.5/include/host_defines.h"
+ "/usr/local/cuda-6.5/include/math_functions.h"
+ "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h"
+ "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_13_double_functions.h"
+ "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_20_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_30_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_32_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_35_intrinsics.h"
+ "/usr/local/cuda-6.5/include/surface_functions.h"
+ "/usr/local/cuda-6.5/include/surface_indirect_functions.h"
+ "/usr/local/cuda-6.5/include/surface_types.h"
+ "/usr/local/cuda-6.5/include/texture_fetch_functions.h"
+ "/usr/local/cuda-6.5/include/texture_indirect_functions.h"
+ "/usr/local/cuda-6.5/include/texture_types.h"
+ "/usr/local/cuda-6.5/include/vector_functions.h"
+ "/usr/local/cuda-6.5/include/vector_types.h"
+ "/usr/local/include/boost/assert.hpp"
+ "/usr/local/include/boost/checked_delete.hpp"
+ "/usr/local/include/boost/config.hpp"
+ "/usr/local/include/boost/config/compiler/gcc.hpp"
+ "/usr/local/include/boost/config/compiler/nvcc.hpp"
+ "/usr/local/include/boost/config/no_tr1/memory.hpp"
+ "/usr/local/include/boost/config/no_tr1/utility.hpp"
+ "/usr/local/include/boost/config/platform/linux.hpp"
+ "/usr/local/include/boost/config/posix_features.hpp"
+ "/usr/local/include/boost/config/select_compiler_config.hpp"
+ "/usr/local/include/boost/config/select_platform_config.hpp"
+ "/usr/local/include/boost/config/select_stdlib_config.hpp"
+ "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp"
+ "/usr/local/include/boost/config/suffix.hpp"
+ "/usr/local/include/boost/config/user.hpp"
+ "/usr/local/include/boost/core/checked_delete.hpp"
+ "/usr/local/include/boost/core/demangle.hpp"
+ "/usr/local/include/boost/core/typeinfo.hpp"
+ "/usr/local/include/boost/current_function.hpp"
+ "/usr/local/include/boost/detail/sp_typeinfo.hpp"
+ "/usr/local/include/boost/detail/workaround.hpp"
+ "/usr/local/include/boost/exception/exception.hpp"
+ "/usr/local/include/boost/predef.h"
+ "/usr/local/include/boost/predef/architecture.h"
+ "/usr/local/include/boost/predef/architecture/alpha.h"
+ "/usr/local/include/boost/predef/architecture/arm.h"
+ "/usr/local/include/boost/predef/architecture/blackfin.h"
+ "/usr/local/include/boost/predef/architecture/convex.h"
+ "/usr/local/include/boost/predef/architecture/ia64.h"
+ "/usr/local/include/boost/predef/architecture/m68k.h"
+ "/usr/local/include/boost/predef/architecture/mips.h"
+ "/usr/local/include/boost/predef/architecture/parisc.h"
+ "/usr/local/include/boost/predef/architecture/ppc.h"
+ "/usr/local/include/boost/predef/architecture/pyramid.h"
+ "/usr/local/include/boost/predef/architecture/rs6k.h"
+ "/usr/local/include/boost/predef/architecture/sparc.h"
+ "/usr/local/include/boost/predef/architecture/superh.h"
+ "/usr/local/include/boost/predef/architecture/sys370.h"
+ "/usr/local/include/boost/predef/architecture/sys390.h"
+ "/usr/local/include/boost/predef/architecture/x86.h"
+ "/usr/local/include/boost/predef/architecture/x86/32.h"
+ "/usr/local/include/boost/predef/architecture/x86/64.h"
+ "/usr/local/include/boost/predef/architecture/z.h"
+ "/usr/local/include/boost/predef/compiler.h"
+ "/usr/local/include/boost/predef/compiler/borland.h"
+ "/usr/local/include/boost/predef/compiler/clang.h"
+ "/usr/local/include/boost/predef/compiler/comeau.h"
+ "/usr/local/include/boost/predef/compiler/compaq.h"
+ "/usr/local/include/boost/predef/compiler/diab.h"
+ "/usr/local/include/boost/predef/compiler/digitalmars.h"
+ "/usr/local/include/boost/predef/compiler/dignus.h"
+ "/usr/local/include/boost/predef/compiler/edg.h"
+ "/usr/local/include/boost/predef/compiler/ekopath.h"
+ "/usr/local/include/boost/predef/compiler/gcc.h"
+ "/usr/local/include/boost/predef/compiler/gcc_xml.h"
+ "/usr/local/include/boost/predef/compiler/greenhills.h"
+ "/usr/local/include/boost/predef/compiler/hp_acc.h"
+ "/usr/local/include/boost/predef/compiler/iar.h"
+ "/usr/local/include/boost/predef/compiler/ibm.h"
+ "/usr/local/include/boost/predef/compiler/intel.h"
+ "/usr/local/include/boost/predef/compiler/kai.h"
+ "/usr/local/include/boost/predef/compiler/llvm.h"
+ "/usr/local/include/boost/predef/compiler/metaware.h"
+ "/usr/local/include/boost/predef/compiler/metrowerks.h"
+ "/usr/local/include/boost/predef/compiler/microtec.h"
+ "/usr/local/include/boost/predef/compiler/mpw.h"
+ "/usr/local/include/boost/predef/compiler/palm.h"
+ "/usr/local/include/boost/predef/compiler/pgi.h"
+ "/usr/local/include/boost/predef/compiler/sgi_mipspro.h"
+ "/usr/local/include/boost/predef/compiler/sunpro.h"
+ "/usr/local/include/boost/predef/compiler/tendra.h"
+ "/usr/local/include/boost/predef/compiler/visualc.h"
+ "/usr/local/include/boost/predef/compiler/watcom.h"
+ "/usr/local/include/boost/predef/detail/_cassert.h"
+ "/usr/local/include/boost/predef/detail/_exception.h"
+ "/usr/local/include/boost/predef/detail/comp_detected.h"
+ "/usr/local/include/boost/predef/detail/os_detected.h"
+ "/usr/local/include/boost/predef/detail/test.h"
+ "/usr/local/include/boost/predef/language.h"
+ "/usr/local/include/boost/predef/language/objc.h"
+ "/usr/local/include/boost/predef/language/stdc.h"
+ "/usr/local/include/boost/predef/language/stdcpp.h"
+ "/usr/local/include/boost/predef/library.h"
+ "/usr/local/include/boost/predef/library/c.h"
+ "/usr/local/include/boost/predef/library/c/_prefix.h"
+ "/usr/local/include/boost/predef/library/c/gnu.h"
+ "/usr/local/include/boost/predef/library/c/uc.h"
+ "/usr/local/include/boost/predef/library/c/vms.h"
+ "/usr/local/include/boost/predef/library/c/zos.h"
+ "/usr/local/include/boost/predef/library/std.h"
+ "/usr/local/include/boost/predef/library/std/_prefix.h"
+ "/usr/local/include/boost/predef/library/std/cxx.h"
+ "/usr/local/include/boost/predef/library/std/dinkumware.h"
+ "/usr/local/include/boost/predef/library/std/libcomo.h"
+ "/usr/local/include/boost/predef/library/std/modena.h"
+ "/usr/local/include/boost/predef/library/std/msl.h"
+ "/usr/local/include/boost/predef/library/std/roguewave.h"
+ "/usr/local/include/boost/predef/library/std/sgi.h"
+ "/usr/local/include/boost/predef/library/std/stdcpp3.h"
+ "/usr/local/include/boost/predef/library/std/stlport.h"
+ "/usr/local/include/boost/predef/library/std/vacpp.h"
+ "/usr/local/include/boost/predef/make.h"
+ "/usr/local/include/boost/predef/os.h"
+ "/usr/local/include/boost/predef/os/aix.h"
+ "/usr/local/include/boost/predef/os/amigaos.h"
+ "/usr/local/include/boost/predef/os/android.h"
+ "/usr/local/include/boost/predef/os/beos.h"
+ "/usr/local/include/boost/predef/os/bsd.h"
+ "/usr/local/include/boost/predef/os/bsd/bsdi.h"
+ "/usr/local/include/boost/predef/os/bsd/dragonfly.h"
+ "/usr/local/include/boost/predef/os/bsd/free.h"
+ "/usr/local/include/boost/predef/os/bsd/net.h"
+ "/usr/local/include/boost/predef/os/bsd/open.h"
+ "/usr/local/include/boost/predef/os/cygwin.h"
+ "/usr/local/include/boost/predef/os/hpux.h"
+ "/usr/local/include/boost/predef/os/ios.h"
+ "/usr/local/include/boost/predef/os/irix.h"
+ "/usr/local/include/boost/predef/os/linux.h"
+ "/usr/local/include/boost/predef/os/macos.h"
+ "/usr/local/include/boost/predef/os/os400.h"
+ "/usr/local/include/boost/predef/os/qnxnto.h"
+ "/usr/local/include/boost/predef/os/solaris.h"
+ "/usr/local/include/boost/predef/os/unix.h"
+ "/usr/local/include/boost/predef/os/vms.h"
+ "/usr/local/include/boost/predef/os/windows.h"
+ "/usr/local/include/boost/predef/other.h"
+ "/usr/local/include/boost/predef/other/endian.h"
+ "/usr/local/include/boost/predef/platform.h"
+ "/usr/local/include/boost/predef/platform/mingw.h"
+ "/usr/local/include/boost/predef/platform/windows_desktop.h"
+ "/usr/local/include/boost/predef/platform/windows_phone.h"
+ "/usr/local/include/boost/predef/platform/windows_runtime.h"
+ "/usr/local/include/boost/predef/platform/windows_store.h"
+ "/usr/local/include/boost/predef/version_number.h"
+ "/usr/local/include/boost/scoped_ptr.hpp"
+ "/usr/local/include/boost/shared_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp"
+ "/usr/local/include/boost/smart_ptr/scoped_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/shared_ptr.hpp"
+ "/usr/local/include/boost/throw_exception.hpp"
+ "/usr/local/include/gflags/gflags.h"
+ "/usr/local/include/gflags/gflags_declare.h"
+ "/usr/local/include/glog/log_severity.h"
+ "/usr/local/include/glog/logging.h"
+ "/usr/local/include/glog/vlog_is_on.h"
+)
+
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
new file mode 100644
index 00000000..dd2453ae
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/silence_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
new file mode 100644
index 00000000..990e0622
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/slice_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
new file mode 100644
index 00000000..ebf29ea2
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/softmax_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
new file mode 100644
index 00000000..6260b6e0
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/softmax_loss_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
new file mode 100644
index 00000000..ad49afe7
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/split_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
new file mode 100644
index 00000000..71fc8fdb
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/tanh_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
new file mode 100644
index 00000000..4e18059a
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/threshold_layer.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
new file mode 100644
index 00000000..8de5e27c
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend
new file mode 100644
index 00000000..36db02fe
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend
@@ -0,0 +1,404 @@
+# Generated by: make2cmake.cmake
+SET(CUDA_NVCC_DEPEND
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cu"
+ "/opt/clBLAS-private-april8/include/clBLAS-complex.h"
+ "/opt/clBLAS-private-april8/include/clBLAS.h"
+ "/usr/include/_G_config.h"
+ "/usr/include/alloca.h"
+ "/usr/include/asm-generic/errno-base.h"
+ "/usr/include/asm-generic/errno.h"
+ "/usr/include/assert.h"
+ "/usr/include/c++/4.8/algorithm"
+ "/usr/include/c++/4.8/backward/auto_ptr.h"
+ "/usr/include/c++/4.8/backward/binders.h"
+ "/usr/include/c++/4.8/bits/algorithmfwd.h"
+ "/usr/include/c++/4.8/bits/allocator.h"
+ "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h"
+ "/usr/include/c++/4.8/bits/basic_ios.h"
+ "/usr/include/c++/4.8/bits/basic_ios.tcc"
+ "/usr/include/c++/4.8/bits/basic_string.h"
+ "/usr/include/c++/4.8/bits/basic_string.tcc"
+ "/usr/include/c++/4.8/bits/char_traits.h"
+ "/usr/include/c++/4.8/bits/codecvt.h"
+ "/usr/include/c++/4.8/bits/concept_check.h"
+ "/usr/include/c++/4.8/bits/cpp_type_traits.h"
+ "/usr/include/c++/4.8/bits/cxxabi_forced.h"
+ "/usr/include/c++/4.8/bits/exception_defines.h"
+ "/usr/include/c++/4.8/bits/fstream.tcc"
+ "/usr/include/c++/4.8/bits/functexcept.h"
+ "/usr/include/c++/4.8/bits/ios_base.h"
+ "/usr/include/c++/4.8/bits/istream.tcc"
+ "/usr/include/c++/4.8/bits/locale_classes.h"
+ "/usr/include/c++/4.8/bits/locale_classes.tcc"
+ "/usr/include/c++/4.8/bits/locale_facets.h"
+ "/usr/include/c++/4.8/bits/locale_facets.tcc"
+ "/usr/include/c++/4.8/bits/localefwd.h"
+ "/usr/include/c++/4.8/bits/memoryfwd.h"
+ "/usr/include/c++/4.8/bits/move.h"
+ "/usr/include/c++/4.8/bits/ostream.tcc"
+ "/usr/include/c++/4.8/bits/ostream_insert.h"
+ "/usr/include/c++/4.8/bits/postypes.h"
+ "/usr/include/c++/4.8/bits/range_access.h"
+ "/usr/include/c++/4.8/bits/sstream.tcc"
+ "/usr/include/c++/4.8/bits/stl_algo.h"
+ "/usr/include/c++/4.8/bits/stl_algobase.h"
+ "/usr/include/c++/4.8/bits/stl_bvector.h"
+ "/usr/include/c++/4.8/bits/stl_construct.h"
+ "/usr/include/c++/4.8/bits/stl_function.h"
+ "/usr/include/c++/4.8/bits/stl_heap.h"
+ "/usr/include/c++/4.8/bits/stl_iterator.h"
+ "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h"
+ "/usr/include/c++/4.8/bits/stl_iterator_base_types.h"
+ "/usr/include/c++/4.8/bits/stl_map.h"
+ "/usr/include/c++/4.8/bits/stl_multimap.h"
+ "/usr/include/c++/4.8/bits/stl_multiset.h"
+ "/usr/include/c++/4.8/bits/stl_pair.h"
+ "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h"
+ "/usr/include/c++/4.8/bits/stl_relops.h"
+ "/usr/include/c++/4.8/bits/stl_set.h"
+ "/usr/include/c++/4.8/bits/stl_tempbuf.h"
+ "/usr/include/c++/4.8/bits/stl_tree.h"
+ "/usr/include/c++/4.8/bits/stl_uninitialized.h"
+ "/usr/include/c++/4.8/bits/stl_vector.h"
+ "/usr/include/c++/4.8/bits/streambuf.tcc"
+ "/usr/include/c++/4.8/bits/streambuf_iterator.h"
+ "/usr/include/c++/4.8/bits/stringfwd.h"
+ "/usr/include/c++/4.8/bits/vector.tcc"
+ "/usr/include/c++/4.8/cctype"
+ "/usr/include/c++/4.8/climits"
+ "/usr/include/c++/4.8/clocale"
+ "/usr/include/c++/4.8/cmath"
+ "/usr/include/c++/4.8/cstddef"
+ "/usr/include/c++/4.8/cstdio"
+ "/usr/include/c++/4.8/cstdlib"
+ "/usr/include/c++/4.8/cstring"
+ "/usr/include/c++/4.8/cwchar"
+ "/usr/include/c++/4.8/cwctype"
+ "/usr/include/c++/4.8/cxxabi.h"
+ "/usr/include/c++/4.8/debug/debug.h"
+ "/usr/include/c++/4.8/exception"
+ "/usr/include/c++/4.8/ext/alloc_traits.h"
+ "/usr/include/c++/4.8/ext/atomicity.h"
+ "/usr/include/c++/4.8/ext/new_allocator.h"
+ "/usr/include/c++/4.8/ext/numeric_traits.h"
+ "/usr/include/c++/4.8/ext/type_traits.h"
+ "/usr/include/c++/4.8/fstream"
+ "/usr/include/c++/4.8/functional"
+ "/usr/include/c++/4.8/ios"
+ "/usr/include/c++/4.8/iosfwd"
+ "/usr/include/c++/4.8/iostream"
+ "/usr/include/c++/4.8/istream"
+ "/usr/include/c++/4.8/map"
+ "/usr/include/c++/4.8/memory"
+ "/usr/include/c++/4.8/new"
+ "/usr/include/c++/4.8/ostream"
+ "/usr/include/c++/4.8/set"
+ "/usr/include/c++/4.8/sstream"
+ "/usr/include/c++/4.8/streambuf"
+ "/usr/include/c++/4.8/string"
+ "/usr/include/c++/4.8/typeinfo"
+ "/usr/include/c++/4.8/utility"
+ "/usr/include/c++/4.8/vector"
+ "/usr/include/ctype.h"
+ "/usr/include/endian.h"
+ "/usr/include/errno.h"
+ "/usr/include/features.h"
+ "/usr/include/getopt.h"
+ "/usr/include/inttypes.h"
+ "/usr/include/libio.h"
+ "/usr/include/limits.h"
+ "/usr/include/linux/errno.h"
+ "/usr/include/linux/limits.h"
+ "/usr/include/locale.h"
+ "/usr/include/math.h"
+ "/usr/include/pthread.h"
+ "/usr/include/sched.h"
+ "/usr/include/stdc-predef.h"
+ "/usr/include/stdint.h"
+ "/usr/include/stdio.h"
+ "/usr/include/stdlib.h"
+ "/usr/include/string.h"
+ "/usr/include/time.h"
+ "/usr/include/unistd.h"
+ "/usr/include/wchar.h"
+ "/usr/include/wctype.h"
+ "/usr/include/x86_64-linux-gnu/asm/errno.h"
+ "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h"
+ "/usr/include/x86_64-linux-gnu/bits/byteswap.h"
+ "/usr/include/x86_64-linux-gnu/bits/confname.h"
+ "/usr/include/x86_64-linux-gnu/bits/endian.h"
+ "/usr/include/x86_64-linux-gnu/bits/environments.h"
+ "/usr/include/x86_64-linux-gnu/bits/errno.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_val.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_valf.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_vall.h"
+ "/usr/include/x86_64-linux-gnu/bits/inf.h"
+ "/usr/include/x86_64-linux-gnu/bits/local_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/locale.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathcalls.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathdef.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathinline.h"
+ "/usr/include/x86_64-linux-gnu/bits/nan.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix_opt.h"
+ "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h"
+ "/usr/include/x86_64-linux-gnu/bits/sched.h"
+ "/usr/include/x86_64-linux-gnu/bits/select.h"
+ "/usr/include/x86_64-linux-gnu/bits/select2.h"
+ "/usr/include/x86_64-linux-gnu/bits/setjmp.h"
+ "/usr/include/x86_64-linux-gnu/bits/sigset.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio2.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib.h"
+ "/usr/include/x86_64-linux-gnu/bits/string3.h"
+ "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h"
+ "/usr/include/x86_64-linux-gnu/bits/time.h"
+ "/usr/include/x86_64-linux-gnu/bits/timex.h"
+ "/usr/include/x86_64-linux-gnu/bits/types.h"
+ "/usr/include/x86_64-linux-gnu/bits/typesizes.h"
+ "/usr/include/x86_64-linux-gnu/bits/unistd.h"
+ "/usr/include/x86_64-linux-gnu/bits/waitflags.h"
+ "/usr/include/x86_64-linux-gnu/bits/waitstatus.h"
+ "/usr/include/x86_64-linux-gnu/bits/wchar.h"
+ "/usr/include/x86_64-linux-gnu/bits/wchar2.h"
+ "/usr/include/x86_64-linux-gnu/bits/wordsize.h"
+ "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h"
+ "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h"
+ "/usr/include/x86_64-linux-gnu/gnu/stubs.h"
+ "/usr/include/x86_64-linux-gnu/sys/cdefs.h"
+ "/usr/include/x86_64-linux-gnu/sys/select.h"
+ "/usr/include/x86_64-linux-gnu/sys/sysmacros.h"
+ "/usr/include/x86_64-linux-gnu/sys/types.h"
+ "/usr/include/xlocale.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h"
+ "/usr/local/cuda-6.5/include/CL/cl.h"
+ "/usr/local/cuda-6.5/include/CL/cl_ext.h"
+ "/usr/local/cuda-6.5/include/CL/cl_platform.h"
+ "/usr/local/cuda-6.5/include/builtin_types.h"
+ "/usr/local/cuda-6.5/include/channel_descriptor.h"
+ "/usr/local/cuda-6.5/include/common_functions.h"
+ "/usr/local/cuda-6.5/include/cuComplex.h"
+ "/usr/local/cuda-6.5/include/cublas_api.h"
+ "/usr/local/cuda-6.5/include/cublas_v2.h"
+ "/usr/local/cuda-6.5/include/cuda.h"
+ "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h"
+ "/usr/local/cuda-6.5/include/cuda_runtime.h"
+ "/usr/local/cuda-6.5/include/cuda_runtime_api.h"
+ "/usr/local/cuda-6.5/include/cuda_surface_types.h"
+ "/usr/local/cuda-6.5/include/cuda_texture_types.h"
+ "/usr/local/cuda-6.5/include/curand.h"
+ "/usr/local/cuda-6.5/include/device_functions.h"
+ "/usr/local/cuda-6.5/include/device_launch_parameters.h"
+ "/usr/local/cuda-6.5/include/device_types.h"
+ "/usr/local/cuda-6.5/include/driver_functions.h"
+ "/usr/local/cuda-6.5/include/driver_types.h"
+ "/usr/local/cuda-6.5/include/host_config.h"
+ "/usr/local/cuda-6.5/include/host_defines.h"
+ "/usr/local/cuda-6.5/include/math_functions.h"
+ "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h"
+ "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_13_double_functions.h"
+ "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_20_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_30_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_32_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_35_intrinsics.h"
+ "/usr/local/cuda-6.5/include/surface_functions.h"
+ "/usr/local/cuda-6.5/include/surface_indirect_functions.h"
+ "/usr/local/cuda-6.5/include/surface_types.h"
+ "/usr/local/cuda-6.5/include/texture_fetch_functions.h"
+ "/usr/local/cuda-6.5/include/texture_indirect_functions.h"
+ "/usr/local/cuda-6.5/include/texture_types.h"
+ "/usr/local/cuda-6.5/include/vector_functions.h"
+ "/usr/local/cuda-6.5/include/vector_types.h"
+ "/usr/local/include/boost/assert.hpp"
+ "/usr/local/include/boost/checked_delete.hpp"
+ "/usr/local/include/boost/config.hpp"
+ "/usr/local/include/boost/config/compiler/gcc.hpp"
+ "/usr/local/include/boost/config/compiler/nvcc.hpp"
+ "/usr/local/include/boost/config/no_tr1/memory.hpp"
+ "/usr/local/include/boost/config/no_tr1/utility.hpp"
+ "/usr/local/include/boost/config/platform/linux.hpp"
+ "/usr/local/include/boost/config/posix_features.hpp"
+ "/usr/local/include/boost/config/select_compiler_config.hpp"
+ "/usr/local/include/boost/config/select_platform_config.hpp"
+ "/usr/local/include/boost/config/select_stdlib_config.hpp"
+ "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp"
+ "/usr/local/include/boost/config/suffix.hpp"
+ "/usr/local/include/boost/config/user.hpp"
+ "/usr/local/include/boost/core/checked_delete.hpp"
+ "/usr/local/include/boost/core/demangle.hpp"
+ "/usr/local/include/boost/core/typeinfo.hpp"
+ "/usr/local/include/boost/current_function.hpp"
+ "/usr/local/include/boost/detail/sp_typeinfo.hpp"
+ "/usr/local/include/boost/detail/workaround.hpp"
+ "/usr/local/include/boost/exception/exception.hpp"
+ "/usr/local/include/boost/predef.h"
+ "/usr/local/include/boost/predef/architecture.h"
+ "/usr/local/include/boost/predef/architecture/alpha.h"
+ "/usr/local/include/boost/predef/architecture/arm.h"
+ "/usr/local/include/boost/predef/architecture/blackfin.h"
+ "/usr/local/include/boost/predef/architecture/convex.h"
+ "/usr/local/include/boost/predef/architecture/ia64.h"
+ "/usr/local/include/boost/predef/architecture/m68k.h"
+ "/usr/local/include/boost/predef/architecture/mips.h"
+ "/usr/local/include/boost/predef/architecture/parisc.h"
+ "/usr/local/include/boost/predef/architecture/ppc.h"
+ "/usr/local/include/boost/predef/architecture/pyramid.h"
+ "/usr/local/include/boost/predef/architecture/rs6k.h"
+ "/usr/local/include/boost/predef/architecture/sparc.h"
+ "/usr/local/include/boost/predef/architecture/superh.h"
+ "/usr/local/include/boost/predef/architecture/sys370.h"
+ "/usr/local/include/boost/predef/architecture/sys390.h"
+ "/usr/local/include/boost/predef/architecture/x86.h"
+ "/usr/local/include/boost/predef/architecture/x86/32.h"
+ "/usr/local/include/boost/predef/architecture/x86/64.h"
+ "/usr/local/include/boost/predef/architecture/z.h"
+ "/usr/local/include/boost/predef/compiler.h"
+ "/usr/local/include/boost/predef/compiler/borland.h"
+ "/usr/local/include/boost/predef/compiler/clang.h"
+ "/usr/local/include/boost/predef/compiler/comeau.h"
+ "/usr/local/include/boost/predef/compiler/compaq.h"
+ "/usr/local/include/boost/predef/compiler/diab.h"
+ "/usr/local/include/boost/predef/compiler/digitalmars.h"
+ "/usr/local/include/boost/predef/compiler/dignus.h"
+ "/usr/local/include/boost/predef/compiler/edg.h"
+ "/usr/local/include/boost/predef/compiler/ekopath.h"
+ "/usr/local/include/boost/predef/compiler/gcc.h"
+ "/usr/local/include/boost/predef/compiler/gcc_xml.h"
+ "/usr/local/include/boost/predef/compiler/greenhills.h"
+ "/usr/local/include/boost/predef/compiler/hp_acc.h"
+ "/usr/local/include/boost/predef/compiler/iar.h"
+ "/usr/local/include/boost/predef/compiler/ibm.h"
+ "/usr/local/include/boost/predef/compiler/intel.h"
+ "/usr/local/include/boost/predef/compiler/kai.h"
+ "/usr/local/include/boost/predef/compiler/llvm.h"
+ "/usr/local/include/boost/predef/compiler/metaware.h"
+ "/usr/local/include/boost/predef/compiler/metrowerks.h"
+ "/usr/local/include/boost/predef/compiler/microtec.h"
+ "/usr/local/include/boost/predef/compiler/mpw.h"
+ "/usr/local/include/boost/predef/compiler/palm.h"
+ "/usr/local/include/boost/predef/compiler/pgi.h"
+ "/usr/local/include/boost/predef/compiler/sgi_mipspro.h"
+ "/usr/local/include/boost/predef/compiler/sunpro.h"
+ "/usr/local/include/boost/predef/compiler/tendra.h"
+ "/usr/local/include/boost/predef/compiler/visualc.h"
+ "/usr/local/include/boost/predef/compiler/watcom.h"
+ "/usr/local/include/boost/predef/detail/_cassert.h"
+ "/usr/local/include/boost/predef/detail/_exception.h"
+ "/usr/local/include/boost/predef/detail/comp_detected.h"
+ "/usr/local/include/boost/predef/detail/os_detected.h"
+ "/usr/local/include/boost/predef/detail/test.h"
+ "/usr/local/include/boost/predef/language.h"
+ "/usr/local/include/boost/predef/language/objc.h"
+ "/usr/local/include/boost/predef/language/stdc.h"
+ "/usr/local/include/boost/predef/language/stdcpp.h"
+ "/usr/local/include/boost/predef/library.h"
+ "/usr/local/include/boost/predef/library/c.h"
+ "/usr/local/include/boost/predef/library/c/_prefix.h"
+ "/usr/local/include/boost/predef/library/c/gnu.h"
+ "/usr/local/include/boost/predef/library/c/uc.h"
+ "/usr/local/include/boost/predef/library/c/vms.h"
+ "/usr/local/include/boost/predef/library/c/zos.h"
+ "/usr/local/include/boost/predef/library/std.h"
+ "/usr/local/include/boost/predef/library/std/_prefix.h"
+ "/usr/local/include/boost/predef/library/std/cxx.h"
+ "/usr/local/include/boost/predef/library/std/dinkumware.h"
+ "/usr/local/include/boost/predef/library/std/libcomo.h"
+ "/usr/local/include/boost/predef/library/std/modena.h"
+ "/usr/local/include/boost/predef/library/std/msl.h"
+ "/usr/local/include/boost/predef/library/std/roguewave.h"
+ "/usr/local/include/boost/predef/library/std/sgi.h"
+ "/usr/local/include/boost/predef/library/std/stdcpp3.h"
+ "/usr/local/include/boost/predef/library/std/stlport.h"
+ "/usr/local/include/boost/predef/library/std/vacpp.h"
+ "/usr/local/include/boost/predef/make.h"
+ "/usr/local/include/boost/predef/os.h"
+ "/usr/local/include/boost/predef/os/aix.h"
+ "/usr/local/include/boost/predef/os/amigaos.h"
+ "/usr/local/include/boost/predef/os/android.h"
+ "/usr/local/include/boost/predef/os/beos.h"
+ "/usr/local/include/boost/predef/os/bsd.h"
+ "/usr/local/include/boost/predef/os/bsd/bsdi.h"
+ "/usr/local/include/boost/predef/os/bsd/dragonfly.h"
+ "/usr/local/include/boost/predef/os/bsd/free.h"
+ "/usr/local/include/boost/predef/os/bsd/net.h"
+ "/usr/local/include/boost/predef/os/bsd/open.h"
+ "/usr/local/include/boost/predef/os/cygwin.h"
+ "/usr/local/include/boost/predef/os/hpux.h"
+ "/usr/local/include/boost/predef/os/ios.h"
+ "/usr/local/include/boost/predef/os/irix.h"
+ "/usr/local/include/boost/predef/os/linux.h"
+ "/usr/local/include/boost/predef/os/macos.h"
+ "/usr/local/include/boost/predef/os/os400.h"
+ "/usr/local/include/boost/predef/os/qnxnto.h"
+ "/usr/local/include/boost/predef/os/solaris.h"
+ "/usr/local/include/boost/predef/os/unix.h"
+ "/usr/local/include/boost/predef/os/vms.h"
+ "/usr/local/include/boost/predef/os/windows.h"
+ "/usr/local/include/boost/predef/other.h"
+ "/usr/local/include/boost/predef/other/endian.h"
+ "/usr/local/include/boost/predef/platform.h"
+ "/usr/local/include/boost/predef/platform/mingw.h"
+ "/usr/local/include/boost/predef/platform/windows_desktop.h"
+ "/usr/local/include/boost/predef/platform/windows_phone.h"
+ "/usr/local/include/boost/predef/platform/windows_runtime.h"
+ "/usr/local/include/boost/predef/platform/windows_store.h"
+ "/usr/local/include/boost/predef/version_number.h"
+ "/usr/local/include/boost/shared_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp"
+ "/usr/local/include/boost/smart_ptr/shared_ptr.hpp"
+ "/usr/local/include/boost/throw_exception.hpp"
+ "/usr/local/include/gflags/gflags.h"
+ "/usr/local/include/gflags/gflags_declare.h"
+ "/usr/local/include/glog/log_severity.h"
+ "/usr/local/include/glog/logging.h"
+ "/usr/local/include/glog/vlog_is_on.h"
+)
+
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
new file mode 100644
index 00000000..0bd0d4e9
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend
new file mode 100644
index 00000000..2dfb589a
--- /dev/null
+++ b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend
@@ -0,0 +1,744 @@
+# Generated by: make2cmake.cmake
+SET(CUDA_NVCC_DEPEND
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp"
+ "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cu"
+ "/opt/clBLAS-private-april8/include/clBLAS-complex.h"
+ "/opt/clBLAS-private-april8/include/clBLAS.h"
+ "/usr/include/_G_config.h"
+ "/usr/include/alloca.h"
+ "/usr/include/asm-generic/errno-base.h"
+ "/usr/include/asm-generic/errno.h"
+ "/usr/include/assert.h"
+ "/usr/include/atlas/cblas.h"
+ "/usr/include/c++/4.8/algorithm"
+ "/usr/include/c++/4.8/backward/auto_ptr.h"
+ "/usr/include/c++/4.8/backward/binders.h"
+ "/usr/include/c++/4.8/bits/algorithmfwd.h"
+ "/usr/include/c++/4.8/bits/allocator.h"
+ "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h"
+ "/usr/include/c++/4.8/bits/basic_ios.h"
+ "/usr/include/c++/4.8/bits/basic_ios.tcc"
+ "/usr/include/c++/4.8/bits/basic_string.h"
+ "/usr/include/c++/4.8/bits/basic_string.tcc"
+ "/usr/include/c++/4.8/bits/char_traits.h"
+ "/usr/include/c++/4.8/bits/codecvt.h"
+ "/usr/include/c++/4.8/bits/concept_check.h"
+ "/usr/include/c++/4.8/bits/cpp_type_traits.h"
+ "/usr/include/c++/4.8/bits/cxxabi_forced.h"
+ "/usr/include/c++/4.8/bits/exception_defines.h"
+ "/usr/include/c++/4.8/bits/fstream.tcc"
+ "/usr/include/c++/4.8/bits/functexcept.h"
+ "/usr/include/c++/4.8/bits/ios_base.h"
+ "/usr/include/c++/4.8/bits/istream.tcc"
+ "/usr/include/c++/4.8/bits/locale_classes.h"
+ "/usr/include/c++/4.8/bits/locale_classes.tcc"
+ "/usr/include/c++/4.8/bits/locale_facets.h"
+ "/usr/include/c++/4.8/bits/locale_facets.tcc"
+ "/usr/include/c++/4.8/bits/localefwd.h"
+ "/usr/include/c++/4.8/bits/memoryfwd.h"
+ "/usr/include/c++/4.8/bits/move.h"
+ "/usr/include/c++/4.8/bits/ostream.tcc"
+ "/usr/include/c++/4.8/bits/ostream_insert.h"
+ "/usr/include/c++/4.8/bits/postypes.h"
+ "/usr/include/c++/4.8/bits/range_access.h"
+ "/usr/include/c++/4.8/bits/sstream.tcc"
+ "/usr/include/c++/4.8/bits/stl_algo.h"
+ "/usr/include/c++/4.8/bits/stl_algobase.h"
+ "/usr/include/c++/4.8/bits/stl_bvector.h"
+ "/usr/include/c++/4.8/bits/stl_construct.h"
+ "/usr/include/c++/4.8/bits/stl_function.h"
+ "/usr/include/c++/4.8/bits/stl_heap.h"
+ "/usr/include/c++/4.8/bits/stl_iterator.h"
+ "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h"
+ "/usr/include/c++/4.8/bits/stl_iterator_base_types.h"
+ "/usr/include/c++/4.8/bits/stl_map.h"
+ "/usr/include/c++/4.8/bits/stl_multimap.h"
+ "/usr/include/c++/4.8/bits/stl_multiset.h"
+ "/usr/include/c++/4.8/bits/stl_pair.h"
+ "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h"
+ "/usr/include/c++/4.8/bits/stl_relops.h"
+ "/usr/include/c++/4.8/bits/stl_set.h"
+ "/usr/include/c++/4.8/bits/stl_tempbuf.h"
+ "/usr/include/c++/4.8/bits/stl_tree.h"
+ "/usr/include/c++/4.8/bits/stl_uninitialized.h"
+ "/usr/include/c++/4.8/bits/stl_vector.h"
+ "/usr/include/c++/4.8/bits/stream_iterator.h"
+ "/usr/include/c++/4.8/bits/streambuf.tcc"
+ "/usr/include/c++/4.8/bits/streambuf_iterator.h"
+ "/usr/include/c++/4.8/bits/stringfwd.h"
+ "/usr/include/c++/4.8/bits/vector.tcc"
+ "/usr/include/c++/4.8/cctype"
+ "/usr/include/c++/4.8/climits"
+ "/usr/include/c++/4.8/clocale"
+ "/usr/include/c++/4.8/cmath"
+ "/usr/include/c++/4.8/cstddef"
+ "/usr/include/c++/4.8/cstdio"
+ "/usr/include/c++/4.8/cstdlib"
+ "/usr/include/c++/4.8/cstring"
+ "/usr/include/c++/4.8/cwchar"
+ "/usr/include/c++/4.8/cwctype"
+ "/usr/include/c++/4.8/cxxabi.h"
+ "/usr/include/c++/4.8/debug/debug.h"
+ "/usr/include/c++/4.8/exception"
+ "/usr/include/c++/4.8/ext/alloc_traits.h"
+ "/usr/include/c++/4.8/ext/atomicity.h"
+ "/usr/include/c++/4.8/ext/new_allocator.h"
+ "/usr/include/c++/4.8/ext/numeric_traits.h"
+ "/usr/include/c++/4.8/ext/type_traits.h"
+ "/usr/include/c++/4.8/fstream"
+ "/usr/include/c++/4.8/functional"
+ "/usr/include/c++/4.8/ios"
+ "/usr/include/c++/4.8/iosfwd"
+ "/usr/include/c++/4.8/iostream"
+ "/usr/include/c++/4.8/istream"
+ "/usr/include/c++/4.8/iterator"
+ "/usr/include/c++/4.8/limits"
+ "/usr/include/c++/4.8/map"
+ "/usr/include/c++/4.8/memory"
+ "/usr/include/c++/4.8/new"
+ "/usr/include/c++/4.8/ostream"
+ "/usr/include/c++/4.8/set"
+ "/usr/include/c++/4.8/sstream"
+ "/usr/include/c++/4.8/stdexcept"
+ "/usr/include/c++/4.8/streambuf"
+ "/usr/include/c++/4.8/string"
+ "/usr/include/c++/4.8/typeinfo"
+ "/usr/include/c++/4.8/utility"
+ "/usr/include/c++/4.8/vector"
+ "/usr/include/ctype.h"
+ "/usr/include/endian.h"
+ "/usr/include/errno.h"
+ "/usr/include/features.h"
+ "/usr/include/getopt.h"
+ "/usr/include/inttypes.h"
+ "/usr/include/libio.h"
+ "/usr/include/limits.h"
+ "/usr/include/linux/errno.h"
+ "/usr/include/linux/limits.h"
+ "/usr/include/locale.h"
+ "/usr/include/math.h"
+ "/usr/include/pthread.h"
+ "/usr/include/sched.h"
+ "/usr/include/stdc-predef.h"
+ "/usr/include/stdint.h"
+ "/usr/include/stdio.h"
+ "/usr/include/stdlib.h"
+ "/usr/include/string.h"
+ "/usr/include/time.h"
+ "/usr/include/unistd.h"
+ "/usr/include/wchar.h"
+ "/usr/include/wctype.h"
+ "/usr/include/x86_64-linux-gnu/asm/errno.h"
+ "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h"
+ "/usr/include/x86_64-linux-gnu/bits/byteswap.h"
+ "/usr/include/x86_64-linux-gnu/bits/confname.h"
+ "/usr/include/x86_64-linux-gnu/bits/endian.h"
+ "/usr/include/x86_64-linux-gnu/bits/environments.h"
+ "/usr/include/x86_64-linux-gnu/bits/errno.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_val.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_valf.h"
+ "/usr/include/x86_64-linux-gnu/bits/huge_vall.h"
+ "/usr/include/x86_64-linux-gnu/bits/inf.h"
+ "/usr/include/x86_64-linux-gnu/bits/local_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/locale.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathcalls.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathdef.h"
+ "/usr/include/x86_64-linux-gnu/bits/mathinline.h"
+ "/usr/include/x86_64-linux-gnu/bits/nan.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/posix_opt.h"
+ "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h"
+ "/usr/include/x86_64-linux-gnu/bits/sched.h"
+ "/usr/include/x86_64-linux-gnu/bits/select.h"
+ "/usr/include/x86_64-linux-gnu/bits/select2.h"
+ "/usr/include/x86_64-linux-gnu/bits/setjmp.h"
+ "/usr/include/x86_64-linux-gnu/bits/sigset.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio2.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h"
+ "/usr/include/x86_64-linux-gnu/bits/stdlib.h"
+ "/usr/include/x86_64-linux-gnu/bits/string3.h"
+ "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h"
+ "/usr/include/x86_64-linux-gnu/bits/time.h"
+ "/usr/include/x86_64-linux-gnu/bits/timex.h"
+ "/usr/include/x86_64-linux-gnu/bits/types.h"
+ "/usr/include/x86_64-linux-gnu/bits/typesizes.h"
+ "/usr/include/x86_64-linux-gnu/bits/unistd.h"
+ "/usr/include/x86_64-linux-gnu/bits/waitflags.h"
+ "/usr/include/x86_64-linux-gnu/bits/waitstatus.h"
+ "/usr/include/x86_64-linux-gnu/bits/wchar.h"
+ "/usr/include/x86_64-linux-gnu/bits/wchar2.h"
+ "/usr/include/x86_64-linux-gnu/bits/wordsize.h"
+ "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h"
+ "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h"
+ "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h"
+ "/usr/include/x86_64-linux-gnu/gnu/stubs.h"
+ "/usr/include/x86_64-linux-gnu/sys/cdefs.h"
+ "/usr/include/x86_64-linux-gnu/sys/select.h"
+ "/usr/include/x86_64-linux-gnu/sys/sysmacros.h"
+ "/usr/include/x86_64-linux-gnu/sys/types.h"
+ "/usr/include/xlocale.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h"
+ "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h"
+ "/usr/local/cuda-6.5/include/CL/cl.h"
+ "/usr/local/cuda-6.5/include/CL/cl_ext.h"
+ "/usr/local/cuda-6.5/include/CL/cl_platform.h"
+ "/usr/local/cuda-6.5/include/builtin_types.h"
+ "/usr/local/cuda-6.5/include/channel_descriptor.h"
+ "/usr/local/cuda-6.5/include/common_functions.h"
+ "/usr/local/cuda-6.5/include/cuComplex.h"
+ "/usr/local/cuda-6.5/include/cublas_api.h"
+ "/usr/local/cuda-6.5/include/cublas_v2.h"
+ "/usr/local/cuda-6.5/include/cuda.h"
+ "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h"
+ "/usr/local/cuda-6.5/include/cuda_runtime.h"
+ "/usr/local/cuda-6.5/include/cuda_runtime_api.h"
+ "/usr/local/cuda-6.5/include/cuda_surface_types.h"
+ "/usr/local/cuda-6.5/include/cuda_texture_types.h"
+ "/usr/local/cuda-6.5/include/curand.h"
+ "/usr/local/cuda-6.5/include/device_functions.h"
+ "/usr/local/cuda-6.5/include/device_launch_parameters.h"
+ "/usr/local/cuda-6.5/include/device_types.h"
+ "/usr/local/cuda-6.5/include/driver_functions.h"
+ "/usr/local/cuda-6.5/include/driver_types.h"
+ "/usr/local/cuda-6.5/include/host_config.h"
+ "/usr/local/cuda-6.5/include/host_defines.h"
+ "/usr/local/cuda-6.5/include/math_functions.h"
+ "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h"
+ "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_13_double_functions.h"
+ "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_20_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_30_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_32_intrinsics.h"
+ "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h"
+ "/usr/local/cuda-6.5/include/sm_35_intrinsics.h"
+ "/usr/local/cuda-6.5/include/surface_functions.h"
+ "/usr/local/cuda-6.5/include/surface_indirect_functions.h"
+ "/usr/local/cuda-6.5/include/surface_types.h"
+ "/usr/local/cuda-6.5/include/texture_fetch_functions.h"
+ "/usr/local/cuda-6.5/include/texture_indirect_functions.h"
+ "/usr/local/cuda-6.5/include/texture_types.h"
+ "/usr/local/cuda-6.5/include/thrust/advance.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/advance.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/allocator_traits.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/allocator_traits.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/copy_construct_range.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/copy_construct_range.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/default_construct_range.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/default_construct_range.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/destroy_range.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/destroy_range.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/fill_construct_range.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/fill_construct_range.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/no_throw_allocator.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/tagged_allocator.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/tagged_allocator.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/temporary_allocator.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/allocator/temporary_allocator.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/config.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/compiler.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/compiler_fence.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/config.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/debug.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/device_system.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/forceinline.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/global_workarounds.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/hd_warning_disable.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/host_device.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/host_system.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/config/simple_defines.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/contiguous_storage.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/contiguous_storage.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/copy.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/copy.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/cstdint.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/device_free.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/device_malloc.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/device_ptr.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/device_reference.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/device_vector.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/dispatch/is_trivial_copy.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/distance.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/equal.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/execution_policy.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/extrema.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/fill.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/find.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/for_each.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/function.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/actor.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/actor.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/argument.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/composite.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/operators.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/arithmetic_operators.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/assignment_operator.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/bitwise_operators.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/compound_assignment_operators.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/logical_operators.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/operator_adaptors.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/relational_operators.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/placeholder.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/functional/value.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/generate.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/host_vector.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/internal_functional.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/malloc_and_free.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/minmax.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/mismatch.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/numeric_traits.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/overlapped_copy.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/pair.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/pointer.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/pointer.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/raw_pointer_cast.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/raw_reference_cast.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/raw_reference_cast.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/reduce.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/reference.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/reference.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/reference_forward_declaration.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/replace.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/scan.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/scatter.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/static_assert.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/swap.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/swap.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/swap_ranges.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/temporary_array.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/temporary_array.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/temporary_buffer.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/transform.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/transform_reduce.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/tuple.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/tuple_meta_transform.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/tuple_transform.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/function_traits.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_member_function.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_nested_type.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_trivial_assign.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/is_call_possible.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/is_metafunction_defined.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/iterator/is_discard_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/iterator/is_output_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/minimum_type.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/pointer_traits.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/type_traits/result_of.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/uninitialized_fill.inl"
+ "/usr/local/cuda-6.5/include/thrust/detail/use_default.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/util/align.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/util/blocking.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/vector_base.h"
+ "/usr/local/cuda-6.5/include/thrust/detail/vector_base.inl"
+ "/usr/local/cuda-6.5/include/thrust/device_free.h"
+ "/usr/local/cuda-6.5/include/thrust/device_malloc.h"
+ "/usr/local/cuda-6.5/include/thrust/device_malloc_allocator.h"
+ "/usr/local/cuda-6.5/include/thrust/device_ptr.h"
+ "/usr/local/cuda-6.5/include/thrust/device_reference.h"
+ "/usr/local/cuda-6.5/include/thrust/device_vector.h"
+ "/usr/local/cuda-6.5/include/thrust/distance.h"
+ "/usr/local/cuda-6.5/include/thrust/equal.h"
+ "/usr/local/cuda-6.5/include/thrust/extrema.h"
+ "/usr/local/cuda-6.5/include/thrust/fill.h"
+ "/usr/local/cuda-6.5/include/thrust/find.h"
+ "/usr/local/cuda-6.5/include/thrust/for_each.h"
+ "/usr/local/cuda-6.5/include/thrust/functional.h"
+ "/usr/local/cuda-6.5/include/thrust/generate.h"
+ "/usr/local/cuda-6.5/include/thrust/host_vector.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/counting_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/any_assign.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/any_system_tag.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/counting_iterator.inl"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/device_system_tag.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/discard_iterator_base.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/distance_from_result.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/host_system_tag.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/is_iterator_category.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/is_trivial_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_adaptor_base.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_category_to_system.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_category_to_traversal.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_facade_category.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_traits.inl"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_traversal_tags.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/minimum_category.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/minimum_system.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/normal_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/permutation_iterator_base.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/reverse_iterator.inl"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/reverse_iterator_base.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/tagged_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/transform_iterator.inl"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/tuple_of_iterator_references.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/universal_categories.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/zip_iterator.inl"
+ "/usr/local/cuda-6.5/include/thrust/iterator/detail/zip_iterator_base.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/discard_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/iterator_adaptor.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/iterator_categories.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/iterator_facade.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/iterator_traits.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/permutation_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/reverse_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/transform_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/iterator/zip_iterator.h"
+ "/usr/local/cuda-6.5/include/thrust/memory.h"
+ "/usr/local/cuda-6.5/include/thrust/mismatch.h"
+ "/usr/local/cuda-6.5/include/thrust/pair.h"
+ "/usr/local/cuda-6.5/include/thrust/reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/replace.h"
+ "/usr/local/cuda-6.5/include/thrust/scan.h"
+ "/usr/local/cuda-6.5/include/thrust/scatter.h"
+ "/usr/local/cuda-6.5/include/thrust/swap.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/assign_value.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/copy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/execution_policy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/extrema.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/find.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/for_each.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/generate.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/get_value.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/iter_swap.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/malloc_and_free.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/reduce_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/scan.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/scan_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/swap_ranges.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/temporary_buffer.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/transform.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/assign_value.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/block/inclusive_scan.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/block/reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_cross_system.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_cross_system.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_device_to_device.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_device_to_device.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/cuda_launch_config.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/default_decomposition.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/default_decomposition.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/alignment.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/fast_scan.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/fast_scan.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_calculator.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_calculator.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_closure.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_closure.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/uninitialized.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/error.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/execution_policy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/extern_shared_ptr.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/fill.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/fill.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/for_each.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/for_each.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/get_value.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/iter_swap.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/malloc_and_free.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_by_key.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_intervals.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_intervals.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/runtime_introspection.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/runtime_introspection.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/scan.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/scan.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/swap_ranges.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/synchronize.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/synchronize.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/transform.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/trivial_copy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/trivial_copy.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/cuda/error.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/assign_value.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/copy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/equal.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/extrema.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/fill.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/find.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/for_each.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/generate.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/get_value.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/iter_swap.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/malloc_and_free.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/mismatch.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/reduce_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/replace.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scan.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scan_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scatter.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/swap_ranges.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/temporary_buffer.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/transform.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/transform_reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/adl/uninitialized_fill.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/bad_alloc.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/errno.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/error_category.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/error_code.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/error_condition.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/advance.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/advance.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/copy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/copy.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/distance.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/distance.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/equal.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/equal.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/extrema.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/extrema.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/fill.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/find.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/find.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/for_each.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/generate.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/generate.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/memory.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/memory.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/mismatch.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/mismatch.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce_by_key.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/replace.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/replace.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan_by_key.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scatter.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scatter.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/select_system.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/swap_ranges.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/swap_ranges.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/tag.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/temporary_buffer.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/temporary_buffer.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform_reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform_reduce.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/type_traits.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/uninitialized_fill.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/generic/uninitialized_fill.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/decompose.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/copy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/copy.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/extrema.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/find.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/for_each.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/general_copy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/reduce_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/scan.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/scan_by_key.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/trivial_copy.h"
+ "/usr/local/cuda-6.5/include/thrust/system/detail/system_error.inl"
+ "/usr/local/cuda-6.5/include/thrust/system/error_code.h"
+ "/usr/local/cuda-6.5/include/thrust/system/system_error.h"
+ "/usr/local/cuda-6.5/include/thrust/system_error.h"
+ "/usr/local/cuda-6.5/include/thrust/transform.h"
+ "/usr/local/cuda-6.5/include/thrust/transform_reduce.h"
+ "/usr/local/cuda-6.5/include/thrust/tuple.h"
+ "/usr/local/cuda-6.5/include/thrust/uninitialized_fill.h"
+ "/usr/local/cuda-6.5/include/vector_functions.h"
+ "/usr/local/cuda-6.5/include/vector_types.h"
+ "/usr/local/include/boost/assert.hpp"
+ "/usr/local/include/boost/checked_delete.hpp"
+ "/usr/local/include/boost/config.hpp"
+ "/usr/local/include/boost/config/compiler/gcc.hpp"
+ "/usr/local/include/boost/config/compiler/nvcc.hpp"
+ "/usr/local/include/boost/config/no_tr1/memory.hpp"
+ "/usr/local/include/boost/config/no_tr1/utility.hpp"
+ "/usr/local/include/boost/config/platform/linux.hpp"
+ "/usr/local/include/boost/config/posix_features.hpp"
+ "/usr/local/include/boost/config/select_compiler_config.hpp"
+ "/usr/local/include/boost/config/select_platform_config.hpp"
+ "/usr/local/include/boost/config/select_stdlib_config.hpp"
+ "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp"
+ "/usr/local/include/boost/config/suffix.hpp"
+ "/usr/local/include/boost/config/user.hpp"
+ "/usr/local/include/boost/core/checked_delete.hpp"
+ "/usr/local/include/boost/core/demangle.hpp"
+ "/usr/local/include/boost/core/typeinfo.hpp"
+ "/usr/local/include/boost/current_function.hpp"
+ "/usr/local/include/boost/detail/sp_typeinfo.hpp"
+ "/usr/local/include/boost/detail/workaround.hpp"
+ "/usr/local/include/boost/exception/exception.hpp"
+ "/usr/local/include/boost/predef.h"
+ "/usr/local/include/boost/predef/architecture.h"
+ "/usr/local/include/boost/predef/architecture/alpha.h"
+ "/usr/local/include/boost/predef/architecture/arm.h"
+ "/usr/local/include/boost/predef/architecture/blackfin.h"
+ "/usr/local/include/boost/predef/architecture/convex.h"
+ "/usr/local/include/boost/predef/architecture/ia64.h"
+ "/usr/local/include/boost/predef/architecture/m68k.h"
+ "/usr/local/include/boost/predef/architecture/mips.h"
+ "/usr/local/include/boost/predef/architecture/parisc.h"
+ "/usr/local/include/boost/predef/architecture/ppc.h"
+ "/usr/local/include/boost/predef/architecture/pyramid.h"
+ "/usr/local/include/boost/predef/architecture/rs6k.h"
+ "/usr/local/include/boost/predef/architecture/sparc.h"
+ "/usr/local/include/boost/predef/architecture/superh.h"
+ "/usr/local/include/boost/predef/architecture/sys370.h"
+ "/usr/local/include/boost/predef/architecture/sys390.h"
+ "/usr/local/include/boost/predef/architecture/x86.h"
+ "/usr/local/include/boost/predef/architecture/x86/32.h"
+ "/usr/local/include/boost/predef/architecture/x86/64.h"
+ "/usr/local/include/boost/predef/architecture/z.h"
+ "/usr/local/include/boost/predef/compiler.h"
+ "/usr/local/include/boost/predef/compiler/borland.h"
+ "/usr/local/include/boost/predef/compiler/clang.h"
+ "/usr/local/include/boost/predef/compiler/comeau.h"
+ "/usr/local/include/boost/predef/compiler/compaq.h"
+ "/usr/local/include/boost/predef/compiler/diab.h"
+ "/usr/local/include/boost/predef/compiler/digitalmars.h"
+ "/usr/local/include/boost/predef/compiler/dignus.h"
+ "/usr/local/include/boost/predef/compiler/edg.h"
+ "/usr/local/include/boost/predef/compiler/ekopath.h"
+ "/usr/local/include/boost/predef/compiler/gcc.h"
+ "/usr/local/include/boost/predef/compiler/gcc_xml.h"
+ "/usr/local/include/boost/predef/compiler/greenhills.h"
+ "/usr/local/include/boost/predef/compiler/hp_acc.h"
+ "/usr/local/include/boost/predef/compiler/iar.h"
+ "/usr/local/include/boost/predef/compiler/ibm.h"
+ "/usr/local/include/boost/predef/compiler/intel.h"
+ "/usr/local/include/boost/predef/compiler/kai.h"
+ "/usr/local/include/boost/predef/compiler/llvm.h"
+ "/usr/local/include/boost/predef/compiler/metaware.h"
+ "/usr/local/include/boost/predef/compiler/metrowerks.h"
+ "/usr/local/include/boost/predef/compiler/microtec.h"
+ "/usr/local/include/boost/predef/compiler/mpw.h"
+ "/usr/local/include/boost/predef/compiler/palm.h"
+ "/usr/local/include/boost/predef/compiler/pgi.h"
+ "/usr/local/include/boost/predef/compiler/sgi_mipspro.h"
+ "/usr/local/include/boost/predef/compiler/sunpro.h"
+ "/usr/local/include/boost/predef/compiler/tendra.h"
+ "/usr/local/include/boost/predef/compiler/visualc.h"
+ "/usr/local/include/boost/predef/compiler/watcom.h"
+ "/usr/local/include/boost/predef/detail/_cassert.h"
+ "/usr/local/include/boost/predef/detail/_exception.h"
+ "/usr/local/include/boost/predef/detail/comp_detected.h"
+ "/usr/local/include/boost/predef/detail/os_detected.h"
+ "/usr/local/include/boost/predef/detail/test.h"
+ "/usr/local/include/boost/predef/language.h"
+ "/usr/local/include/boost/predef/language/objc.h"
+ "/usr/local/include/boost/predef/language/stdc.h"
+ "/usr/local/include/boost/predef/language/stdcpp.h"
+ "/usr/local/include/boost/predef/library.h"
+ "/usr/local/include/boost/predef/library/c.h"
+ "/usr/local/include/boost/predef/library/c/_prefix.h"
+ "/usr/local/include/boost/predef/library/c/gnu.h"
+ "/usr/local/include/boost/predef/library/c/uc.h"
+ "/usr/local/include/boost/predef/library/c/vms.h"
+ "/usr/local/include/boost/predef/library/c/zos.h"
+ "/usr/local/include/boost/predef/library/std.h"
+ "/usr/local/include/boost/predef/library/std/_prefix.h"
+ "/usr/local/include/boost/predef/library/std/cxx.h"
+ "/usr/local/include/boost/predef/library/std/dinkumware.h"
+ "/usr/local/include/boost/predef/library/std/libcomo.h"
+ "/usr/local/include/boost/predef/library/std/modena.h"
+ "/usr/local/include/boost/predef/library/std/msl.h"
+ "/usr/local/include/boost/predef/library/std/roguewave.h"
+ "/usr/local/include/boost/predef/library/std/sgi.h"
+ "/usr/local/include/boost/predef/library/std/stdcpp3.h"
+ "/usr/local/include/boost/predef/library/std/stlport.h"
+ "/usr/local/include/boost/predef/library/std/vacpp.h"
+ "/usr/local/include/boost/predef/make.h"
+ "/usr/local/include/boost/predef/os.h"
+ "/usr/local/include/boost/predef/os/aix.h"
+ "/usr/local/include/boost/predef/os/amigaos.h"
+ "/usr/local/include/boost/predef/os/android.h"
+ "/usr/local/include/boost/predef/os/beos.h"
+ "/usr/local/include/boost/predef/os/bsd.h"
+ "/usr/local/include/boost/predef/os/bsd/bsdi.h"
+ "/usr/local/include/boost/predef/os/bsd/dragonfly.h"
+ "/usr/local/include/boost/predef/os/bsd/free.h"
+ "/usr/local/include/boost/predef/os/bsd/net.h"
+ "/usr/local/include/boost/predef/os/bsd/open.h"
+ "/usr/local/include/boost/predef/os/cygwin.h"
+ "/usr/local/include/boost/predef/os/hpux.h"
+ "/usr/local/include/boost/predef/os/ios.h"
+ "/usr/local/include/boost/predef/os/irix.h"
+ "/usr/local/include/boost/predef/os/linux.h"
+ "/usr/local/include/boost/predef/os/macos.h"
+ "/usr/local/include/boost/predef/os/os400.h"
+ "/usr/local/include/boost/predef/os/qnxnto.h"
+ "/usr/local/include/boost/predef/os/solaris.h"
+ "/usr/local/include/boost/predef/os/unix.h"
+ "/usr/local/include/boost/predef/os/vms.h"
+ "/usr/local/include/boost/predef/os/windows.h"
+ "/usr/local/include/boost/predef/other.h"
+ "/usr/local/include/boost/predef/other/endian.h"
+ "/usr/local/include/boost/predef/platform.h"
+ "/usr/local/include/boost/predef/platform/mingw.h"
+ "/usr/local/include/boost/predef/platform/windows_desktop.h"
+ "/usr/local/include/boost/predef/platform/windows_phone.h"
+ "/usr/local/include/boost/predef/platform/windows_runtime.h"
+ "/usr/local/include/boost/predef/platform/windows_store.h"
+ "/usr/local/include/boost/predef/version_number.h"
+ "/usr/local/include/boost/shared_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp"
+ "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp"
+ "/usr/local/include/boost/smart_ptr/shared_ptr.hpp"
+ "/usr/local/include/boost/throw_exception.hpp"
+ "/usr/local/include/gflags/gflags.h"
+ "/usr/local/include/gflags/gflags_declare.h"
+ "/usr/local/include/glog/log_severity.h"
+ "/usr/local/include/glog/logging.h"
+ "/usr/local/include/glog/vlog_is_on.h"
+)
+
diff --git a/src/caffe/CMakeFiles/progress.marks b/src/caffe/CMakeFiles/progress.marks
new file mode 100644
index 00000000..abdfb053
--- /dev/null
+++ b/src/caffe/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+60
diff --git a/src/caffe/CMakeFiles/proto.dir/CXX.includecache b/src/caffe/CMakeFiles/proto.dir/CXX.includecache
new file mode 100644
index 00000000..df68b9a9
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/CXX.includecache
@@ -0,0 +1,48 @@
+#IncludeRegexLine: ^[ 	]*#[ 	]*(include|import)[ 	]*[<"]([^">]+)([">])
+
+#IncludeRegexScan: ^.*$
+
+#IncludeRegexComplain: ^$
+
+#IncludeRegexTransform: 
+
+/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc
+caffe.pb.h
+/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h
+algorithm
+-
+google/protobuf/stubs/common.h
+-
+google/protobuf/stubs/once.h
+-
+google/protobuf/io/coded_stream.h
+-
+google/protobuf/wire_format_lite_inl.h
+-
+google/protobuf/descriptor.h
+-
+google/protobuf/generated_message_reflection.h
+-
+google/protobuf/reflection_ops.h
+-
+google/protobuf/wire_format.h
+-
+
+/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h
+string
+-
+google/protobuf/stubs/common.h
+-
+google/protobuf/generated_message_util.h
+-
+google/protobuf/message.h
+-
+google/protobuf/repeated_field.h
+-
+google/protobuf/extension_set.h
+-
+google/protobuf/generated_enum_reflection.h
+-
+google/protobuf/unknown_field_set.h
+-
+
diff --git a/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake b/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake
new file mode 100644
index 00000000..44c81e52
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake
@@ -0,0 +1,39 @@
+# The set of languages for which implicit dependencies are needed:
+SET(CMAKE_DEPENDS_LANGUAGES
+  "CXX"
+  )
+# The set of files for implicit dependencies of each language:
+SET(CMAKE_DEPENDS_CHECK_CXX
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o"
+  )
+SET(CMAKE_CXX_COMPILER_ID "GNU")
+
+# Preprocessor definitions for this target.
+SET(CMAKE_TARGET_DEFINITIONS
+  "GTEST_USE_OWN_TR1_TUPLE"
+  )
+
+# Pairs of files generated by the same build rule.
+SET(CMAKE_MULTIPLE_OUTPUT_PAIRS
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe_pb2.py" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc"
+  )
+
+
+# Targets to which this target links.
+SET(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# The include file search paths:
+SET(CMAKE_C_TARGET_INCLUDE_PATH
+  "src"
+  "/usr/local/include"
+  "include"
+  "/usr/local/cuda/include"
+  "/usr/local/include/opencv"
+  "/usr/include/atlas"
+  "."
+  )
+SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/caffe/CMakeFiles/proto.dir/build.make b/src/caffe/CMakeFiles/proto.dir/build.make
new file mode 100644
index 00000000..1467c124
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/build.make
@@ -0,0 +1,119 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# Include any dependencies generated for this target.
+include src/caffe/CMakeFiles/proto.dir/depend.make
+
+# Include the progress variables for this target.
+include src/caffe/CMakeFiles/proto.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include src/caffe/CMakeFiles/proto.dir/flags.make
+
+include/caffe/proto/caffe.pb.cc: src/caffe/proto/caffe.proto
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Running C++/Python protocol buffer compiler on /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/protoc --cpp_out /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto -I /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/protoc --python_out /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto -I /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto
+
+include/caffe/proto/caffe.pb.h: include/caffe/proto/caffe.pb.cc
+
+include/caffe/proto/caffe_pb2.py: include/caffe/proto/caffe.pb.cc
+
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: src/caffe/CMakeFiles/proto.dir/flags.make
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.cc
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc
+
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc > CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i
+
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc -o CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s
+
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires:
+.PHONY : src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires
+
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires
+	$(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides.build
+.PHONY : src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides
+
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides.build: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
+
+# Object files for target proto
+proto_OBJECTS = \
+"CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o"
+
+# External object files for target proto
+proto_EXTERNAL_OBJECTS =
+
+lib/libproto.a: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
+lib/libproto.a: src/caffe/CMakeFiles/proto.dir/build.make
+lib/libproto.a: src/caffe/CMakeFiles/proto.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX static library ../../lib/libproto.a"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/proto.dir/cmake_clean_target.cmake
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/proto.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+src/caffe/CMakeFiles/proto.dir/build: lib/libproto.a
+.PHONY : src/caffe/CMakeFiles/proto.dir/build
+
+src/caffe/CMakeFiles/proto.dir/requires: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires
+.PHONY : src/caffe/CMakeFiles/proto.dir/requires
+
+src/caffe/CMakeFiles/proto.dir/clean:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/proto.dir/cmake_clean.cmake
+.PHONY : src/caffe/CMakeFiles/proto.dir/clean
+
+src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe.pb.cc
+src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe.pb.h
+src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe_pb2.py
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : src/caffe/CMakeFiles/proto.dir/depend
+
diff --git a/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake b/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake
new file mode 100644
index 00000000..79cb425a
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake
@@ -0,0 +1,13 @@
+FILE(REMOVE_RECURSE
+  "../../include/caffe/proto/caffe.pb.cc"
+  "../../include/caffe/proto/caffe.pb.h"
+  "../../include/caffe/proto/caffe_pb2.py"
+  "CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o"
+  "../../lib/libproto.pdb"
+  "../../lib/libproto.a"
+)
+
+# Per-language clean rules from dependency scanning.
+FOREACH(lang CXX)
+  INCLUDE(CMakeFiles/proto.dir/cmake_clean_${lang}.cmake OPTIONAL)
+ENDFOREACH(lang)
diff --git a/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake b/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake
new file mode 100644
index 00000000..6172b692
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake
@@ -0,0 +1,3 @@
+FILE(REMOVE_RECURSE
+  "../../lib/libproto.a"
+)
diff --git a/src/caffe/CMakeFiles/proto.dir/depend.internal b/src/caffe/CMakeFiles/proto.dir/depend.internal
new file mode 100644
index 00000000..2f8ec677
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/depend.internal
@@ -0,0 +1,6 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
+ /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc
+ /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h
diff --git a/src/caffe/CMakeFiles/proto.dir/depend.make b/src/caffe/CMakeFiles/proto.dir/depend.make
new file mode 100644
index 00000000..239c4242
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/depend.make
@@ -0,0 +1,6 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.cc
+src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.h
+
diff --git a/src/caffe/CMakeFiles/proto.dir/flags.make b/src/caffe/CMakeFiles/proto.dir/flags.make
new file mode 100644
index 00000000..8b4ef992
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/flags.make
@@ -0,0 +1,8 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# compile CXX with /usr/bin/c++
+CXX_FLAGS =  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe   
+
+CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE
+
diff --git a/src/caffe/CMakeFiles/proto.dir/link.txt b/src/caffe/CMakeFiles/proto.dir/link.txt
new file mode 100644
index 00000000..42f85bda
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/link.txt
@@ -0,0 +1,2 @@
+/usr/bin/ar cr ../../lib/libproto.a  CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
+/usr/bin/ranlib ../../lib/libproto.a
diff --git a/src/caffe/CMakeFiles/proto.dir/progress.make b/src/caffe/CMakeFiles/proto.dir/progress.make
new file mode 100644
index 00000000..25d32761
--- /dev/null
+++ b/src/caffe/CMakeFiles/proto.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 67
+CMAKE_PROGRESS_2 = 
+
diff --git a/src/caffe/Makefile b/src/caffe/Makefile
new file mode 100644
index 00000000..fff490de
--- /dev/null
+++ b/src/caffe/Makefile
@@ -0,0 +1,2279 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: install/local
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: install/strip
+.PHONY : install/strip/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/progress.marks
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+src/caffe/CMakeFiles/caffe.dir/rule:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/CMakeFiles/caffe.dir/rule
+.PHONY : src/caffe/CMakeFiles/caffe.dir/rule
+
+# Convenience name for target.
+caffe: src/caffe/CMakeFiles/caffe.dir/rule
+.PHONY : caffe
+
+# fast build rule for target.
+caffe/fast:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/build
+.PHONY : caffe/fast
+
+# Convenience name for target.
+src/caffe/CMakeFiles/proto.dir/rule:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/CMakeFiles/proto.dir/rule
+.PHONY : src/caffe/CMakeFiles/proto.dir/rule
+
+# Convenience name for target.
+proto: src/caffe/CMakeFiles/proto.dir/rule
+.PHONY : proto
+
+# fast build rule for target.
+proto/fast:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/build
+.PHONY : proto/fast
+
+__/__/include/caffe/proto/caffe.pb.o: __/__/include/caffe/proto/caffe.pb.cc.o
+.PHONY : __/__/include/caffe/proto/caffe.pb.o
+
+# target to build an object file
+__/__/include/caffe/proto/caffe.pb.cc.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
+.PHONY : __/__/include/caffe/proto/caffe.pb.cc.o
+
+__/__/include/caffe/proto/caffe.pb.i: __/__/include/caffe/proto/caffe.pb.cc.i
+.PHONY : __/__/include/caffe/proto/caffe.pb.i
+
+# target to preprocess a source file
+__/__/include/caffe/proto/caffe.pb.cc.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i
+.PHONY : __/__/include/caffe/proto/caffe.pb.cc.i
+
+__/__/include/caffe/proto/caffe.pb.s: __/__/include/caffe/proto/caffe.pb.cc.s
+.PHONY : __/__/include/caffe/proto/caffe.pb.s
+
+# target to generate assembly for a file
+__/__/include/caffe/proto/caffe.pb.cc.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s
+.PHONY : __/__/include/caffe/proto/caffe.pb.cc.s
+
+blob.o: blob.cpp.o
+.PHONY : blob.o
+
+# target to build an object file
+blob.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.o
+.PHONY : blob.cpp.o
+
+blob.i: blob.cpp.i
+.PHONY : blob.i
+
+# target to preprocess a source file
+blob.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.i
+.PHONY : blob.cpp.i
+
+blob.s: blob.cpp.s
+.PHONY : blob.s
+
+# target to generate assembly for a file
+blob.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.s
+.PHONY : blob.cpp.s
+
+common.o: common.cpp.o
+.PHONY : common.o
+
+# target to build an object file
+common.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.o
+.PHONY : common.cpp.o
+
+common.i: common.cpp.i
+.PHONY : common.i
+
+# target to preprocess a source file
+common.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.i
+.PHONY : common.cpp.i
+
+common.s: common.cpp.s
+.PHONY : common.s
+
+# target to generate assembly for a file
+common.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.s
+.PHONY : common.cpp.s
+
+data_transformer.o: data_transformer.cpp.o
+.PHONY : data_transformer.o
+
+# target to build an object file
+data_transformer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o
+.PHONY : data_transformer.cpp.o
+
+data_transformer.i: data_transformer.cpp.i
+.PHONY : data_transformer.i
+
+# target to preprocess a source file
+data_transformer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.i
+.PHONY : data_transformer.cpp.i
+
+data_transformer.s: data_transformer.cpp.s
+.PHONY : data_transformer.s
+
+# target to generate assembly for a file
+data_transformer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.s
+.PHONY : data_transformer.cpp.s
+
+device.o: device.cpp.o
+.PHONY : device.o
+
+# target to build an object file
+device.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.o
+.PHONY : device.cpp.o
+
+device.i: device.cpp.i
+.PHONY : device.i
+
+# target to preprocess a source file
+device.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.i
+.PHONY : device.cpp.i
+
+device.s: device.cpp.s
+.PHONY : device.s
+
+# target to generate assembly for a file
+device.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.s
+.PHONY : device.cpp.s
+
+internal_thread.o: internal_thread.cpp.o
+.PHONY : internal_thread.o
+
+# target to build an object file
+internal_thread.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o
+.PHONY : internal_thread.cpp.o
+
+internal_thread.i: internal_thread.cpp.i
+.PHONY : internal_thread.i
+
+# target to preprocess a source file
+internal_thread.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.i
+.PHONY : internal_thread.cpp.i
+
+internal_thread.s: internal_thread.cpp.s
+.PHONY : internal_thread.s
+
+# target to generate assembly for a file
+internal_thread.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.s
+.PHONY : internal_thread.cpp.s
+
+layer_factory.o: layer_factory.cpp.o
+.PHONY : layer_factory.o
+
+# target to build an object file
+layer_factory.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o
+.PHONY : layer_factory.cpp.o
+
+layer_factory.i: layer_factory.cpp.i
+.PHONY : layer_factory.i
+
+# target to preprocess a source file
+layer_factory.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.i
+.PHONY : layer_factory.cpp.i
+
+layer_factory.s: layer_factory.cpp.s
+.PHONY : layer_factory.s
+
+# target to generate assembly for a file
+layer_factory.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.s
+.PHONY : layer_factory.cpp.s
+
+layers/absval_layer.o: layers/absval_layer.cpp.o
+.PHONY : layers/absval_layer.o
+
+# target to build an object file
+layers/absval_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o
+.PHONY : layers/absval_layer.cpp.o
+
+layers/absval_layer.i: layers/absval_layer.cpp.i
+.PHONY : layers/absval_layer.i
+
+# target to preprocess a source file
+layers/absval_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.i
+.PHONY : layers/absval_layer.cpp.i
+
+layers/absval_layer.s: layers/absval_layer.cpp.s
+.PHONY : layers/absval_layer.s
+
+# target to generate assembly for a file
+layers/absval_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.s
+.PHONY : layers/absval_layer.cpp.s
+
+layers/accuracy_layer.o: layers/accuracy_layer.cpp.o
+.PHONY : layers/accuracy_layer.o
+
+# target to build an object file
+layers/accuracy_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o
+.PHONY : layers/accuracy_layer.cpp.o
+
+layers/accuracy_layer.i: layers/accuracy_layer.cpp.i
+.PHONY : layers/accuracy_layer.i
+
+# target to preprocess a source file
+layers/accuracy_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i
+.PHONY : layers/accuracy_layer.cpp.i
+
+layers/accuracy_layer.s: layers/accuracy_layer.cpp.s
+.PHONY : layers/accuracy_layer.s
+
+# target to generate assembly for a file
+layers/accuracy_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s
+.PHONY : layers/accuracy_layer.cpp.s
+
+layers/argmax_layer.o: layers/argmax_layer.cpp.o
+.PHONY : layers/argmax_layer.o
+
+# target to build an object file
+layers/argmax_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o
+.PHONY : layers/argmax_layer.cpp.o
+
+layers/argmax_layer.i: layers/argmax_layer.cpp.i
+.PHONY : layers/argmax_layer.i
+
+# target to preprocess a source file
+layers/argmax_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i
+.PHONY : layers/argmax_layer.cpp.i
+
+layers/argmax_layer.s: layers/argmax_layer.cpp.s
+.PHONY : layers/argmax_layer.s
+
+# target to generate assembly for a file
+layers/argmax_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s
+.PHONY : layers/argmax_layer.cpp.s
+
+layers/base_conv_layer.o: layers/base_conv_layer.cpp.o
+.PHONY : layers/base_conv_layer.o
+
+# target to build an object file
+layers/base_conv_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o
+.PHONY : layers/base_conv_layer.cpp.o
+
+layers/base_conv_layer.i: layers/base_conv_layer.cpp.i
+.PHONY : layers/base_conv_layer.i
+
+# target to preprocess a source file
+layers/base_conv_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i
+.PHONY : layers/base_conv_layer.cpp.i
+
+layers/base_conv_layer.s: layers/base_conv_layer.cpp.s
+.PHONY : layers/base_conv_layer.s
+
+# target to generate assembly for a file
+layers/base_conv_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s
+.PHONY : layers/base_conv_layer.cpp.s
+
+layers/base_data_layer.o: layers/base_data_layer.cpp.o
+.PHONY : layers/base_data_layer.o
+
+# target to build an object file
+layers/base_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o
+.PHONY : layers/base_data_layer.cpp.o
+
+layers/base_data_layer.i: layers/base_data_layer.cpp.i
+.PHONY : layers/base_data_layer.i
+
+# target to preprocess a source file
+layers/base_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i
+.PHONY : layers/base_data_layer.cpp.i
+
+layers/base_data_layer.s: layers/base_data_layer.cpp.s
+.PHONY : layers/base_data_layer.s
+
+# target to generate assembly for a file
+layers/base_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s
+.PHONY : layers/base_data_layer.cpp.s
+
+layers/bnll_layer.o: layers/bnll_layer.cpp.o
+.PHONY : layers/bnll_layer.o
+
+# target to build an object file
+layers/bnll_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o
+.PHONY : layers/bnll_layer.cpp.o
+
+layers/bnll_layer.i: layers/bnll_layer.cpp.i
+.PHONY : layers/bnll_layer.i
+
+# target to preprocess a source file
+layers/bnll_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i
+.PHONY : layers/bnll_layer.cpp.i
+
+layers/bnll_layer.s: layers/bnll_layer.cpp.s
+.PHONY : layers/bnll_layer.s
+
+# target to generate assembly for a file
+layers/bnll_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s
+.PHONY : layers/bnll_layer.cpp.s
+
+layers/concat_layer.o: layers/concat_layer.cpp.o
+.PHONY : layers/concat_layer.o
+
+# target to build an object file
+layers/concat_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o
+.PHONY : layers/concat_layer.cpp.o
+
+layers/concat_layer.i: layers/concat_layer.cpp.i
+.PHONY : layers/concat_layer.i
+
+# target to preprocess a source file
+layers/concat_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.i
+.PHONY : layers/concat_layer.cpp.i
+
+layers/concat_layer.s: layers/concat_layer.cpp.s
+.PHONY : layers/concat_layer.s
+
+# target to generate assembly for a file
+layers/concat_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.s
+.PHONY : layers/concat_layer.cpp.s
+
+layers/contrastive_loss_layer.o: layers/contrastive_loss_layer.cpp.o
+.PHONY : layers/contrastive_loss_layer.o
+
+# target to build an object file
+layers/contrastive_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o
+.PHONY : layers/contrastive_loss_layer.cpp.o
+
+layers/contrastive_loss_layer.i: layers/contrastive_loss_layer.cpp.i
+.PHONY : layers/contrastive_loss_layer.i
+
+# target to preprocess a source file
+layers/contrastive_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i
+.PHONY : layers/contrastive_loss_layer.cpp.i
+
+layers/contrastive_loss_layer.s: layers/contrastive_loss_layer.cpp.s
+.PHONY : layers/contrastive_loss_layer.s
+
+# target to generate assembly for a file
+layers/contrastive_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s
+.PHONY : layers/contrastive_loss_layer.cpp.s
+
+layers/conv_layer.o: layers/conv_layer.cpp.o
+.PHONY : layers/conv_layer.o
+
+# target to build an object file
+layers/conv_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o
+.PHONY : layers/conv_layer.cpp.o
+
+layers/conv_layer.i: layers/conv_layer.cpp.i
+.PHONY : layers/conv_layer.i
+
+# target to preprocess a source file
+layers/conv_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.i
+.PHONY : layers/conv_layer.cpp.i
+
+layers/conv_layer.s: layers/conv_layer.cpp.s
+.PHONY : layers/conv_layer.s
+
+# target to generate assembly for a file
+layers/conv_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.s
+.PHONY : layers/conv_layer.cpp.s
+
+layers/cudnn_conv_layer.o: layers/cudnn_conv_layer.cpp.o
+.PHONY : layers/cudnn_conv_layer.o
+
+# target to build an object file
+layers/cudnn_conv_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o
+.PHONY : layers/cudnn_conv_layer.cpp.o
+
+layers/cudnn_conv_layer.i: layers/cudnn_conv_layer.cpp.i
+.PHONY : layers/cudnn_conv_layer.i
+
+# target to preprocess a source file
+layers/cudnn_conv_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i
+.PHONY : layers/cudnn_conv_layer.cpp.i
+
+layers/cudnn_conv_layer.s: layers/cudnn_conv_layer.cpp.s
+.PHONY : layers/cudnn_conv_layer.s
+
+# target to generate assembly for a file
+layers/cudnn_conv_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s
+.PHONY : layers/cudnn_conv_layer.cpp.s
+
+layers/cudnn_pooling_layer.o: layers/cudnn_pooling_layer.cpp.o
+.PHONY : layers/cudnn_pooling_layer.o
+
+# target to build an object file
+layers/cudnn_pooling_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o
+.PHONY : layers/cudnn_pooling_layer.cpp.o
+
+layers/cudnn_pooling_layer.i: layers/cudnn_pooling_layer.cpp.i
+.PHONY : layers/cudnn_pooling_layer.i
+
+# target to preprocess a source file
+layers/cudnn_pooling_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i
+.PHONY : layers/cudnn_pooling_layer.cpp.i
+
+layers/cudnn_pooling_layer.s: layers/cudnn_pooling_layer.cpp.s
+.PHONY : layers/cudnn_pooling_layer.s
+
+# target to generate assembly for a file
+layers/cudnn_pooling_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s
+.PHONY : layers/cudnn_pooling_layer.cpp.s
+
+layers/cudnn_relu_layer.o: layers/cudnn_relu_layer.cpp.o
+.PHONY : layers/cudnn_relu_layer.o
+
+# target to build an object file
+layers/cudnn_relu_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o
+.PHONY : layers/cudnn_relu_layer.cpp.o
+
+layers/cudnn_relu_layer.i: layers/cudnn_relu_layer.cpp.i
+.PHONY : layers/cudnn_relu_layer.i
+
+# target to preprocess a source file
+layers/cudnn_relu_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i
+.PHONY : layers/cudnn_relu_layer.cpp.i
+
+layers/cudnn_relu_layer.s: layers/cudnn_relu_layer.cpp.s
+.PHONY : layers/cudnn_relu_layer.s
+
+# target to generate assembly for a file
+layers/cudnn_relu_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s
+.PHONY : layers/cudnn_relu_layer.cpp.s
+
+layers/cudnn_sigmoid_layer.o: layers/cudnn_sigmoid_layer.cpp.o
+.PHONY : layers/cudnn_sigmoid_layer.o
+
+# target to build an object file
+layers/cudnn_sigmoid_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o
+.PHONY : layers/cudnn_sigmoid_layer.cpp.o
+
+layers/cudnn_sigmoid_layer.i: layers/cudnn_sigmoid_layer.cpp.i
+.PHONY : layers/cudnn_sigmoid_layer.i
+
+# target to preprocess a source file
+layers/cudnn_sigmoid_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i
+.PHONY : layers/cudnn_sigmoid_layer.cpp.i
+
+layers/cudnn_sigmoid_layer.s: layers/cudnn_sigmoid_layer.cpp.s
+.PHONY : layers/cudnn_sigmoid_layer.s
+
+# target to generate assembly for a file
+layers/cudnn_sigmoid_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s
+.PHONY : layers/cudnn_sigmoid_layer.cpp.s
+
+layers/cudnn_softmax_layer.o: layers/cudnn_softmax_layer.cpp.o
+.PHONY : layers/cudnn_softmax_layer.o
+
+# target to build an object file
+layers/cudnn_softmax_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o
+.PHONY : layers/cudnn_softmax_layer.cpp.o
+
+layers/cudnn_softmax_layer.i: layers/cudnn_softmax_layer.cpp.i
+.PHONY : layers/cudnn_softmax_layer.i
+
+# target to preprocess a source file
+layers/cudnn_softmax_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i
+.PHONY : layers/cudnn_softmax_layer.cpp.i
+
+layers/cudnn_softmax_layer.s: layers/cudnn_softmax_layer.cpp.s
+.PHONY : layers/cudnn_softmax_layer.s
+
+# target to generate assembly for a file
+layers/cudnn_softmax_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s
+.PHONY : layers/cudnn_softmax_layer.cpp.s
+
+layers/cudnn_tanh_layer.o: layers/cudnn_tanh_layer.cpp.o
+.PHONY : layers/cudnn_tanh_layer.o
+
+# target to build an object file
+layers/cudnn_tanh_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o
+.PHONY : layers/cudnn_tanh_layer.cpp.o
+
+layers/cudnn_tanh_layer.i: layers/cudnn_tanh_layer.cpp.i
+.PHONY : layers/cudnn_tanh_layer.i
+
+# target to preprocess a source file
+layers/cudnn_tanh_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i
+.PHONY : layers/cudnn_tanh_layer.cpp.i
+
+layers/cudnn_tanh_layer.s: layers/cudnn_tanh_layer.cpp.s
+.PHONY : layers/cudnn_tanh_layer.s
+
+# target to generate assembly for a file
+layers/cudnn_tanh_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s
+.PHONY : layers/cudnn_tanh_layer.cpp.s
+
+layers/data_layer.o: layers/data_layer.cpp.o
+.PHONY : layers/data_layer.o
+
+# target to build an object file
+layers/data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o
+.PHONY : layers/data_layer.cpp.o
+
+layers/data_layer.i: layers/data_layer.cpp.i
+.PHONY : layers/data_layer.i
+
+# target to preprocess a source file
+layers/data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.i
+.PHONY : layers/data_layer.cpp.i
+
+layers/data_layer.s: layers/data_layer.cpp.s
+.PHONY : layers/data_layer.s
+
+# target to generate assembly for a file
+layers/data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.s
+.PHONY : layers/data_layer.cpp.s
+
+layers/deconv_layer.o: layers/deconv_layer.cpp.o
+.PHONY : layers/deconv_layer.o
+
+# target to build an object file
+layers/deconv_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o
+.PHONY : layers/deconv_layer.cpp.o
+
+layers/deconv_layer.i: layers/deconv_layer.cpp.i
+.PHONY : layers/deconv_layer.i
+
+# target to preprocess a source file
+layers/deconv_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i
+.PHONY : layers/deconv_layer.cpp.i
+
+layers/deconv_layer.s: layers/deconv_layer.cpp.s
+.PHONY : layers/deconv_layer.s
+
+# target to generate assembly for a file
+layers/deconv_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s
+.PHONY : layers/deconv_layer.cpp.s
+
+layers/dropout_layer.o: layers/dropout_layer.cpp.o
+.PHONY : layers/dropout_layer.o
+
+# target to build an object file
+layers/dropout_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o
+.PHONY : layers/dropout_layer.cpp.o
+
+layers/dropout_layer.i: layers/dropout_layer.cpp.i
+.PHONY : layers/dropout_layer.i
+
+# target to preprocess a source file
+layers/dropout_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i
+.PHONY : layers/dropout_layer.cpp.i
+
+layers/dropout_layer.s: layers/dropout_layer.cpp.s
+.PHONY : layers/dropout_layer.s
+
+# target to generate assembly for a file
+layers/dropout_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s
+.PHONY : layers/dropout_layer.cpp.s
+
+layers/dummy_data_layer.o: layers/dummy_data_layer.cpp.o
+.PHONY : layers/dummy_data_layer.o
+
+# target to build an object file
+layers/dummy_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o
+.PHONY : layers/dummy_data_layer.cpp.o
+
+layers/dummy_data_layer.i: layers/dummy_data_layer.cpp.i
+.PHONY : layers/dummy_data_layer.i
+
+# target to preprocess a source file
+layers/dummy_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i
+.PHONY : layers/dummy_data_layer.cpp.i
+
+layers/dummy_data_layer.s: layers/dummy_data_layer.cpp.s
+.PHONY : layers/dummy_data_layer.s
+
+# target to generate assembly for a file
+layers/dummy_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s
+.PHONY : layers/dummy_data_layer.cpp.s
+
+layers/eltwise_layer.o: layers/eltwise_layer.cpp.o
+.PHONY : layers/eltwise_layer.o
+
+# target to build an object file
+layers/eltwise_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o
+.PHONY : layers/eltwise_layer.cpp.o
+
+layers/eltwise_layer.i: layers/eltwise_layer.cpp.i
+.PHONY : layers/eltwise_layer.i
+
+# target to preprocess a source file
+layers/eltwise_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i
+.PHONY : layers/eltwise_layer.cpp.i
+
+layers/eltwise_layer.s: layers/eltwise_layer.cpp.s
+.PHONY : layers/eltwise_layer.s
+
+# target to generate assembly for a file
+layers/eltwise_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s
+.PHONY : layers/eltwise_layer.cpp.s
+
+layers/euclidean_loss_layer.o: layers/euclidean_loss_layer.cpp.o
+.PHONY : layers/euclidean_loss_layer.o
+
+# target to build an object file
+layers/euclidean_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o
+.PHONY : layers/euclidean_loss_layer.cpp.o
+
+layers/euclidean_loss_layer.i: layers/euclidean_loss_layer.cpp.i
+.PHONY : layers/euclidean_loss_layer.i
+
+# target to preprocess a source file
+layers/euclidean_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i
+.PHONY : layers/euclidean_loss_layer.cpp.i
+
+layers/euclidean_loss_layer.s: layers/euclidean_loss_layer.cpp.s
+.PHONY : layers/euclidean_loss_layer.s
+
+# target to generate assembly for a file
+layers/euclidean_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s
+.PHONY : layers/euclidean_loss_layer.cpp.s
+
+layers/exp_layer.o: layers/exp_layer.cpp.o
+.PHONY : layers/exp_layer.o
+
+# target to build an object file
+layers/exp_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o
+.PHONY : layers/exp_layer.cpp.o
+
+layers/exp_layer.i: layers/exp_layer.cpp.i
+.PHONY : layers/exp_layer.i
+
+# target to preprocess a source file
+layers/exp_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.i
+.PHONY : layers/exp_layer.cpp.i
+
+layers/exp_layer.s: layers/exp_layer.cpp.s
+.PHONY : layers/exp_layer.s
+
+# target to generate assembly for a file
+layers/exp_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.s
+.PHONY : layers/exp_layer.cpp.s
+
+layers/filter_layer.o: layers/filter_layer.cpp.o
+.PHONY : layers/filter_layer.o
+
+# target to build an object file
+layers/filter_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o
+.PHONY : layers/filter_layer.cpp.o
+
+layers/filter_layer.i: layers/filter_layer.cpp.i
+.PHONY : layers/filter_layer.i
+
+# target to preprocess a source file
+layers/filter_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.i
+.PHONY : layers/filter_layer.cpp.i
+
+layers/filter_layer.s: layers/filter_layer.cpp.s
+.PHONY : layers/filter_layer.s
+
+# target to generate assembly for a file
+layers/filter_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.s
+.PHONY : layers/filter_layer.cpp.s
+
+layers/flatten_layer.o: layers/flatten_layer.cpp.o
+.PHONY : layers/flatten_layer.o
+
+# target to build an object file
+layers/flatten_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o
+.PHONY : layers/flatten_layer.cpp.o
+
+layers/flatten_layer.i: layers/flatten_layer.cpp.i
+.PHONY : layers/flatten_layer.i
+
+# target to preprocess a source file
+layers/flatten_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i
+.PHONY : layers/flatten_layer.cpp.i
+
+layers/flatten_layer.s: layers/flatten_layer.cpp.s
+.PHONY : layers/flatten_layer.s
+
+# target to generate assembly for a file
+layers/flatten_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s
+.PHONY : layers/flatten_layer.cpp.s
+
+layers/hdf5_data_layer.o: layers/hdf5_data_layer.cpp.o
+.PHONY : layers/hdf5_data_layer.o
+
+# target to build an object file
+layers/hdf5_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o
+.PHONY : layers/hdf5_data_layer.cpp.o
+
+layers/hdf5_data_layer.i: layers/hdf5_data_layer.cpp.i
+.PHONY : layers/hdf5_data_layer.i
+
+# target to preprocess a source file
+layers/hdf5_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i
+.PHONY : layers/hdf5_data_layer.cpp.i
+
+layers/hdf5_data_layer.s: layers/hdf5_data_layer.cpp.s
+.PHONY : layers/hdf5_data_layer.s
+
+# target to generate assembly for a file
+layers/hdf5_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s
+.PHONY : layers/hdf5_data_layer.cpp.s
+
+layers/hdf5_output_layer.o: layers/hdf5_output_layer.cpp.o
+.PHONY : layers/hdf5_output_layer.o
+
+# target to build an object file
+layers/hdf5_output_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o
+.PHONY : layers/hdf5_output_layer.cpp.o
+
+layers/hdf5_output_layer.i: layers/hdf5_output_layer.cpp.i
+.PHONY : layers/hdf5_output_layer.i
+
+# target to preprocess a source file
+layers/hdf5_output_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i
+.PHONY : layers/hdf5_output_layer.cpp.i
+
+layers/hdf5_output_layer.s: layers/hdf5_output_layer.cpp.s
+.PHONY : layers/hdf5_output_layer.s
+
+# target to generate assembly for a file
+layers/hdf5_output_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s
+.PHONY : layers/hdf5_output_layer.cpp.s
+
+layers/hinge_loss_layer.o: layers/hinge_loss_layer.cpp.o
+.PHONY : layers/hinge_loss_layer.o
+
+# target to build an object file
+layers/hinge_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o
+.PHONY : layers/hinge_loss_layer.cpp.o
+
+layers/hinge_loss_layer.i: layers/hinge_loss_layer.cpp.i
+.PHONY : layers/hinge_loss_layer.i
+
+# target to preprocess a source file
+layers/hinge_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i
+.PHONY : layers/hinge_loss_layer.cpp.i
+
+layers/hinge_loss_layer.s: layers/hinge_loss_layer.cpp.s
+.PHONY : layers/hinge_loss_layer.s
+
+# target to generate assembly for a file
+layers/hinge_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s
+.PHONY : layers/hinge_loss_layer.cpp.s
+
+layers/im2col_layer.o: layers/im2col_layer.cpp.o
+.PHONY : layers/im2col_layer.o
+
+# target to build an object file
+layers/im2col_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o
+.PHONY : layers/im2col_layer.cpp.o
+
+layers/im2col_layer.i: layers/im2col_layer.cpp.i
+.PHONY : layers/im2col_layer.i
+
+# target to preprocess a source file
+layers/im2col_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i
+.PHONY : layers/im2col_layer.cpp.i
+
+layers/im2col_layer.s: layers/im2col_layer.cpp.s
+.PHONY : layers/im2col_layer.s
+
+# target to generate assembly for a file
+layers/im2col_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s
+.PHONY : layers/im2col_layer.cpp.s
+
+layers/image_data_layer.o: layers/image_data_layer.cpp.o
+.PHONY : layers/image_data_layer.o
+
+# target to build an object file
+layers/image_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o
+.PHONY : layers/image_data_layer.cpp.o
+
+layers/image_data_layer.i: layers/image_data_layer.cpp.i
+.PHONY : layers/image_data_layer.i
+
+# target to preprocess a source file
+layers/image_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i
+.PHONY : layers/image_data_layer.cpp.i
+
+layers/image_data_layer.s: layers/image_data_layer.cpp.s
+.PHONY : layers/image_data_layer.s
+
+# target to generate assembly for a file
+layers/image_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s
+.PHONY : layers/image_data_layer.cpp.s
+
+layers/infogain_loss_layer.o: layers/infogain_loss_layer.cpp.o
+.PHONY : layers/infogain_loss_layer.o
+
+# target to build an object file
+layers/infogain_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o
+.PHONY : layers/infogain_loss_layer.cpp.o
+
+layers/infogain_loss_layer.i: layers/infogain_loss_layer.cpp.i
+.PHONY : layers/infogain_loss_layer.i
+
+# target to preprocess a source file
+layers/infogain_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i
+.PHONY : layers/infogain_loss_layer.cpp.i
+
+layers/infogain_loss_layer.s: layers/infogain_loss_layer.cpp.s
+.PHONY : layers/infogain_loss_layer.s
+
+# target to generate assembly for a file
+layers/infogain_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s
+.PHONY : layers/infogain_loss_layer.cpp.s
+
+layers/inner_product_layer.o: layers/inner_product_layer.cpp.o
+.PHONY : layers/inner_product_layer.o
+
+# target to build an object file
+layers/inner_product_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o
+.PHONY : layers/inner_product_layer.cpp.o
+
+layers/inner_product_layer.i: layers/inner_product_layer.cpp.i
+.PHONY : layers/inner_product_layer.i
+
+# target to preprocess a source file
+layers/inner_product_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i
+.PHONY : layers/inner_product_layer.cpp.i
+
+layers/inner_product_layer.s: layers/inner_product_layer.cpp.s
+.PHONY : layers/inner_product_layer.s
+
+# target to generate assembly for a file
+layers/inner_product_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s
+.PHONY : layers/inner_product_layer.cpp.s
+
+layers/log_layer.o: layers/log_layer.cpp.o
+.PHONY : layers/log_layer.o
+
+# target to build an object file
+layers/log_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o
+.PHONY : layers/log_layer.cpp.o
+
+layers/log_layer.i: layers/log_layer.cpp.i
+.PHONY : layers/log_layer.i
+
+# target to preprocess a source file
+layers/log_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.i
+.PHONY : layers/log_layer.cpp.i
+
+layers/log_layer.s: layers/log_layer.cpp.s
+.PHONY : layers/log_layer.s
+
+# target to generate assembly for a file
+layers/log_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.s
+.PHONY : layers/log_layer.cpp.s
+
+layers/loss_layer.o: layers/loss_layer.cpp.o
+.PHONY : layers/loss_layer.o
+
+# target to build an object file
+layers/loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o
+.PHONY : layers/loss_layer.cpp.o
+
+layers/loss_layer.i: layers/loss_layer.cpp.i
+.PHONY : layers/loss_layer.i
+
+# target to preprocess a source file
+layers/loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.i
+.PHONY : layers/loss_layer.cpp.i
+
+layers/loss_layer.s: layers/loss_layer.cpp.s
+.PHONY : layers/loss_layer.s
+
+# target to generate assembly for a file
+layers/loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.s
+.PHONY : layers/loss_layer.cpp.s
+
+layers/lrn_layer.o: layers/lrn_layer.cpp.o
+.PHONY : layers/lrn_layer.o
+
+# target to build an object file
+layers/lrn_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o
+.PHONY : layers/lrn_layer.cpp.o
+
+layers/lrn_layer.i: layers/lrn_layer.cpp.i
+.PHONY : layers/lrn_layer.i
+
+# target to preprocess a source file
+layers/lrn_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i
+.PHONY : layers/lrn_layer.cpp.i
+
+layers/lrn_layer.s: layers/lrn_layer.cpp.s
+.PHONY : layers/lrn_layer.s
+
+# target to generate assembly for a file
+layers/lrn_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s
+.PHONY : layers/lrn_layer.cpp.s
+
+layers/memory_data_layer.o: layers/memory_data_layer.cpp.o
+.PHONY : layers/memory_data_layer.o
+
+# target to build an object file
+layers/memory_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o
+.PHONY : layers/memory_data_layer.cpp.o
+
+layers/memory_data_layer.i: layers/memory_data_layer.cpp.i
+.PHONY : layers/memory_data_layer.i
+
+# target to preprocess a source file
+layers/memory_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i
+.PHONY : layers/memory_data_layer.cpp.i
+
+layers/memory_data_layer.s: layers/memory_data_layer.cpp.s
+.PHONY : layers/memory_data_layer.s
+
+# target to generate assembly for a file
+layers/memory_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s
+.PHONY : layers/memory_data_layer.cpp.s
+
+layers/multinomial_logistic_loss_layer.o: layers/multinomial_logistic_loss_layer.cpp.o
+.PHONY : layers/multinomial_logistic_loss_layer.o
+
+# target to build an object file
+layers/multinomial_logistic_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o
+.PHONY : layers/multinomial_logistic_loss_layer.cpp.o
+
+layers/multinomial_logistic_loss_layer.i: layers/multinomial_logistic_loss_layer.cpp.i
+.PHONY : layers/multinomial_logistic_loss_layer.i
+
+# target to preprocess a source file
+layers/multinomial_logistic_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i
+.PHONY : layers/multinomial_logistic_loss_layer.cpp.i
+
+layers/multinomial_logistic_loss_layer.s: layers/multinomial_logistic_loss_layer.cpp.s
+.PHONY : layers/multinomial_logistic_loss_layer.s
+
+# target to generate assembly for a file
+layers/multinomial_logistic_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s
+.PHONY : layers/multinomial_logistic_loss_layer.cpp.s
+
+layers/mvn_layer.o: layers/mvn_layer.cpp.o
+.PHONY : layers/mvn_layer.o
+
+# target to build an object file
+layers/mvn_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o
+.PHONY : layers/mvn_layer.cpp.o
+
+layers/mvn_layer.i: layers/mvn_layer.cpp.i
+.PHONY : layers/mvn_layer.i
+
+# target to preprocess a source file
+layers/mvn_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i
+.PHONY : layers/mvn_layer.cpp.i
+
+layers/mvn_layer.s: layers/mvn_layer.cpp.s
+.PHONY : layers/mvn_layer.s
+
+# target to generate assembly for a file
+layers/mvn_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s
+.PHONY : layers/mvn_layer.cpp.s
+
+layers/neuron_layer.o: layers/neuron_layer.cpp.o
+.PHONY : layers/neuron_layer.o
+
+# target to build an object file
+layers/neuron_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o
+.PHONY : layers/neuron_layer.cpp.o
+
+layers/neuron_layer.i: layers/neuron_layer.cpp.i
+.PHONY : layers/neuron_layer.i
+
+# target to preprocess a source file
+layers/neuron_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i
+.PHONY : layers/neuron_layer.cpp.i
+
+layers/neuron_layer.s: layers/neuron_layer.cpp.s
+.PHONY : layers/neuron_layer.s
+
+# target to generate assembly for a file
+layers/neuron_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s
+.PHONY : layers/neuron_layer.cpp.s
+
+layers/pooling_layer.o: layers/pooling_layer.cpp.o
+.PHONY : layers/pooling_layer.o
+
+# target to build an object file
+layers/pooling_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o
+.PHONY : layers/pooling_layer.cpp.o
+
+layers/pooling_layer.i: layers/pooling_layer.cpp.i
+.PHONY : layers/pooling_layer.i
+
+# target to preprocess a source file
+layers/pooling_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i
+.PHONY : layers/pooling_layer.cpp.i
+
+layers/pooling_layer.s: layers/pooling_layer.cpp.s
+.PHONY : layers/pooling_layer.s
+
+# target to generate assembly for a file
+layers/pooling_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s
+.PHONY : layers/pooling_layer.cpp.s
+
+layers/power_layer.o: layers/power_layer.cpp.o
+.PHONY : layers/power_layer.o
+
+# target to build an object file
+layers/power_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o
+.PHONY : layers/power_layer.cpp.o
+
+layers/power_layer.i: layers/power_layer.cpp.i
+.PHONY : layers/power_layer.i
+
+# target to preprocess a source file
+layers/power_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.i
+.PHONY : layers/power_layer.cpp.i
+
+layers/power_layer.s: layers/power_layer.cpp.s
+.PHONY : layers/power_layer.s
+
+# target to generate assembly for a file
+layers/power_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.s
+.PHONY : layers/power_layer.cpp.s
+
+layers/prelu_layer.o: layers/prelu_layer.cpp.o
+.PHONY : layers/prelu_layer.o
+
+# target to build an object file
+layers/prelu_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o
+.PHONY : layers/prelu_layer.cpp.o
+
+layers/prelu_layer.i: layers/prelu_layer.cpp.i
+.PHONY : layers/prelu_layer.i
+
+# target to preprocess a source file
+layers/prelu_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i
+.PHONY : layers/prelu_layer.cpp.i
+
+layers/prelu_layer.s: layers/prelu_layer.cpp.s
+.PHONY : layers/prelu_layer.s
+
+# target to generate assembly for a file
+layers/prelu_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s
+.PHONY : layers/prelu_layer.cpp.s
+
+layers/reduction_layer.o: layers/reduction_layer.cpp.o
+.PHONY : layers/reduction_layer.o
+
+# target to build an object file
+layers/reduction_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o
+.PHONY : layers/reduction_layer.cpp.o
+
+layers/reduction_layer.i: layers/reduction_layer.cpp.i
+.PHONY : layers/reduction_layer.i
+
+# target to preprocess a source file
+layers/reduction_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i
+.PHONY : layers/reduction_layer.cpp.i
+
+layers/reduction_layer.s: layers/reduction_layer.cpp.s
+.PHONY : layers/reduction_layer.s
+
+# target to generate assembly for a file
+layers/reduction_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s
+.PHONY : layers/reduction_layer.cpp.s
+
+layers/relu_layer.o: layers/relu_layer.cpp.o
+.PHONY : layers/relu_layer.o
+
+# target to build an object file
+layers/relu_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o
+.PHONY : layers/relu_layer.cpp.o
+
+layers/relu_layer.i: layers/relu_layer.cpp.i
+.PHONY : layers/relu_layer.i
+
+# target to preprocess a source file
+layers/relu_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.i
+.PHONY : layers/relu_layer.cpp.i
+
+layers/relu_layer.s: layers/relu_layer.cpp.s
+.PHONY : layers/relu_layer.s
+
+# target to generate assembly for a file
+layers/relu_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.s
+.PHONY : layers/relu_layer.cpp.s
+
+layers/reshape_layer.o: layers/reshape_layer.cpp.o
+.PHONY : layers/reshape_layer.o
+
+# target to build an object file
+layers/reshape_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o
+.PHONY : layers/reshape_layer.cpp.o
+
+layers/reshape_layer.i: layers/reshape_layer.cpp.i
+.PHONY : layers/reshape_layer.i
+
+# target to preprocess a source file
+layers/reshape_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i
+.PHONY : layers/reshape_layer.cpp.i
+
+layers/reshape_layer.s: layers/reshape_layer.cpp.s
+.PHONY : layers/reshape_layer.s
+
+# target to generate assembly for a file
+layers/reshape_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s
+.PHONY : layers/reshape_layer.cpp.s
+
+layers/sigmoid_cross_entropy_loss_layer.o: layers/sigmoid_cross_entropy_loss_layer.cpp.o
+.PHONY : layers/sigmoid_cross_entropy_loss_layer.o
+
+# target to build an object file
+layers/sigmoid_cross_entropy_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o
+.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.o
+
+layers/sigmoid_cross_entropy_loss_layer.i: layers/sigmoid_cross_entropy_loss_layer.cpp.i
+.PHONY : layers/sigmoid_cross_entropy_loss_layer.i
+
+# target to preprocess a source file
+layers/sigmoid_cross_entropy_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i
+.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.i
+
+layers/sigmoid_cross_entropy_loss_layer.s: layers/sigmoid_cross_entropy_loss_layer.cpp.s
+.PHONY : layers/sigmoid_cross_entropy_loss_layer.s
+
+# target to generate assembly for a file
+layers/sigmoid_cross_entropy_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s
+.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.s
+
+layers/sigmoid_layer.o: layers/sigmoid_layer.cpp.o
+.PHONY : layers/sigmoid_layer.o
+
+# target to build an object file
+layers/sigmoid_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o
+.PHONY : layers/sigmoid_layer.cpp.o
+
+layers/sigmoid_layer.i: layers/sigmoid_layer.cpp.i
+.PHONY : layers/sigmoid_layer.i
+
+# target to preprocess a source file
+layers/sigmoid_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i
+.PHONY : layers/sigmoid_layer.cpp.i
+
+layers/sigmoid_layer.s: layers/sigmoid_layer.cpp.s
+.PHONY : layers/sigmoid_layer.s
+
+# target to generate assembly for a file
+layers/sigmoid_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s
+.PHONY : layers/sigmoid_layer.cpp.s
+
+layers/silence_layer.o: layers/silence_layer.cpp.o
+.PHONY : layers/silence_layer.o
+
+# target to build an object file
+layers/silence_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o
+.PHONY : layers/silence_layer.cpp.o
+
+layers/silence_layer.i: layers/silence_layer.cpp.i
+.PHONY : layers/silence_layer.i
+
+# target to preprocess a source file
+layers/silence_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.i
+.PHONY : layers/silence_layer.cpp.i
+
+layers/silence_layer.s: layers/silence_layer.cpp.s
+.PHONY : layers/silence_layer.s
+
+# target to generate assembly for a file
+layers/silence_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.s
+.PHONY : layers/silence_layer.cpp.s
+
+layers/slice_layer.o: layers/slice_layer.cpp.o
+.PHONY : layers/slice_layer.o
+
+# target to build an object file
+layers/slice_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o
+.PHONY : layers/slice_layer.cpp.o
+
+layers/slice_layer.i: layers/slice_layer.cpp.i
+.PHONY : layers/slice_layer.i
+
+# target to preprocess a source file
+layers/slice_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.i
+.PHONY : layers/slice_layer.cpp.i
+
+layers/slice_layer.s: layers/slice_layer.cpp.s
+.PHONY : layers/slice_layer.s
+
+# target to generate assembly for a file
+layers/slice_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.s
+.PHONY : layers/slice_layer.cpp.s
+
+layers/softmax_layer.o: layers/softmax_layer.cpp.o
+.PHONY : layers/softmax_layer.o
+
+# target to build an object file
+layers/softmax_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o
+.PHONY : layers/softmax_layer.cpp.o
+
+layers/softmax_layer.i: layers/softmax_layer.cpp.i
+.PHONY : layers/softmax_layer.i
+
+# target to preprocess a source file
+layers/softmax_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i
+.PHONY : layers/softmax_layer.cpp.i
+
+layers/softmax_layer.s: layers/softmax_layer.cpp.s
+.PHONY : layers/softmax_layer.s
+
+# target to generate assembly for a file
+layers/softmax_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s
+.PHONY : layers/softmax_layer.cpp.s
+
+layers/softmax_loss_layer.o: layers/softmax_loss_layer.cpp.o
+.PHONY : layers/softmax_loss_layer.o
+
+# target to build an object file
+layers/softmax_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o
+.PHONY : layers/softmax_loss_layer.cpp.o
+
+layers/softmax_loss_layer.i: layers/softmax_loss_layer.cpp.i
+.PHONY : layers/softmax_loss_layer.i
+
+# target to preprocess a source file
+layers/softmax_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i
+.PHONY : layers/softmax_loss_layer.cpp.i
+
+layers/softmax_loss_layer.s: layers/softmax_loss_layer.cpp.s
+.PHONY : layers/softmax_loss_layer.s
+
+# target to generate assembly for a file
+layers/softmax_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s
+.PHONY : layers/softmax_loss_layer.cpp.s
+
+layers/split_layer.o: layers/split_layer.cpp.o
+.PHONY : layers/split_layer.o
+
+# target to build an object file
+layers/split_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o
+.PHONY : layers/split_layer.cpp.o
+
+layers/split_layer.i: layers/split_layer.cpp.i
+.PHONY : layers/split_layer.i
+
+# target to preprocess a source file
+layers/split_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.i
+.PHONY : layers/split_layer.cpp.i
+
+layers/split_layer.s: layers/split_layer.cpp.s
+.PHONY : layers/split_layer.s
+
+# target to generate assembly for a file
+layers/split_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.s
+.PHONY : layers/split_layer.cpp.s
+
+layers/spp_layer.o: layers/spp_layer.cpp.o
+.PHONY : layers/spp_layer.o
+
+# target to build an object file
+layers/spp_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o
+.PHONY : layers/spp_layer.cpp.o
+
+layers/spp_layer.i: layers/spp_layer.cpp.i
+.PHONY : layers/spp_layer.i
+
+# target to preprocess a source file
+layers/spp_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.i
+.PHONY : layers/spp_layer.cpp.i
+
+layers/spp_layer.s: layers/spp_layer.cpp.s
+.PHONY : layers/spp_layer.s
+
+# target to generate assembly for a file
+layers/spp_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.s
+.PHONY : layers/spp_layer.cpp.s
+
+layers/tanh_layer.o: layers/tanh_layer.cpp.o
+.PHONY : layers/tanh_layer.o
+
+# target to build an object file
+layers/tanh_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o
+.PHONY : layers/tanh_layer.cpp.o
+
+layers/tanh_layer.i: layers/tanh_layer.cpp.i
+.PHONY : layers/tanh_layer.i
+
+# target to preprocess a source file
+layers/tanh_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i
+.PHONY : layers/tanh_layer.cpp.i
+
+layers/tanh_layer.s: layers/tanh_layer.cpp.s
+.PHONY : layers/tanh_layer.s
+
+# target to generate assembly for a file
+layers/tanh_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s
+.PHONY : layers/tanh_layer.cpp.s
+
+layers/threshold_layer.o: layers/threshold_layer.cpp.o
+.PHONY : layers/threshold_layer.o
+
+# target to build an object file
+layers/threshold_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o
+.PHONY : layers/threshold_layer.cpp.o
+
+layers/threshold_layer.i: layers/threshold_layer.cpp.i
+.PHONY : layers/threshold_layer.i
+
+# target to preprocess a source file
+layers/threshold_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i
+.PHONY : layers/threshold_layer.cpp.i
+
+layers/threshold_layer.s: layers/threshold_layer.cpp.s
+.PHONY : layers/threshold_layer.s
+
+# target to generate assembly for a file
+layers/threshold_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s
+.PHONY : layers/threshold_layer.cpp.s
+
+layers/window_data_layer.o: layers/window_data_layer.cpp.o
+.PHONY : layers/window_data_layer.o
+
+# target to build an object file
+layers/window_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o
+.PHONY : layers/window_data_layer.cpp.o
+
+layers/window_data_layer.i: layers/window_data_layer.cpp.i
+.PHONY : layers/window_data_layer.i
+
+# target to preprocess a source file
+layers/window_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i
+.PHONY : layers/window_data_layer.cpp.i
+
+layers/window_data_layer.s: layers/window_data_layer.cpp.s
+.PHONY : layers/window_data_layer.s
+
+# target to generate assembly for a file
+layers/window_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s
+.PHONY : layers/window_data_layer.cpp.s
+
+net.o: net.cpp.o
+.PHONY : net.o
+
+# target to build an object file
+net.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.o
+.PHONY : net.cpp.o
+
+net.i: net.cpp.i
+.PHONY : net.i
+
+# target to preprocess a source file
+net.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.i
+.PHONY : net.cpp.i
+
+net.s: net.cpp.s
+.PHONY : net.s
+
+# target to generate assembly for a file
+net.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.s
+.PHONY : net.cpp.s
+
+solver.o: solver.cpp.o
+.PHONY : solver.o
+
+# target to build an object file
+solver.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.o
+.PHONY : solver.cpp.o
+
+solver.i: solver.cpp.i
+.PHONY : solver.i
+
+# target to preprocess a source file
+solver.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.i
+.PHONY : solver.cpp.i
+
+solver.s: solver.cpp.s
+.PHONY : solver.s
+
+# target to generate assembly for a file
+solver.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.s
+.PHONY : solver.cpp.s
+
+syncedmem.o: syncedmem.cpp.o
+.PHONY : syncedmem.o
+
+# target to build an object file
+syncedmem.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o
+.PHONY : syncedmem.cpp.o
+
+syncedmem.i: syncedmem.cpp.i
+.PHONY : syncedmem.i
+
+# target to preprocess a source file
+syncedmem.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.i
+.PHONY : syncedmem.cpp.i
+
+syncedmem.s: syncedmem.cpp.s
+.PHONY : syncedmem.s
+
+# target to generate assembly for a file
+syncedmem.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.s
+.PHONY : syncedmem.cpp.s
+
+util/benchmark.o: util/benchmark.cpp.o
+.PHONY : util/benchmark.o
+
+# target to build an object file
+util/benchmark.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o
+.PHONY : util/benchmark.cpp.o
+
+util/benchmark.i: util/benchmark.cpp.i
+.PHONY : util/benchmark.i
+
+# target to preprocess a source file
+util/benchmark.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.i
+.PHONY : util/benchmark.cpp.i
+
+util/benchmark.s: util/benchmark.cpp.s
+.PHONY : util/benchmark.s
+
+# target to generate assembly for a file
+util/benchmark.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.s
+.PHONY : util/benchmark.cpp.s
+
+util/cudnn.o: util/cudnn.cpp.o
+.PHONY : util/cudnn.o
+
+# target to build an object file
+util/cudnn.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o
+.PHONY : util/cudnn.cpp.o
+
+util/cudnn.i: util/cudnn.cpp.i
+.PHONY : util/cudnn.i
+
+# target to preprocess a source file
+util/cudnn.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.i
+.PHONY : util/cudnn.cpp.i
+
+util/cudnn.s: util/cudnn.cpp.s
+.PHONY : util/cudnn.s
+
+# target to generate assembly for a file
+util/cudnn.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.s
+.PHONY : util/cudnn.cpp.s
+
+util/db.o: util/db.cpp.o
+.PHONY : util/db.o
+
+# target to build an object file
+util/db.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o
+.PHONY : util/db.cpp.o
+
+util/db.i: util/db.cpp.i
+.PHONY : util/db.i
+
+# target to preprocess a source file
+util/db.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.i
+.PHONY : util/db.cpp.i
+
+util/db.s: util/db.cpp.s
+.PHONY : util/db.s
+
+# target to generate assembly for a file
+util/db.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.s
+.PHONY : util/db.cpp.s
+
+util/db_leveldb.o: util/db_leveldb.cpp.o
+.PHONY : util/db_leveldb.o
+
+# target to build an object file
+util/db_leveldb.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o
+.PHONY : util/db_leveldb.cpp.o
+
+util/db_leveldb.i: util/db_leveldb.cpp.i
+.PHONY : util/db_leveldb.i
+
+# target to preprocess a source file
+util/db_leveldb.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.i
+.PHONY : util/db_leveldb.cpp.i
+
+util/db_leveldb.s: util/db_leveldb.cpp.s
+.PHONY : util/db_leveldb.s
+
+# target to generate assembly for a file
+util/db_leveldb.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.s
+.PHONY : util/db_leveldb.cpp.s
+
+util/db_lmdb.o: util/db_lmdb.cpp.o
+.PHONY : util/db_lmdb.o
+
+# target to build an object file
+util/db_lmdb.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o
+.PHONY : util/db_lmdb.cpp.o
+
+util/db_lmdb.i: util/db_lmdb.cpp.i
+.PHONY : util/db_lmdb.i
+
+# target to preprocess a source file
+util/db_lmdb.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.i
+.PHONY : util/db_lmdb.cpp.i
+
+util/db_lmdb.s: util/db_lmdb.cpp.s
+.PHONY : util/db_lmdb.s
+
+# target to generate assembly for a file
+util/db_lmdb.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.s
+.PHONY : util/db_lmdb.cpp.s
+
+util/im2col.o: util/im2col.cpp.o
+.PHONY : util/im2col.o
+
+# target to build an object file
+util/im2col.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o
+.PHONY : util/im2col.cpp.o
+
+util/im2col.i: util/im2col.cpp.i
+.PHONY : util/im2col.i
+
+# target to preprocess a source file
+util/im2col.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.i
+.PHONY : util/im2col.cpp.i
+
+util/im2col.s: util/im2col.cpp.s
+.PHONY : util/im2col.s
+
+# target to generate assembly for a file
+util/im2col.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.s
+.PHONY : util/im2col.cpp.s
+
+util/insert_splits.o: util/insert_splits.cpp.o
+.PHONY : util/insert_splits.o
+
+# target to build an object file
+util/insert_splits.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o
+.PHONY : util/insert_splits.cpp.o
+
+util/insert_splits.i: util/insert_splits.cpp.i
+.PHONY : util/insert_splits.i
+
+# target to preprocess a source file
+util/insert_splits.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.i
+.PHONY : util/insert_splits.cpp.i
+
+util/insert_splits.s: util/insert_splits.cpp.s
+.PHONY : util/insert_splits.s
+
+# target to generate assembly for a file
+util/insert_splits.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.s
+.PHONY : util/insert_splits.cpp.s
+
+util/io.o: util/io.cpp.o
+.PHONY : util/io.o
+
+# target to build an object file
+util/io.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o
+.PHONY : util/io.cpp.o
+
+util/io.i: util/io.cpp.i
+.PHONY : util/io.i
+
+# target to preprocess a source file
+util/io.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.i
+.PHONY : util/io.cpp.i
+
+util/io.s: util/io.cpp.s
+.PHONY : util/io.s
+
+# target to generate assembly for a file
+util/io.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.s
+.PHONY : util/io.cpp.s
+
+util/math_functions.o: util/math_functions.cpp.o
+.PHONY : util/math_functions.o
+
+# target to build an object file
+util/math_functions.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o
+.PHONY : util/math_functions.cpp.o
+
+util/math_functions.i: util/math_functions.cpp.i
+.PHONY : util/math_functions.i
+
+# target to preprocess a source file
+util/math_functions.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.i
+.PHONY : util/math_functions.cpp.i
+
+util/math_functions.s: util/math_functions.cpp.s
+.PHONY : util/math_functions.s
+
+# target to generate assembly for a file
+util/math_functions.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.s
+.PHONY : util/math_functions.cpp.s
+
+util/ocl_util.o: util/ocl_util.cpp.o
+.PHONY : util/ocl_util.o
+
+# target to build an object file
+util/ocl_util.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o
+.PHONY : util/ocl_util.cpp.o
+
+util/ocl_util.i: util/ocl_util.cpp.i
+.PHONY : util/ocl_util.i
+
+# target to preprocess a source file
+util/ocl_util.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.i
+.PHONY : util/ocl_util.cpp.i
+
+util/ocl_util.s: util/ocl_util.cpp.s
+.PHONY : util/ocl_util.s
+
+# target to generate assembly for a file
+util/ocl_util.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.s
+.PHONY : util/ocl_util.cpp.s
+
+util/ocl_wrapper.o: util/ocl_wrapper.cpp.o
+.PHONY : util/ocl_wrapper.o
+
+# target to build an object file
+util/ocl_wrapper.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o
+.PHONY : util/ocl_wrapper.cpp.o
+
+util/ocl_wrapper.i: util/ocl_wrapper.cpp.i
+.PHONY : util/ocl_wrapper.i
+
+# target to preprocess a source file
+util/ocl_wrapper.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i
+.PHONY : util/ocl_wrapper.cpp.i
+
+util/ocl_wrapper.s: util/ocl_wrapper.cpp.s
+.PHONY : util/ocl_wrapper.s
+
+# target to generate assembly for a file
+util/ocl_wrapper.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s
+.PHONY : util/ocl_wrapper.cpp.s
+
+util/upgrade_proto.o: util/upgrade_proto.cpp.o
+.PHONY : util/upgrade_proto.o
+
+# target to build an object file
+util/upgrade_proto.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o
+.PHONY : util/upgrade_proto.cpp.o
+
+util/upgrade_proto.i: util/upgrade_proto.cpp.i
+.PHONY : util/upgrade_proto.i
+
+# target to preprocess a source file
+util/upgrade_proto.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i
+.PHONY : util/upgrade_proto.cpp.i
+
+util/upgrade_proto.s: util/upgrade_proto.cpp.s
+.PHONY : util/upgrade_proto.s
+
+# target to generate assembly for a file
+util/upgrade_proto.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s
+.PHONY : util/upgrade_proto.cpp.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... caffe"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... proto"
+	@echo "... rebuild_cache"
+	@echo "... __/__/include/caffe/proto/caffe.pb.o"
+	@echo "... __/__/include/caffe/proto/caffe.pb.i"
+	@echo "... __/__/include/caffe/proto/caffe.pb.s"
+	@echo "... blob.o"
+	@echo "... blob.i"
+	@echo "... blob.s"
+	@echo "... common.o"
+	@echo "... common.i"
+	@echo "... common.s"
+	@echo "... data_transformer.o"
+	@echo "... data_transformer.i"
+	@echo "... data_transformer.s"
+	@echo "... device.o"
+	@echo "... device.i"
+	@echo "... device.s"
+	@echo "... internal_thread.o"
+	@echo "... internal_thread.i"
+	@echo "... internal_thread.s"
+	@echo "... layer_factory.o"
+	@echo "... layer_factory.i"
+	@echo "... layer_factory.s"
+	@echo "... layers/absval_layer.o"
+	@echo "... layers/absval_layer.i"
+	@echo "... layers/absval_layer.s"
+	@echo "... layers/accuracy_layer.o"
+	@echo "... layers/accuracy_layer.i"
+	@echo "... layers/accuracy_layer.s"
+	@echo "... layers/argmax_layer.o"
+	@echo "... layers/argmax_layer.i"
+	@echo "... layers/argmax_layer.s"
+	@echo "... layers/base_conv_layer.o"
+	@echo "... layers/base_conv_layer.i"
+	@echo "... layers/base_conv_layer.s"
+	@echo "... layers/base_data_layer.o"
+	@echo "... layers/base_data_layer.i"
+	@echo "... layers/base_data_layer.s"
+	@echo "... layers/bnll_layer.o"
+	@echo "... layers/bnll_layer.i"
+	@echo "... layers/bnll_layer.s"
+	@echo "... layers/concat_layer.o"
+	@echo "... layers/concat_layer.i"
+	@echo "... layers/concat_layer.s"
+	@echo "... layers/contrastive_loss_layer.o"
+	@echo "... layers/contrastive_loss_layer.i"
+	@echo "... layers/contrastive_loss_layer.s"
+	@echo "... layers/conv_layer.o"
+	@echo "... layers/conv_layer.i"
+	@echo "... layers/conv_layer.s"
+	@echo "... layers/cudnn_conv_layer.o"
+	@echo "... layers/cudnn_conv_layer.i"
+	@echo "... layers/cudnn_conv_layer.s"
+	@echo "... layers/cudnn_pooling_layer.o"
+	@echo "... layers/cudnn_pooling_layer.i"
+	@echo "... layers/cudnn_pooling_layer.s"
+	@echo "... layers/cudnn_relu_layer.o"
+	@echo "... layers/cudnn_relu_layer.i"
+	@echo "... layers/cudnn_relu_layer.s"
+	@echo "... layers/cudnn_sigmoid_layer.o"
+	@echo "... layers/cudnn_sigmoid_layer.i"
+	@echo "... layers/cudnn_sigmoid_layer.s"
+	@echo "... layers/cudnn_softmax_layer.o"
+	@echo "... layers/cudnn_softmax_layer.i"
+	@echo "... layers/cudnn_softmax_layer.s"
+	@echo "... layers/cudnn_tanh_layer.o"
+	@echo "... layers/cudnn_tanh_layer.i"
+	@echo "... layers/cudnn_tanh_layer.s"
+	@echo "... layers/data_layer.o"
+	@echo "... layers/data_layer.i"
+	@echo "... layers/data_layer.s"
+	@echo "... layers/deconv_layer.o"
+	@echo "... layers/deconv_layer.i"
+	@echo "... layers/deconv_layer.s"
+	@echo "... layers/dropout_layer.o"
+	@echo "... layers/dropout_layer.i"
+	@echo "... layers/dropout_layer.s"
+	@echo "... layers/dummy_data_layer.o"
+	@echo "... layers/dummy_data_layer.i"
+	@echo "... layers/dummy_data_layer.s"
+	@echo "... layers/eltwise_layer.o"
+	@echo "... layers/eltwise_layer.i"
+	@echo "... layers/eltwise_layer.s"
+	@echo "... layers/euclidean_loss_layer.o"
+	@echo "... layers/euclidean_loss_layer.i"
+	@echo "... layers/euclidean_loss_layer.s"
+	@echo "... layers/exp_layer.o"
+	@echo "... layers/exp_layer.i"
+	@echo "... layers/exp_layer.s"
+	@echo "... layers/filter_layer.o"
+	@echo "... layers/filter_layer.i"
+	@echo "... layers/filter_layer.s"
+	@echo "... layers/flatten_layer.o"
+	@echo "... layers/flatten_layer.i"
+	@echo "... layers/flatten_layer.s"
+	@echo "... layers/hdf5_data_layer.o"
+	@echo "... layers/hdf5_data_layer.i"
+	@echo "... layers/hdf5_data_layer.s"
+	@echo "... layers/hdf5_output_layer.o"
+	@echo "... layers/hdf5_output_layer.i"
+	@echo "... layers/hdf5_output_layer.s"
+	@echo "... layers/hinge_loss_layer.o"
+	@echo "... layers/hinge_loss_layer.i"
+	@echo "... layers/hinge_loss_layer.s"
+	@echo "... layers/im2col_layer.o"
+	@echo "... layers/im2col_layer.i"
+	@echo "... layers/im2col_layer.s"
+	@echo "... layers/image_data_layer.o"
+	@echo "... layers/image_data_layer.i"
+	@echo "... layers/image_data_layer.s"
+	@echo "... layers/infogain_loss_layer.o"
+	@echo "... layers/infogain_loss_layer.i"
+	@echo "... layers/infogain_loss_layer.s"
+	@echo "... layers/inner_product_layer.o"
+	@echo "... layers/inner_product_layer.i"
+	@echo "... layers/inner_product_layer.s"
+	@echo "... layers/log_layer.o"
+	@echo "... layers/log_layer.i"
+	@echo "... layers/log_layer.s"
+	@echo "... layers/loss_layer.o"
+	@echo "... layers/loss_layer.i"
+	@echo "... layers/loss_layer.s"
+	@echo "... layers/lrn_layer.o"
+	@echo "... layers/lrn_layer.i"
+	@echo "... layers/lrn_layer.s"
+	@echo "... layers/memory_data_layer.o"
+	@echo "... layers/memory_data_layer.i"
+	@echo "... layers/memory_data_layer.s"
+	@echo "... layers/multinomial_logistic_loss_layer.o"
+	@echo "... layers/multinomial_logistic_loss_layer.i"
+	@echo "... layers/multinomial_logistic_loss_layer.s"
+	@echo "... layers/mvn_layer.o"
+	@echo "... layers/mvn_layer.i"
+	@echo "... layers/mvn_layer.s"
+	@echo "... layers/neuron_layer.o"
+	@echo "... layers/neuron_layer.i"
+	@echo "... layers/neuron_layer.s"
+	@echo "... layers/pooling_layer.o"
+	@echo "... layers/pooling_layer.i"
+	@echo "... layers/pooling_layer.s"
+	@echo "... layers/power_layer.o"
+	@echo "... layers/power_layer.i"
+	@echo "... layers/power_layer.s"
+	@echo "... layers/prelu_layer.o"
+	@echo "... layers/prelu_layer.i"
+	@echo "... layers/prelu_layer.s"
+	@echo "... layers/reduction_layer.o"
+	@echo "... layers/reduction_layer.i"
+	@echo "... layers/reduction_layer.s"
+	@echo "... layers/relu_layer.o"
+	@echo "... layers/relu_layer.i"
+	@echo "... layers/relu_layer.s"
+	@echo "... layers/reshape_layer.o"
+	@echo "... layers/reshape_layer.i"
+	@echo "... layers/reshape_layer.s"
+	@echo "... layers/sigmoid_cross_entropy_loss_layer.o"
+	@echo "... layers/sigmoid_cross_entropy_loss_layer.i"
+	@echo "... layers/sigmoid_cross_entropy_loss_layer.s"
+	@echo "... layers/sigmoid_layer.o"
+	@echo "... layers/sigmoid_layer.i"
+	@echo "... layers/sigmoid_layer.s"
+	@echo "... layers/silence_layer.o"
+	@echo "... layers/silence_layer.i"
+	@echo "... layers/silence_layer.s"
+	@echo "... layers/slice_layer.o"
+	@echo "... layers/slice_layer.i"
+	@echo "... layers/slice_layer.s"
+	@echo "... layers/softmax_layer.o"
+	@echo "... layers/softmax_layer.i"
+	@echo "... layers/softmax_layer.s"
+	@echo "... layers/softmax_loss_layer.o"
+	@echo "... layers/softmax_loss_layer.i"
+	@echo "... layers/softmax_loss_layer.s"
+	@echo "... layers/split_layer.o"
+	@echo "... layers/split_layer.i"
+	@echo "... layers/split_layer.s"
+	@echo "... layers/spp_layer.o"
+	@echo "... layers/spp_layer.i"
+	@echo "... layers/spp_layer.s"
+	@echo "... layers/tanh_layer.o"
+	@echo "... layers/tanh_layer.i"
+	@echo "... layers/tanh_layer.s"
+	@echo "... layers/threshold_layer.o"
+	@echo "... layers/threshold_layer.i"
+	@echo "... layers/threshold_layer.s"
+	@echo "... layers/window_data_layer.o"
+	@echo "... layers/window_data_layer.i"
+	@echo "... layers/window_data_layer.s"
+	@echo "... net.o"
+	@echo "... net.i"
+	@echo "... net.s"
+	@echo "... solver.o"
+	@echo "... solver.i"
+	@echo "... solver.s"
+	@echo "... syncedmem.o"
+	@echo "... syncedmem.i"
+	@echo "... syncedmem.s"
+	@echo "... util/benchmark.o"
+	@echo "... util/benchmark.i"
+	@echo "... util/benchmark.s"
+	@echo "... util/cudnn.o"
+	@echo "... util/cudnn.i"
+	@echo "... util/cudnn.s"
+	@echo "... util/db.o"
+	@echo "... util/db.i"
+	@echo "... util/db.s"
+	@echo "... util/db_leveldb.o"
+	@echo "... util/db_leveldb.i"
+	@echo "... util/db_leveldb.s"
+	@echo "... util/db_lmdb.o"
+	@echo "... util/db_lmdb.i"
+	@echo "... util/db_lmdb.s"
+	@echo "... util/im2col.o"
+	@echo "... util/im2col.i"
+	@echo "... util/im2col.s"
+	@echo "... util/insert_splits.o"
+	@echo "... util/insert_splits.i"
+	@echo "... util/insert_splits.s"
+	@echo "... util/io.o"
+	@echo "... util/io.i"
+	@echo "... util/io.s"
+	@echo "... util/math_functions.o"
+	@echo "... util/math_functions.i"
+	@echo "... util/math_functions.s"
+	@echo "... util/ocl_util.o"
+	@echo "... util/ocl_util.i"
+	@echo "... util/ocl_util.s"
+	@echo "... util/ocl_wrapper.o"
+	@echo "... util/ocl_wrapper.i"
+	@echo "... util/ocl_wrapper.s"
+	@echo "... util/upgrade_proto.o"
+	@echo "... util/upgrade_proto.i"
+	@echo "... util/upgrade_proto.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/src/caffe/cmake_install.cmake b/src/caffe/cmake_install.cmake
new file mode 100644
index 00000000..f98ef538
--- /dev/null
+++ b/src/caffe/cmake_install.cmake
@@ -0,0 +1,79 @@
+# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe
+
+# Set the install prefix
+IF(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install")
+ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX)
+STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  IF(BUILD_TYPE)
+    STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  ELSE(BUILD_TYPE)
+    SET(CMAKE_INSTALL_CONFIG_NAME "Release")
+  ENDIF(BUILD_TYPE)
+  MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+
+# Set the component getting installed.
+IF(NOT CMAKE_INSTALL_COMPONENT)
+  IF(COMPONENT)
+    MESSAGE(STATUS "Install component: \"${COMPONENT}\"")
+    SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  ELSE(COMPONENT)
+    SET(CMAKE_INSTALL_COMPONENT)
+  ENDIF(COMPONENT)
+ENDIF(NOT CMAKE_INSTALL_COMPONENT)
+
+# Install shared libraries without execute permission?
+IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  SET(CMAKE_INSTALL_SO_NO_EXE "1")
+ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+
+IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+  FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include" TYPE DIRECTORY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe")
+ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+
+IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+  FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/caffe/proto" TYPE FILE FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h")
+ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+
+IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+  IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so")
+    FILE(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so"
+         RPATH "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install/lib:/usr/local/cuda/lib64:/usr/local/lib")
+  ENDIF()
+  FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE SHARED_LIBRARY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib/libcaffe.so")
+  IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so")
+    FILE(RPATH_CHANGE
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so"
+         OLD_RPATH "/usr/local/cuda/lib64:/usr/local/lib:::::::::::::::::::::::::::::::::::::::::::::::::::::::::"
+         NEW_RPATH "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install/lib:/usr/local/cuda/lib64:/usr/local/lib")
+    IF(CMAKE_INSTALL_DO_STRIP)
+      EXECUTE_PROCESS(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so")
+    ENDIF(CMAKE_INSTALL_DO_STRIP)
+  ENDIF()
+ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+
+IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+  FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE STATIC_LIBRARY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib/libproto.a")
+ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+
+IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+  FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/python/caffe/proto" TYPE PROGRAM FILES
+    "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe_pb2.py"
+    "/home/yugao/caffe-merge-junli/caffe-yb/caffe/__init__.py"
+    )
+ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+
+IF(NOT CMAKE_INSTALL_LOCAL_ONLY)
+  # Include the install script for each subdirectory.
+  INCLUDE("/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/cmake_install.cmake")
+
+ENDIF(NOT CMAKE_INSTALL_LOCAL_ONLY)
+
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index c4fe1195..5d56493b 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -197,6 +197,7 @@ void Caffe::DeviceQuery() {
       << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
   return;
 */
+  amdDevice.DeviceQuery();
 }
 
 
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 3ce6cefe..7a866c11 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -44,7 +44,8 @@ std::string oclKernelPath="./src/caffe/ocl/";
 
 Device::~Device(){
     //clAmdBlasTeardown(); 
-     free((void*)platformIDs);
+    ReleaseKernels(); 
+    free((void*)platformIDs);
      free(DeviceIDs);
      clReleaseProgram(Program);
      clReleaseCommandQueue(CommandQueue);
@@ -74,7 +75,7 @@ cl_int Device::Init(){
     GetDeviceInfo();
     cl_uint uiNumDevices;
     cl_bool unified_memory = false;
-    switch(Caffe::mode()) {
+/*    switch(Caffe::mode()) {
     case Caffe::GPU:
          //choose_gpu();
       clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
@@ -107,7 +108,8 @@ cl_int Device::Init(){
          OCL_CHECK( clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 1, pDevices, NULL) );
          LOG(INFO) << "picked device type: CPU";
          break;
-    case Caffe::APU:
+*/  
+//  case Caffe::APU:
         clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
         uiNumDevices = numDevices;
         if(0 == uiNumDevices){
@@ -126,10 +128,10 @@ cl_int Device::Init(){
          }
        }
          LOG(INFO) << "picked device type: APU";
-         break;
-    default:
-         LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-    }
+  //       break;
+  //  default:
+  //       LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  //  }
 
     //Create Context
     Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
@@ -315,6 +317,15 @@ cl_kernel Device::GetKernel(std::string kernel_name)
     return Kernels[kernel_name];
 }
 
+void Device::ReleaseKernels()
+{
+    std::map<std::string, cl_kernel>::iterator it;
+    for(it = Kernels.begin(); it != Kernels.end(); it++)
+    {
+        clReleaseKernel(it->second);
+    }
+}
+
 void Device::DisplayPlatformInfo(){
    cl_int err;
    size_t size;
@@ -413,6 +424,26 @@ void Device::GetDeviceInfo(){
     
 }
 
+void Device::DeviceQuery()
+{
+    //Get Platform Infomation
+    DisplayPlatformInfo();
+
+    clGetPlatformIDs(0, NULL, &numPlatforms);
+    cl_platform_id PlatformIDs[numPlatforms];
+    clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
+
+    size_t nameLen;
+    cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen);
+    if(res != CL_SUCCESS){
+        fprintf(stderr, "Err: Failed to Get Platform Info\n", res);
+        return;
+    }
+    platformName[nameLen] = 0;
+
+    GetDeviceInfo();
+}
+
 template <typename T>
 void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str){
     cl_int err;
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 855c00e1..8f7d8f82 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -33,7 +33,7 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     }
   }
 
-  CHECK_BLOB_DATA(top[0],20, "top[0]");
+//  CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -67,9 +67,9 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       }
     }
   }
-  CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
-  CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-  CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff");
+  //CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
+  //CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff");
+  //CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff");
 
 }
 
@@ -80,7 +80,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    Forward_gpu_opt(bottom, top);
   else
    Forward_gpu_org(bottom, top);
- CHECK_BLOB_DATA(top[0],20, "top[0]");
+// CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -160,7 +160,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
   }
 
   // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
-  CHECK_BLOB_DATA(top[0],20, "top[0]");
+ // CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -256,10 +256,10 @@ void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
     }
   }
   
-  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
-  CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
-  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
+//  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
+//  CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
+//  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
+ // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index a3cca01c..22456302 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -152,6 +152,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
        outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
   Dtype loss;
   caffe_gpu_asum(nthreads, loss_data, &loss);
+  printf("loss = %f\n", loss);
   if (normalize_) {
     Dtype count;
     caffe_gpu_asum(nthreads, counts, &count);
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index ad6bdc7e..f5d0e703 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -511,21 +511,17 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
   forward_timer.Start();
 
   for (int i = start; i <= end; ++i) {
-   //double begin_time = GettickCount();
     layer_timer.Start();
-   //printf("Forwarding %s\n",layer_names_[i].c_str());
     Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
     loss += layer_loss;
     if (debug_info_) { ForwardDebugInfo(i); }
     clFinish(amdDevice.CommandQueue);
-    //double end_time = GettickCount();
     layer_timer.Stop();
-    //printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), end_time-begin_time);
     printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds());
   }
 
   forward_timer.Stop();
-  printf("Forward time: %f\n\n", forward_timer.MilliSeconds());
+  printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds());
 
   return loss;
 }
@@ -587,22 +583,23 @@ void Net<Dtype>::BackwardFromTo(int start, int end) {
   CHECK_LT(start, layers_.size());
   
   CPUTimer backward_timer;
+  CPUTimer layer_timer;
   backward_timer.Start();
 
   for (int i = start; i >= end; --i) {
+    layer_timer.Start();
     if (layer_need_backward_[i]) {
-//Yibing add for porting
-      printf("Backwarding %s\n",layer_names_[i].c_str());
       layers_[i]->Backward(
           top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
       if (debug_info_) { BackwardDebugInfo(i); }
-//Yibing add for porting
     clFinish(amdDevice.CommandQueue);
+    layer_timer.Start();
+    printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds());
     }
   }
 
   backward_timer.Stop();
-  printf("Backward time: %f\n\n", backward_timer.MilliSeconds());
+  printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds());
 }
 
 template <typename Dtype>
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
index d94efcba..b6a5a0a1 100644
--- a/src/caffe/ocl/pooling_layer.cl
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -90,8 +90,8 @@ __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const
           }
 
 }
-template __attribute__((mangled_name(AvePoolForwardfloat))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
-template __attribute__((mangled_name(AvePoolForwarddouble))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data);
+template __attribute__((mangled_name(AvePoolForward_float))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
+template __attribute__((mangled_name(AvePoolForward_double))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data);
 
 template <class T>
 __kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index 55026603..9710a343 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -34,6 +34,7 @@ __kernel void OCL_memset(__global T* buffer, const T value, const int size){
 	}
 }
 
+template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size);
 template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
 template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 63c8294c..f4b57a41 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -53,7 +53,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
 
 //#ifndef CPU_ONLY
   //AMD device related initialization
-  amdDevice.Init();
+  //amdDevice.Init();
   ocl_setup();
 //  cl_int err =  clblasSetup();
 //#else
@@ -236,7 +236,9 @@ void Solver<Dtype>::Step(int iters) {
       int idx = (iter_ - start_iter) % average_loss;
       smoothed_loss += (loss - losses[idx]) / average_loss;
       losses[idx] = loss;
+      printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, losses[idx], idx);
     }
+       printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %d \n", smoothed_loss,average_loss, losses.size());
     if (display) {
       LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
       const vector<Blob<Dtype>*>& result = net_->output_blobs();
diff --git a/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake b/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 00000000..7bb0014c
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# Relative path conversion top directories.
+SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
+SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
+
+# Force unix paths in dependencies.
+SET(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake
new file mode 100644
index 00000000..895d9fca
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake
@@ -0,0 +1,296 @@
+#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#
+#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#  for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+
+##########################################################################
+# This file runs the nvcc commands to produce the desired output file along with
+# the dependency file needed by CMake to compute dependencies.  In addition the
+# file checks the output of each command and if the command fails it deletes the
+# output files.
+
+# Input variables
+#
+# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
+#                          ON : Describe each step
+#
+# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
+#                               RelWithDebInfo, but it should match one of the
+#                               entries in CUDA_HOST_FLAGS. This is the build
+#                               configuration used when compiling the code.  If
+#                               blank or unspecified Debug is assumed as this is
+#                               what CMake does.
+#
+# generated_file:STRING=<> File to generate.  This argument must be passed in.
+#
+# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
+#                                                   in if build_cubin is true.
+
+if(NOT generated_file)
+  message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(CMAKE_COMMAND "/usr/bin/cmake") # path
+set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_kernel.cu") # path
+set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.NVCC-depend") # path
+set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.depend") # path
+set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
+set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
+set(build_cubin OFF) # bool
+set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
+# We won't actually use these variables for now, but we need to set this, in
+# order to force this file to be run again if it changes.
+set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//.") # path
+set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o") # path
+set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o.cubin.txt") # path
+
+set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
+set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC;-Xcompiler;-fPIC ;; ) # list
+# Build specific configuration flags
+set(CUDA_NVCC_FLAGS_DEBUG  ; )
+set(CUDA_NVCC_FLAGS_RELEASE  ; )
+set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
+set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
+set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
+set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
+set(format_flag "-c") # string
+
+if(build_cubin AND NOT generated_cubin_file)
+  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
+endif()
+
+# This is the list of host compilation flags.  It C or CXX should already have
+# been chosen by FindCUDA.cmake.
+set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
+set(CMAKE_HOST_FLAGS_DEBUG -g)
+set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
+set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
+set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
+
+# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
+set(nvcc_host_compiler_flags "")
+# If we weren't given a build_configuration, use Debug.
+if(NOT build_configuration)
+  set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration)
+#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
+foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
+  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
+endforeach()
+if (nvcc_host_compiler_flags)
+  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
+endif()
+#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
+# Add the build specific configuration flags
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
+
+# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
+list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
+list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
+    set(CCBIN -ccbin "${CCBIN}")
+  else()
+    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
+  endif()
+endif()
+
+# cuda_execute_process - Executes a command with optional command echo and status message.
+#
+#   status  - Status message to print if verbose is true
+#   command - COMMAND argument from the usual execute_process argument structure
+#   ARGN    - Remaining arguments are the command with arguments
+#
+#   CUDA_result - return value from running the command
+#
+# Make this a macro instead of a function, so that things like RESULT_VARIABLE
+# and other return variables are present after executing the process.
+macro(cuda_execute_process status command)
+  set(_command ${command})
+  if(NOT _command STREQUAL "COMMAND")
+    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+  endif()
+  if(verbose)
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+    # Now we need to build up our command string.  We are accounting for quotes
+    # and spaces, anything else is left up to the user to fix if they want to
+    # copy and paste a runnable command line.
+    set(cuda_execute_process_string)
+    foreach(arg ${ARGN})
+      # If there are quotes, excape them, so they come through.
+      string(REPLACE "\"" "\\\"" arg ${arg})
+      # Args with spaces need quotes around them to get them to be parsed as a single argument.
+      if(arg MATCHES " ")
+        list(APPEND cuda_execute_process_string "\"${arg}\"")
+      else()
+        list(APPEND cuda_execute_process_string ${arg})
+      endif()
+    endforeach()
+    # Echo the command
+    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
+  endif()
+  # Run the command
+  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
+endmacro()
+
+# Delete the target file
+cuda_execute_process(
+  "Removing ${generated_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+  )
+
+# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
+# for dependency generation and hope for the best.
+set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+set(CUDA_VERSION 6.5)
+if(CUDA_VERSION VERSION_LESS "3.0")
+  cmake_policy(PUSH)
+  # CMake policy 0007 NEW states that empty list elements are not
+  # ignored.  I'm just setting it to avoid the warning that's printed.
+  cmake_policy(SET CMP0007 NEW)
+  # Note that this will remove all occurances of -G.
+  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
+  cmake_policy(POP)
+endif()
+
+# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
+# can cause incorrect dependencies when #including files based on this macro which is
+# defined in the generating passes of nvcc invokation.  We will go ahead and manually
+# define this for now until a future version fixes this bug.
+set(CUDACC_DEFINE -D__CUDACC__)
+
+# Generate the dependency file
+cuda_execute_process(
+  "Generating dependency file: ${NVCC_generated_dependency_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  -M
+  ${CUDACC_DEFINE}
+  "${source_file}"
+  -o "${NVCC_generated_dependency_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${depends_CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the cmake readable dependency file to a temp file.  Don't put the
+# quotes just around the filenames for the input_file and output_file variables.
+# CMake will pass the quotes through and not be able to find the file.
+cuda_execute_process(
+  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
+  COMMAND "${CMAKE_COMMAND}"
+  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
+  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
+  -P "${CUDA_make2cmake}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Copy the file if it is different
+cuda_execute_process(
+  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Delete the temporary file
+cuda_execute_process(
+  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
+  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
+  )
+
+if(CUDA_result)
+  message(FATAL_ERROR "Error generating ${generated_file}")
+endif()
+
+# Generate the code
+cuda_execute_process(
+  "Generating ${generated_file}"
+  COMMAND "${CUDA_NVCC_EXECUTABLE}"
+  "${source_file}"
+  ${format_flag} -o "${generated_file}"
+  ${CCBIN}
+  ${nvcc_flags}
+  ${nvcc_host_compiler_flags}
+  ${CUDA_NVCC_FLAGS}
+  -DNVCC
+  ${CUDA_NVCC_INCLUDE_ARGS}
+  )
+
+if(CUDA_result)
+  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
+  cuda_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+  message(FATAL_ERROR "Error generating file ${generated_file}")
+else()
+  if(verbose)
+    message("Generated ${generated_file} successfully.")
+  endif()
+endif()
+
+# Cubin resource report commands.
+if( build_cubin )
+  # Run with -cubin to produce resource usage report.
+  cuda_execute_process(
+    "Generating ${generated_cubin_file}"
+    COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "${source_file}"
+    ${CUDA_NVCC_FLAGS}
+    ${nvcc_flags}
+    ${CCBIN}
+    ${nvcc_host_compiler_flags}
+    -DNVCC
+    -cubin
+    -o "${generated_cubin_file}"
+    ${CUDA_NVCC_INCLUDE_ARGS}
+    )
+
+  # Execute the parser script.
+  cuda_execute_process(
+    "Executing the parser script"
+    COMMAND  "${CMAKE_COMMAND}"
+    -D "input_file:STRING=${generated_cubin_file}"
+    -P "${CUDA_parse_cubin}"
+    )
+
+endif()
diff --git a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend
new file mode 100644
index 00000000..8e3a0be1
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend
@@ -0,0 +1 @@
+#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/test/CMakeFiles/progress.marks b/src/caffe/test/CMakeFiles/progress.marks
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+0
diff --git a/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake b/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake
new file mode 100644
index 00000000..f660fadf
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake
@@ -0,0 +1,27 @@
+# The set of languages for which implicit dependencies are needed:
+SET(CMAKE_DEPENDS_LANGUAGES
+  )
+# The set of files for implicit dependencies of each language:
+
+# Preprocessor definitions for this target.
+SET(CMAKE_TARGET_DEFINITIONS
+  "GTEST_USE_OWN_TR1_TUPLE"
+  )
+
+# Targets to which this target links.
+SET(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# The include file search paths:
+SET(CMAKE_C_TARGET_INCLUDE_PATH
+  "src"
+  "/usr/local/include"
+  "include"
+  "/usr/local/cuda/include"
+  "/usr/local/include/opencv"
+  "/usr/include/atlas"
+  "."
+  )
+SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/caffe/test/CMakeFiles/runtest.dir/build.make b/src/caffe/test/CMakeFiles/runtest.dir/build.make
new file mode 100644
index 00000000..7ccc5279
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/runtest.dir/build.make
@@ -0,0 +1,69 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# Utility rule file for runtest.
+
+# Include the progress variables for this target.
+include src/caffe/test/CMakeFiles/runtest.dir/progress.make
+
+src/caffe/test/CMakeFiles/runtest:
+	/home/yugao/caffe-merge-junli/caffe-yb/caffe/test/test.testbin --gtest_shuffle
+
+runtest: src/caffe/test/CMakeFiles/runtest
+runtest: src/caffe/test/CMakeFiles/runtest.dir/build.make
+.PHONY : runtest
+
+# Rule to build all files generated by this target.
+src/caffe/test/CMakeFiles/runtest.dir/build: runtest
+.PHONY : src/caffe/test/CMakeFiles/runtest.dir/build
+
+src/caffe/test/CMakeFiles/runtest.dir/clean:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -P CMakeFiles/runtest.dir/cmake_clean.cmake
+.PHONY : src/caffe/test/CMakeFiles/runtest.dir/clean
+
+src/caffe/test/CMakeFiles/runtest.dir/depend:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : src/caffe/test/CMakeFiles/runtest.dir/depend
+
diff --git a/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake b/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake
new file mode 100644
index 00000000..ed560e60
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake
@@ -0,0 +1,8 @@
+FILE(REMOVE_RECURSE
+  "CMakeFiles/runtest"
+)
+
+# Per-language clean rules from dependency scanning.
+FOREACH(lang)
+  INCLUDE(CMakeFiles/runtest.dir/cmake_clean_${lang}.cmake OPTIONAL)
+ENDFOREACH(lang)
diff --git a/src/caffe/test/CMakeFiles/runtest.dir/progress.make b/src/caffe/test/CMakeFiles/runtest.dir/progress.make
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/runtest.dir/progress.make
@@ -0,0 +1 @@
+
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake b/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake
new file mode 100644
index 00000000..d4748b21
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake
@@ -0,0 +1,92 @@
+# The set of languages for which implicit dependencies are needed:
+SET(CMAKE_DEPENDS_LANGUAGES
+  "CXX"
+  )
+# The set of files for implicit dependencies of each language:
+SET(CMAKE_DEPENDS_CHECK_CXX
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o"
+  )
+SET(CMAKE_CXX_COMPILER_ID "GNU")
+
+# Preprocessor definitions for this target.
+SET(CMAKE_TARGET_DEFINITIONS
+  "GTEST_USE_OWN_TR1_TUPLE"
+  )
+
+# Targets to which this target links.
+SET(CMAKE_TARGET_LINKED_INFO_FILES
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake"
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake"
+  )
+
+# The include file search paths:
+SET(CMAKE_C_TARGET_INCLUDE_PATH
+  "src"
+  "/usr/local/include"
+  "include"
+  "/usr/local/cuda/include"
+  "/usr/local/include/opencv"
+  "/usr/include/atlas"
+  "."
+  )
+SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/build.make b/src/caffe/test/CMakeFiles/test.testbin.dir/build.make
new file mode 100644
index 00000000..c67def36
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/test.testbin.dir/build.make
@@ -0,0 +1,1623 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# Include any dependencies generated for this target.
+include src/caffe/test/CMakeFiles/test.testbin.dir/depend.make
+
+# Include the progress variables for this target.
+include src/caffe/test/CMakeFiles/test.testbin.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+
+src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend
+src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake
+src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/test_im2col_kernel.cu
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//.
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.cmake
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o: src/caffe/test/test_spp_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp > CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp -o CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o: src/caffe/test/test_filler.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_3)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_filler.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_filler.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp > CMakeFiles/test.testbin.dir/test_filler.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_filler.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp -o CMakeFiles/test.testbin.dir/test_filler.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o: src/caffe/test/test_im2col_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_4)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp > CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp -o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o: src/caffe/test/test_common.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_5)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_common.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_common.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp > CMakeFiles/test.testbin.dir/test_common.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_common.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp -o CMakeFiles/test.testbin.dir/test_common.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o: src/caffe/test/test_infogain_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_6)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o: src/caffe/test/test_math_functions.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_7)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_math_functions.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_math_functions.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp > CMakeFiles/test.testbin.dir/test_math_functions.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_math_functions.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp -o CMakeFiles/test.testbin.dir/test_math_functions.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o: src/caffe/test/test_euclidean_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_8)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o: src/caffe/test/test_split_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_9)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_split_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_split_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp > CMakeFiles/test.testbin.dir/test_split_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_split_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp -o CMakeFiles/test.testbin.dir/test_split_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o: src/caffe/test/test_reshape_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_10)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp > CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp -o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o: src/caffe/test/test_random_number_generator.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_11)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp > CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp -o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o: src/caffe/test/test_lrn_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_12)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp > CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp -o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o: src/caffe/test/test_gradient_based_solver.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_13)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp > CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp -o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o: src/caffe/test/test_upgrade_proto.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_14)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp > CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp -o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o: src/caffe/test/test_io.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_15)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_io.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_io.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp > CMakeFiles/test.testbin.dir/test_io.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_io.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp -o CMakeFiles/test.testbin.dir/test_io.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o: src/caffe/test/test_accuracy_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_16)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp > CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp -o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o: src/caffe/test/test_caffe_main.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_17)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp > CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp -o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o: src/caffe/test/test_net.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_18)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_net.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_net.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp > CMakeFiles/test.testbin.dir/test_net.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_net.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp -o CMakeFiles/test.testbin.dir/test_net.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o: src/caffe/test/test_filter_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_19)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp > CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp -o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o: src/caffe/test/test_power_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_20)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_power_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_power_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp > CMakeFiles/test.testbin.dir/test_power_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_power_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp -o CMakeFiles/test.testbin.dir/test_power_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o: src/caffe/test/test_softmax_with_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_21)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o: src/caffe/test/test_argmax_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_22)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp > CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp -o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o: src/caffe/test/test_solver.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_23)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_solver.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp > CMakeFiles/test.testbin.dir/test_solver.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_solver.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp -o CMakeFiles/test.testbin.dir/test_solver.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o: src/caffe/test/test_blob.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_24)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_blob.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_blob.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp > CMakeFiles/test.testbin.dir/test_blob.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_blob.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp -o CMakeFiles/test.testbin.dir/test_blob.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o: src/caffe/test/test_benchmark.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_25)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_benchmark.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_benchmark.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp > CMakeFiles/test.testbin.dir/test_benchmark.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_benchmark.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp -o CMakeFiles/test.testbin.dir/test_benchmark.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_26)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o: src/caffe/test/test_multinomial_logistic_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_27)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o: src/caffe/test/test_util_blas.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_28)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_util_blas.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_util_blas.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp > CMakeFiles/test.testbin.dir/test_util_blas.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_util_blas.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp -o CMakeFiles/test.testbin.dir/test_util_blas.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o: src/caffe/test/test_internal_thread.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_29)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp > CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp -o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o: src/caffe/test/test_reduction_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_30)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp > CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp -o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o: src/caffe/test/test_contrastive_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_31)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o: src/caffe/test/test_eltwise_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_32)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp > CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp -o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o: src/caffe/test/test_maxpool_dropout_layers.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_33)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp > CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp -o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o: src/caffe/test/test_threshold_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_34)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp > CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp -o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o: src/caffe/test/test_pooling_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_35)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp > CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp -o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o: src/caffe/test/test_softmax_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_36)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp > CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp -o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o: src/caffe/test/test_inner_product_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_37)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp > CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp -o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o: src/caffe/test/test_flatten_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_38)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp > CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp -o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o: src/caffe/test/test_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_39)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp > CMakeFiles/test.testbin.dir/test_data_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_data_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o: src/caffe/test/test_syncedmem.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_40)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp > CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp -o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o: src/caffe/test/test_hdf5data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_41)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp > CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp -o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o: src/caffe/test/test_deconvolution_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_42)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp > CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp -o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o: src/caffe/test/test_neuron_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_43)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp > CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp -o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o: src/caffe/test/test_concat_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_44)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp > CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp -o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o: src/caffe/test/test_protobuf.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_45)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_protobuf.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_protobuf.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp > CMakeFiles/test.testbin.dir/test_protobuf.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_protobuf.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp -o CMakeFiles/test.testbin.dir/test_protobuf.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o: src/caffe/test/test_hdf5_output_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_46)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp > CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp -o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o: src/caffe/test/test_memory_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_47)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp > CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o: src/caffe/test/test_tanh_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_48)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp > CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp -o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o: src/caffe/test/test_stochastic_pooling.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_49)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp > CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp -o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o: src/caffe/test/test_dummy_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_50)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp > CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o: src/caffe/test/test_layer_factory.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_51)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp > CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp -o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o: src/caffe/test/test_db.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_52)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_db.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_db.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp > CMakeFiles/test.testbin.dir/test_db.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_db.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp -o CMakeFiles/test.testbin.dir/test_db.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o: src/caffe/test/test_mvn_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_53)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp > CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp -o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o: src/caffe/test/test_convolution_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_54)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp > CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp -o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o: src/caffe/test/test_slice_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_55)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp > CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp -o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o: src/caffe/test/test_hinge_loss_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_56)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o: src/caffe/test/test_image_data_layer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_57)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp > CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o: src/caffe/test/test_platform.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_58)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_platform.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_platform.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp > CMakeFiles/test.testbin.dir/test_platform.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_platform.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp -o CMakeFiles/test.testbin.dir/test_platform.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o: src/caffe/test/test_data_transformer.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_59)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp > CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp -o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires:
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires
+	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides.build
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides
+
+src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o
+
+# Object files for target test.testbin
+test_testbin_OBJECTS = \
+"CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_filler.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_common.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_io.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_net.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_solver.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_blob.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_db.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_platform.cpp.o" \
+"CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o"
+
+# External object files for target test.testbin
+test_testbin_EXTERNAL_OBJECTS = \
+"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o"
+
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o
+test/test.testbin: src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/build.make
+test/test.testbin: lib/libgtest.a
+test/test.testbin: lib/libcaffe.so
+test/test.testbin: lib/libproto.a
+test/test.testbin: /usr/local/lib/libboost_system.so
+test/test.testbin: /usr/local/lib/libboost_thread.so
+test/test.testbin: /usr/lib/x86_64-linux-gnu/libpthread.so
+test/test.testbin: /usr/local/lib/libglog.so
+test/test.testbin: /usr/local/lib/libgflags.a
+test/test.testbin: /usr/lib/x86_64-linux-gnu/libprotobuf.so
+test/test.testbin: /usr/local/lib/libglog.so
+test/test.testbin: /usr/local/lib/libgflags.a
+test/test.testbin: /usr/lib/x86_64-linux-gnu/libprotobuf.so
+test/test.testbin: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so
+test/test.testbin: /usr/lib/x86_64-linux-gnu/libhdf5.so
+test/test.testbin: /usr/local/lib/liblmdb.so
+test/test.testbin: /usr/lib/x86_64-linux-gnu/libleveldb.so
+test/test.testbin: /usr/lib/libsnappy.so
+test/test.testbin: /usr/local/cuda/lib64/libcudart.so
+test/test.testbin: /usr/local/cuda/lib64/libcurand.so
+test/test.testbin: /usr/local/cuda/lib64/libcublas.so
+test/test.testbin: /usr/local/lib/libopencv_highgui.so.2.4.10
+test/test.testbin: /usr/local/lib/libopencv_imgproc.so.2.4.10
+test/test.testbin: /usr/local/lib/libopencv_core.so.2.4.10
+test/test.testbin: /usr/lib/liblapack_atlas.so
+test/test.testbin: /usr/lib/libcblas.so
+test/test.testbin: /usr/lib/libatlas.so
+test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX executable ../../../test/test.testbin"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/test.testbin.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+src/caffe/test/CMakeFiles/test.testbin.dir/build: test/test.testbin
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/build
+
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires
+src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/requires
+
+src/caffe/test/CMakeFiles/test.testbin.dir/clean:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -P CMakeFiles/test.testbin.dir/cmake_clean.cmake
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/clean
+
+src/caffe/test/CMakeFiles/test.testbin.dir/depend: src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/depend
+
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake b/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake
new file mode 100644
index 00000000..3270b673
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake
@@ -0,0 +1,68 @@
+FILE(REMOVE_RECURSE
+  "CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o"
+  "CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_filler.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_common.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_math_functions.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_split_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_io.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_net.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_power_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_solver.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_blob.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_benchmark.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_util_blas.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_data_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_protobuf.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_db.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_platform.cpp.o"
+  "CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o"
+  "../../../test/test.testbin.pdb"
+  "../../../test/test.testbin"
+)
+
+# Per-language clean rules from dependency scanning.
+FOREACH(lang CXX)
+  INCLUDE(CMakeFiles/test.testbin.dir/cmake_clean_${lang}.cmake OPTIONAL)
+ENDFOREACH(lang)
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make b/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make
new file mode 100644
index 00000000..e3607644
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for test.testbin.
+# This may be replaced when dependencies are built.
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make b/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
new file mode 100644
index 00000000..8b4ef992
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
@@ -0,0 +1,8 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# compile CXX with /usr/bin/c++
+CXX_FLAGS =  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe   
+
+CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE
+
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt b/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt
new file mode 100644
index 00000000..35426fa4
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt
@@ -0,0 +1 @@
+/usr/bin/c++    -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG    CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o CMakeFiles/test.testbin.dir/test_filler.cpp.o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o CMakeFiles/test.testbin.dir/test_common.cpp.o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_math_functions.cpp.o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_split_layer.cpp.o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o CMakeFiles/test.testbin.dir/test_io.cpp.o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o CMakeFiles/test.testbin.dir/test_net.cpp.o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o CMakeFiles/test.testbin.dir/test_power_layer.cpp.o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o CMakeFiles/test.testbin.dir/test_solver.cpp.o CMakeFiles/test.testbin.dir/test_blob.cpp.o CMakeFiles/test.testbin.dir/test_benchmark.cpp.o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_util_blas.cpp.o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o CMakeFiles/test.testbin.dir/test_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o CMakeFiles/test.testbin.dir/test_protobuf.cpp.o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o CMakeFiles/test.testbin.dir/test_db.cpp.o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_platform.cpp.o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o  -o ../../../test/test.testbin  -L/usr/local/cuda/lib64  -L/usr/local/lib -rdynamic ../../../lib/libgtest.a -Wl,--whole-archive ../../../lib/libcaffe.so -Wl,--no-whole-archive ../../../lib/libproto.a /usr/local/lib/libboost_system.so /usr/local/lib/libboost_thread.so -lpthread -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so /usr/local/lib/libopencv_highgui.so.2.4.10 /usr/local/lib/libopencv_imgproc.so.2.4.10 /usr/local/lib/libopencv_core.so.2.4.10 -llapack_atlas -lcblas -latlas -Wl,-rpath,/usr/local/cuda/lib64:/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib:/usr/local/lib 
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make b/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make
new file mode 100644
index 00000000..9de70a55
--- /dev/null
+++ b/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make
@@ -0,0 +1,60 @@
+CMAKE_PROGRESS_1 = 
+CMAKE_PROGRESS_2 = 69
+CMAKE_PROGRESS_3 = 
+CMAKE_PROGRESS_4 = 70
+CMAKE_PROGRESS_5 = 
+CMAKE_PROGRESS_6 = 71
+CMAKE_PROGRESS_7 = 
+CMAKE_PROGRESS_8 = 72
+CMAKE_PROGRESS_9 = 
+CMAKE_PROGRESS_10 = 73
+CMAKE_PROGRESS_11 = 
+CMAKE_PROGRESS_12 = 74
+CMAKE_PROGRESS_13 = 
+CMAKE_PROGRESS_14 = 75
+CMAKE_PROGRESS_15 = 
+CMAKE_PROGRESS_16 = 76
+CMAKE_PROGRESS_17 = 
+CMAKE_PROGRESS_18 = 77
+CMAKE_PROGRESS_19 = 
+CMAKE_PROGRESS_20 = 78
+CMAKE_PROGRESS_21 = 
+CMAKE_PROGRESS_22 = 79
+CMAKE_PROGRESS_23 = 
+CMAKE_PROGRESS_24 = 80
+CMAKE_PROGRESS_25 = 
+CMAKE_PROGRESS_26 = 81
+CMAKE_PROGRESS_27 = 
+CMAKE_PROGRESS_28 = 82
+CMAKE_PROGRESS_29 = 
+CMAKE_PROGRESS_30 = 83
+CMAKE_PROGRESS_31 = 
+CMAKE_PROGRESS_32 = 84
+CMAKE_PROGRESS_33 = 
+CMAKE_PROGRESS_34 = 85
+CMAKE_PROGRESS_35 = 
+CMAKE_PROGRESS_36 = 86
+CMAKE_PROGRESS_37 = 
+CMAKE_PROGRESS_38 = 87
+CMAKE_PROGRESS_39 = 
+CMAKE_PROGRESS_40 = 88
+CMAKE_PROGRESS_41 = 
+CMAKE_PROGRESS_42 = 89
+CMAKE_PROGRESS_43 = 
+CMAKE_PROGRESS_44 = 90
+CMAKE_PROGRESS_45 = 
+CMAKE_PROGRESS_46 = 91
+CMAKE_PROGRESS_47 = 
+CMAKE_PROGRESS_48 = 92
+CMAKE_PROGRESS_49 = 
+CMAKE_PROGRESS_50 = 93
+CMAKE_PROGRESS_51 = 
+CMAKE_PROGRESS_52 = 94
+CMAKE_PROGRESS_53 = 
+CMAKE_PROGRESS_54 = 95
+CMAKE_PROGRESS_55 = 
+CMAKE_PROGRESS_56 = 96
+CMAKE_PROGRESS_57 = 
+CMAKE_PROGRESS_58 = 97
+CMAKE_PROGRESS_59 = 
+
diff --git a/src/caffe/test/Makefile b/src/caffe/test/Makefile
new file mode 100644
index 00000000..c9e785c7
--- /dev/null
+++ b/src/caffe/test/Makefile
@@ -0,0 +1,1766 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: install/local
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: install/strip
+.PHONY : install/strip/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/progress.marks
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+src/caffe/test/CMakeFiles/runtest.dir/rule:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/CMakeFiles/runtest.dir/rule
+.PHONY : src/caffe/test/CMakeFiles/runtest.dir/rule
+
+# Convenience name for target.
+runtest: src/caffe/test/CMakeFiles/runtest.dir/rule
+.PHONY : runtest
+
+# fast build rule for target.
+runtest/fast:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/runtest.dir/build.make src/caffe/test/CMakeFiles/runtest.dir/build
+.PHONY : runtest/fast
+
+# Convenience name for target.
+src/caffe/test/CMakeFiles/test.testbin.dir/rule:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/CMakeFiles/test.testbin.dir/rule
+.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/rule
+
+# Convenience name for target.
+test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/rule
+.PHONY : test.testbin
+
+# fast build rule for target.
+test.testbin/fast:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/build
+.PHONY : test.testbin/fast
+
+test_accuracy_layer.o: test_accuracy_layer.cpp.o
+.PHONY : test_accuracy_layer.o
+
+# target to build an object file
+test_accuracy_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o
+.PHONY : test_accuracy_layer.cpp.o
+
+test_accuracy_layer.i: test_accuracy_layer.cpp.i
+.PHONY : test_accuracy_layer.i
+
+# target to preprocess a source file
+test_accuracy_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i
+.PHONY : test_accuracy_layer.cpp.i
+
+test_accuracy_layer.s: test_accuracy_layer.cpp.s
+.PHONY : test_accuracy_layer.s
+
+# target to generate assembly for a file
+test_accuracy_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s
+.PHONY : test_accuracy_layer.cpp.s
+
+test_argmax_layer.o: test_argmax_layer.cpp.o
+.PHONY : test_argmax_layer.o
+
+# target to build an object file
+test_argmax_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o
+.PHONY : test_argmax_layer.cpp.o
+
+test_argmax_layer.i: test_argmax_layer.cpp.i
+.PHONY : test_argmax_layer.i
+
+# target to preprocess a source file
+test_argmax_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i
+.PHONY : test_argmax_layer.cpp.i
+
+test_argmax_layer.s: test_argmax_layer.cpp.s
+.PHONY : test_argmax_layer.s
+
+# target to generate assembly for a file
+test_argmax_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s
+.PHONY : test_argmax_layer.cpp.s
+
+test_benchmark.o: test_benchmark.cpp.o
+.PHONY : test_benchmark.o
+
+# target to build an object file
+test_benchmark.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o
+.PHONY : test_benchmark.cpp.o
+
+test_benchmark.i: test_benchmark.cpp.i
+.PHONY : test_benchmark.i
+
+# target to preprocess a source file
+test_benchmark.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.i
+.PHONY : test_benchmark.cpp.i
+
+test_benchmark.s: test_benchmark.cpp.s
+.PHONY : test_benchmark.s
+
+# target to generate assembly for a file
+test_benchmark.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.s
+.PHONY : test_benchmark.cpp.s
+
+test_blob.o: test_blob.cpp.o
+.PHONY : test_blob.o
+
+# target to build an object file
+test_blob.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o
+.PHONY : test_blob.cpp.o
+
+test_blob.i: test_blob.cpp.i
+.PHONY : test_blob.i
+
+# target to preprocess a source file
+test_blob.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.i
+.PHONY : test_blob.cpp.i
+
+test_blob.s: test_blob.cpp.s
+.PHONY : test_blob.s
+
+# target to generate assembly for a file
+test_blob.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.s
+.PHONY : test_blob.cpp.s
+
+test_caffe_main.o: test_caffe_main.cpp.o
+.PHONY : test_caffe_main.o
+
+# target to build an object file
+test_caffe_main.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o
+.PHONY : test_caffe_main.cpp.o
+
+test_caffe_main.i: test_caffe_main.cpp.i
+.PHONY : test_caffe_main.i
+
+# target to preprocess a source file
+test_caffe_main.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i
+.PHONY : test_caffe_main.cpp.i
+
+test_caffe_main.s: test_caffe_main.cpp.s
+.PHONY : test_caffe_main.s
+
+# target to generate assembly for a file
+test_caffe_main.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s
+.PHONY : test_caffe_main.cpp.s
+
+test_common.o: test_common.cpp.o
+.PHONY : test_common.o
+
+# target to build an object file
+test_common.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o
+.PHONY : test_common.cpp.o
+
+test_common.i: test_common.cpp.i
+.PHONY : test_common.i
+
+# target to preprocess a source file
+test_common.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.i
+.PHONY : test_common.cpp.i
+
+test_common.s: test_common.cpp.s
+.PHONY : test_common.s
+
+# target to generate assembly for a file
+test_common.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.s
+.PHONY : test_common.cpp.s
+
+test_concat_layer.o: test_concat_layer.cpp.o
+.PHONY : test_concat_layer.o
+
+# target to build an object file
+test_concat_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o
+.PHONY : test_concat_layer.cpp.o
+
+test_concat_layer.i: test_concat_layer.cpp.i
+.PHONY : test_concat_layer.i
+
+# target to preprocess a source file
+test_concat_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i
+.PHONY : test_concat_layer.cpp.i
+
+test_concat_layer.s: test_concat_layer.cpp.s
+.PHONY : test_concat_layer.s
+
+# target to generate assembly for a file
+test_concat_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s
+.PHONY : test_concat_layer.cpp.s
+
+test_contrastive_loss_layer.o: test_contrastive_loss_layer.cpp.o
+.PHONY : test_contrastive_loss_layer.o
+
+# target to build an object file
+test_contrastive_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o
+.PHONY : test_contrastive_loss_layer.cpp.o
+
+test_contrastive_loss_layer.i: test_contrastive_loss_layer.cpp.i
+.PHONY : test_contrastive_loss_layer.i
+
+# target to preprocess a source file
+test_contrastive_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i
+.PHONY : test_contrastive_loss_layer.cpp.i
+
+test_contrastive_loss_layer.s: test_contrastive_loss_layer.cpp.s
+.PHONY : test_contrastive_loss_layer.s
+
+# target to generate assembly for a file
+test_contrastive_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s
+.PHONY : test_contrastive_loss_layer.cpp.s
+
+test_convolution_layer.o: test_convolution_layer.cpp.o
+.PHONY : test_convolution_layer.o
+
+# target to build an object file
+test_convolution_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o
+.PHONY : test_convolution_layer.cpp.o
+
+test_convolution_layer.i: test_convolution_layer.cpp.i
+.PHONY : test_convolution_layer.i
+
+# target to preprocess a source file
+test_convolution_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i
+.PHONY : test_convolution_layer.cpp.i
+
+test_convolution_layer.s: test_convolution_layer.cpp.s
+.PHONY : test_convolution_layer.s
+
+# target to generate assembly for a file
+test_convolution_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s
+.PHONY : test_convolution_layer.cpp.s
+
+test_data_layer.o: test_data_layer.cpp.o
+.PHONY : test_data_layer.o
+
+# target to build an object file
+test_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o
+.PHONY : test_data_layer.cpp.o
+
+test_data_layer.i: test_data_layer.cpp.i
+.PHONY : test_data_layer.i
+
+# target to preprocess a source file
+test_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.i
+.PHONY : test_data_layer.cpp.i
+
+test_data_layer.s: test_data_layer.cpp.s
+.PHONY : test_data_layer.s
+
+# target to generate assembly for a file
+test_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.s
+.PHONY : test_data_layer.cpp.s
+
+test_data_transformer.o: test_data_transformer.cpp.o
+.PHONY : test_data_transformer.o
+
+# target to build an object file
+test_data_transformer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o
+.PHONY : test_data_transformer.cpp.o
+
+test_data_transformer.i: test_data_transformer.cpp.i
+.PHONY : test_data_transformer.i
+
+# target to preprocess a source file
+test_data_transformer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i
+.PHONY : test_data_transformer.cpp.i
+
+test_data_transformer.s: test_data_transformer.cpp.s
+.PHONY : test_data_transformer.s
+
+# target to generate assembly for a file
+test_data_transformer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s
+.PHONY : test_data_transformer.cpp.s
+
+test_db.o: test_db.cpp.o
+.PHONY : test_db.o
+
+# target to build an object file
+test_db.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o
+.PHONY : test_db.cpp.o
+
+test_db.i: test_db.cpp.i
+.PHONY : test_db.i
+
+# target to preprocess a source file
+test_db.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.i
+.PHONY : test_db.cpp.i
+
+test_db.s: test_db.cpp.s
+.PHONY : test_db.s
+
+# target to generate assembly for a file
+test_db.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.s
+.PHONY : test_db.cpp.s
+
+test_deconvolution_layer.o: test_deconvolution_layer.cpp.o
+.PHONY : test_deconvolution_layer.o
+
+# target to build an object file
+test_deconvolution_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o
+.PHONY : test_deconvolution_layer.cpp.o
+
+test_deconvolution_layer.i: test_deconvolution_layer.cpp.i
+.PHONY : test_deconvolution_layer.i
+
+# target to preprocess a source file
+test_deconvolution_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i
+.PHONY : test_deconvolution_layer.cpp.i
+
+test_deconvolution_layer.s: test_deconvolution_layer.cpp.s
+.PHONY : test_deconvolution_layer.s
+
+# target to generate assembly for a file
+test_deconvolution_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s
+.PHONY : test_deconvolution_layer.cpp.s
+
+test_dummy_data_layer.o: test_dummy_data_layer.cpp.o
+.PHONY : test_dummy_data_layer.o
+
+# target to build an object file
+test_dummy_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o
+.PHONY : test_dummy_data_layer.cpp.o
+
+test_dummy_data_layer.i: test_dummy_data_layer.cpp.i
+.PHONY : test_dummy_data_layer.i
+
+# target to preprocess a source file
+test_dummy_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i
+.PHONY : test_dummy_data_layer.cpp.i
+
+test_dummy_data_layer.s: test_dummy_data_layer.cpp.s
+.PHONY : test_dummy_data_layer.s
+
+# target to generate assembly for a file
+test_dummy_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s
+.PHONY : test_dummy_data_layer.cpp.s
+
+test_eltwise_layer.o: test_eltwise_layer.cpp.o
+.PHONY : test_eltwise_layer.o
+
+# target to build an object file
+test_eltwise_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o
+.PHONY : test_eltwise_layer.cpp.o
+
+test_eltwise_layer.i: test_eltwise_layer.cpp.i
+.PHONY : test_eltwise_layer.i
+
+# target to preprocess a source file
+test_eltwise_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i
+.PHONY : test_eltwise_layer.cpp.i
+
+test_eltwise_layer.s: test_eltwise_layer.cpp.s
+.PHONY : test_eltwise_layer.s
+
+# target to generate assembly for a file
+test_eltwise_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s
+.PHONY : test_eltwise_layer.cpp.s
+
+test_euclidean_loss_layer.o: test_euclidean_loss_layer.cpp.o
+.PHONY : test_euclidean_loss_layer.o
+
+# target to build an object file
+test_euclidean_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o
+.PHONY : test_euclidean_loss_layer.cpp.o
+
+test_euclidean_loss_layer.i: test_euclidean_loss_layer.cpp.i
+.PHONY : test_euclidean_loss_layer.i
+
+# target to preprocess a source file
+test_euclidean_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i
+.PHONY : test_euclidean_loss_layer.cpp.i
+
+test_euclidean_loss_layer.s: test_euclidean_loss_layer.cpp.s
+.PHONY : test_euclidean_loss_layer.s
+
+# target to generate assembly for a file
+test_euclidean_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s
+.PHONY : test_euclidean_loss_layer.cpp.s
+
+test_filler.o: test_filler.cpp.o
+.PHONY : test_filler.o
+
+# target to build an object file
+test_filler.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o
+.PHONY : test_filler.cpp.o
+
+test_filler.i: test_filler.cpp.i
+.PHONY : test_filler.i
+
+# target to preprocess a source file
+test_filler.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.i
+.PHONY : test_filler.cpp.i
+
+test_filler.s: test_filler.cpp.s
+.PHONY : test_filler.s
+
+# target to generate assembly for a file
+test_filler.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.s
+.PHONY : test_filler.cpp.s
+
+test_filter_layer.o: test_filter_layer.cpp.o
+.PHONY : test_filter_layer.o
+
+# target to build an object file
+test_filter_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o
+.PHONY : test_filter_layer.cpp.o
+
+test_filter_layer.i: test_filter_layer.cpp.i
+.PHONY : test_filter_layer.i
+
+# target to preprocess a source file
+test_filter_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i
+.PHONY : test_filter_layer.cpp.i
+
+test_filter_layer.s: test_filter_layer.cpp.s
+.PHONY : test_filter_layer.s
+
+# target to generate assembly for a file
+test_filter_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s
+.PHONY : test_filter_layer.cpp.s
+
+test_flatten_layer.o: test_flatten_layer.cpp.o
+.PHONY : test_flatten_layer.o
+
+# target to build an object file
+test_flatten_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o
+.PHONY : test_flatten_layer.cpp.o
+
+test_flatten_layer.i: test_flatten_layer.cpp.i
+.PHONY : test_flatten_layer.i
+
+# target to preprocess a source file
+test_flatten_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i
+.PHONY : test_flatten_layer.cpp.i
+
+test_flatten_layer.s: test_flatten_layer.cpp.s
+.PHONY : test_flatten_layer.s
+
+# target to generate assembly for a file
+test_flatten_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s
+.PHONY : test_flatten_layer.cpp.s
+
+test_gradient_based_solver.o: test_gradient_based_solver.cpp.o
+.PHONY : test_gradient_based_solver.o
+
+# target to build an object file
+test_gradient_based_solver.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o
+.PHONY : test_gradient_based_solver.cpp.o
+
+test_gradient_based_solver.i: test_gradient_based_solver.cpp.i
+.PHONY : test_gradient_based_solver.i
+
+# target to preprocess a source file
+test_gradient_based_solver.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i
+.PHONY : test_gradient_based_solver.cpp.i
+
+test_gradient_based_solver.s: test_gradient_based_solver.cpp.s
+.PHONY : test_gradient_based_solver.s
+
+# target to generate assembly for a file
+test_gradient_based_solver.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s
+.PHONY : test_gradient_based_solver.cpp.s
+
+test_hdf5_output_layer.o: test_hdf5_output_layer.cpp.o
+.PHONY : test_hdf5_output_layer.o
+
+# target to build an object file
+test_hdf5_output_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o
+.PHONY : test_hdf5_output_layer.cpp.o
+
+test_hdf5_output_layer.i: test_hdf5_output_layer.cpp.i
+.PHONY : test_hdf5_output_layer.i
+
+# target to preprocess a source file
+test_hdf5_output_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i
+.PHONY : test_hdf5_output_layer.cpp.i
+
+test_hdf5_output_layer.s: test_hdf5_output_layer.cpp.s
+.PHONY : test_hdf5_output_layer.s
+
+# target to generate assembly for a file
+test_hdf5_output_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s
+.PHONY : test_hdf5_output_layer.cpp.s
+
+test_hdf5data_layer.o: test_hdf5data_layer.cpp.o
+.PHONY : test_hdf5data_layer.o
+
+# target to build an object file
+test_hdf5data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o
+.PHONY : test_hdf5data_layer.cpp.o
+
+test_hdf5data_layer.i: test_hdf5data_layer.cpp.i
+.PHONY : test_hdf5data_layer.i
+
+# target to preprocess a source file
+test_hdf5data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i
+.PHONY : test_hdf5data_layer.cpp.i
+
+test_hdf5data_layer.s: test_hdf5data_layer.cpp.s
+.PHONY : test_hdf5data_layer.s
+
+# target to generate assembly for a file
+test_hdf5data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s
+.PHONY : test_hdf5data_layer.cpp.s
+
+test_hinge_loss_layer.o: test_hinge_loss_layer.cpp.o
+.PHONY : test_hinge_loss_layer.o
+
+# target to build an object file
+test_hinge_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o
+.PHONY : test_hinge_loss_layer.cpp.o
+
+test_hinge_loss_layer.i: test_hinge_loss_layer.cpp.i
+.PHONY : test_hinge_loss_layer.i
+
+# target to preprocess a source file
+test_hinge_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i
+.PHONY : test_hinge_loss_layer.cpp.i
+
+test_hinge_loss_layer.s: test_hinge_loss_layer.cpp.s
+.PHONY : test_hinge_loss_layer.s
+
+# target to generate assembly for a file
+test_hinge_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s
+.PHONY : test_hinge_loss_layer.cpp.s
+
+test_im2col_layer.o: test_im2col_layer.cpp.o
+.PHONY : test_im2col_layer.o
+
+# target to build an object file
+test_im2col_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o
+.PHONY : test_im2col_layer.cpp.o
+
+test_im2col_layer.i: test_im2col_layer.cpp.i
+.PHONY : test_im2col_layer.i
+
+# target to preprocess a source file
+test_im2col_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i
+.PHONY : test_im2col_layer.cpp.i
+
+test_im2col_layer.s: test_im2col_layer.cpp.s
+.PHONY : test_im2col_layer.s
+
+# target to generate assembly for a file
+test_im2col_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s
+.PHONY : test_im2col_layer.cpp.s
+
+test_image_data_layer.o: test_image_data_layer.cpp.o
+.PHONY : test_image_data_layer.o
+
+# target to build an object file
+test_image_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o
+.PHONY : test_image_data_layer.cpp.o
+
+test_image_data_layer.i: test_image_data_layer.cpp.i
+.PHONY : test_image_data_layer.i
+
+# target to preprocess a source file
+test_image_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i
+.PHONY : test_image_data_layer.cpp.i
+
+test_image_data_layer.s: test_image_data_layer.cpp.s
+.PHONY : test_image_data_layer.s
+
+# target to generate assembly for a file
+test_image_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s
+.PHONY : test_image_data_layer.cpp.s
+
+test_infogain_loss_layer.o: test_infogain_loss_layer.cpp.o
+.PHONY : test_infogain_loss_layer.o
+
+# target to build an object file
+test_infogain_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o
+.PHONY : test_infogain_loss_layer.cpp.o
+
+test_infogain_loss_layer.i: test_infogain_loss_layer.cpp.i
+.PHONY : test_infogain_loss_layer.i
+
+# target to preprocess a source file
+test_infogain_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i
+.PHONY : test_infogain_loss_layer.cpp.i
+
+test_infogain_loss_layer.s: test_infogain_loss_layer.cpp.s
+.PHONY : test_infogain_loss_layer.s
+
+# target to generate assembly for a file
+test_infogain_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s
+.PHONY : test_infogain_loss_layer.cpp.s
+
+test_inner_product_layer.o: test_inner_product_layer.cpp.o
+.PHONY : test_inner_product_layer.o
+
+# target to build an object file
+test_inner_product_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o
+.PHONY : test_inner_product_layer.cpp.o
+
+test_inner_product_layer.i: test_inner_product_layer.cpp.i
+.PHONY : test_inner_product_layer.i
+
+# target to preprocess a source file
+test_inner_product_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i
+.PHONY : test_inner_product_layer.cpp.i
+
+test_inner_product_layer.s: test_inner_product_layer.cpp.s
+.PHONY : test_inner_product_layer.s
+
+# target to generate assembly for a file
+test_inner_product_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s
+.PHONY : test_inner_product_layer.cpp.s
+
+test_internal_thread.o: test_internal_thread.cpp.o
+.PHONY : test_internal_thread.o
+
+# target to build an object file
+test_internal_thread.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o
+.PHONY : test_internal_thread.cpp.o
+
+test_internal_thread.i: test_internal_thread.cpp.i
+.PHONY : test_internal_thread.i
+
+# target to preprocess a source file
+test_internal_thread.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i
+.PHONY : test_internal_thread.cpp.i
+
+test_internal_thread.s: test_internal_thread.cpp.s
+.PHONY : test_internal_thread.s
+
+# target to generate assembly for a file
+test_internal_thread.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s
+.PHONY : test_internal_thread.cpp.s
+
+test_io.o: test_io.cpp.o
+.PHONY : test_io.o
+
+# target to build an object file
+test_io.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o
+.PHONY : test_io.cpp.o
+
+test_io.i: test_io.cpp.i
+.PHONY : test_io.i
+
+# target to preprocess a source file
+test_io.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.i
+.PHONY : test_io.cpp.i
+
+test_io.s: test_io.cpp.s
+.PHONY : test_io.s
+
+# target to generate assembly for a file
+test_io.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.s
+.PHONY : test_io.cpp.s
+
+test_layer_factory.o: test_layer_factory.cpp.o
+.PHONY : test_layer_factory.o
+
+# target to build an object file
+test_layer_factory.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o
+.PHONY : test_layer_factory.cpp.o
+
+test_layer_factory.i: test_layer_factory.cpp.i
+.PHONY : test_layer_factory.i
+
+# target to preprocess a source file
+test_layer_factory.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i
+.PHONY : test_layer_factory.cpp.i
+
+test_layer_factory.s: test_layer_factory.cpp.s
+.PHONY : test_layer_factory.s
+
+# target to generate assembly for a file
+test_layer_factory.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s
+.PHONY : test_layer_factory.cpp.s
+
+test_lrn_layer.o: test_lrn_layer.cpp.o
+.PHONY : test_lrn_layer.o
+
+# target to build an object file
+test_lrn_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o
+.PHONY : test_lrn_layer.cpp.o
+
+test_lrn_layer.i: test_lrn_layer.cpp.i
+.PHONY : test_lrn_layer.i
+
+# target to preprocess a source file
+test_lrn_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i
+.PHONY : test_lrn_layer.cpp.i
+
+test_lrn_layer.s: test_lrn_layer.cpp.s
+.PHONY : test_lrn_layer.s
+
+# target to generate assembly for a file
+test_lrn_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s
+.PHONY : test_lrn_layer.cpp.s
+
+test_math_functions.o: test_math_functions.cpp.o
+.PHONY : test_math_functions.o
+
+# target to build an object file
+test_math_functions.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o
+.PHONY : test_math_functions.cpp.o
+
+test_math_functions.i: test_math_functions.cpp.i
+.PHONY : test_math_functions.i
+
+# target to preprocess a source file
+test_math_functions.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.i
+.PHONY : test_math_functions.cpp.i
+
+test_math_functions.s: test_math_functions.cpp.s
+.PHONY : test_math_functions.s
+
+# target to generate assembly for a file
+test_math_functions.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.s
+.PHONY : test_math_functions.cpp.s
+
+test_maxpool_dropout_layers.o: test_maxpool_dropout_layers.cpp.o
+.PHONY : test_maxpool_dropout_layers.o
+
+# target to build an object file
+test_maxpool_dropout_layers.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o
+.PHONY : test_maxpool_dropout_layers.cpp.o
+
+test_maxpool_dropout_layers.i: test_maxpool_dropout_layers.cpp.i
+.PHONY : test_maxpool_dropout_layers.i
+
+# target to preprocess a source file
+test_maxpool_dropout_layers.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i
+.PHONY : test_maxpool_dropout_layers.cpp.i
+
+test_maxpool_dropout_layers.s: test_maxpool_dropout_layers.cpp.s
+.PHONY : test_maxpool_dropout_layers.s
+
+# target to generate assembly for a file
+test_maxpool_dropout_layers.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s
+.PHONY : test_maxpool_dropout_layers.cpp.s
+
+test_memory_data_layer.o: test_memory_data_layer.cpp.o
+.PHONY : test_memory_data_layer.o
+
+# target to build an object file
+test_memory_data_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o
+.PHONY : test_memory_data_layer.cpp.o
+
+test_memory_data_layer.i: test_memory_data_layer.cpp.i
+.PHONY : test_memory_data_layer.i
+
+# target to preprocess a source file
+test_memory_data_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i
+.PHONY : test_memory_data_layer.cpp.i
+
+test_memory_data_layer.s: test_memory_data_layer.cpp.s
+.PHONY : test_memory_data_layer.s
+
+# target to generate assembly for a file
+test_memory_data_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s
+.PHONY : test_memory_data_layer.cpp.s
+
+test_multinomial_logistic_loss_layer.o: test_multinomial_logistic_loss_layer.cpp.o
+.PHONY : test_multinomial_logistic_loss_layer.o
+
+# target to build an object file
+test_multinomial_logistic_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o
+.PHONY : test_multinomial_logistic_loss_layer.cpp.o
+
+test_multinomial_logistic_loss_layer.i: test_multinomial_logistic_loss_layer.cpp.i
+.PHONY : test_multinomial_logistic_loss_layer.i
+
+# target to preprocess a source file
+test_multinomial_logistic_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i
+.PHONY : test_multinomial_logistic_loss_layer.cpp.i
+
+test_multinomial_logistic_loss_layer.s: test_multinomial_logistic_loss_layer.cpp.s
+.PHONY : test_multinomial_logistic_loss_layer.s
+
+# target to generate assembly for a file
+test_multinomial_logistic_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s
+.PHONY : test_multinomial_logistic_loss_layer.cpp.s
+
+test_mvn_layer.o: test_mvn_layer.cpp.o
+.PHONY : test_mvn_layer.o
+
+# target to build an object file
+test_mvn_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o
+.PHONY : test_mvn_layer.cpp.o
+
+test_mvn_layer.i: test_mvn_layer.cpp.i
+.PHONY : test_mvn_layer.i
+
+# target to preprocess a source file
+test_mvn_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i
+.PHONY : test_mvn_layer.cpp.i
+
+test_mvn_layer.s: test_mvn_layer.cpp.s
+.PHONY : test_mvn_layer.s
+
+# target to generate assembly for a file
+test_mvn_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s
+.PHONY : test_mvn_layer.cpp.s
+
+test_net.o: test_net.cpp.o
+.PHONY : test_net.o
+
+# target to build an object file
+test_net.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o
+.PHONY : test_net.cpp.o
+
+test_net.i: test_net.cpp.i
+.PHONY : test_net.i
+
+# target to preprocess a source file
+test_net.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.i
+.PHONY : test_net.cpp.i
+
+test_net.s: test_net.cpp.s
+.PHONY : test_net.s
+
+# target to generate assembly for a file
+test_net.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.s
+.PHONY : test_net.cpp.s
+
+test_neuron_layer.o: test_neuron_layer.cpp.o
+.PHONY : test_neuron_layer.o
+
+# target to build an object file
+test_neuron_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o
+.PHONY : test_neuron_layer.cpp.o
+
+test_neuron_layer.i: test_neuron_layer.cpp.i
+.PHONY : test_neuron_layer.i
+
+# target to preprocess a source file
+test_neuron_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i
+.PHONY : test_neuron_layer.cpp.i
+
+test_neuron_layer.s: test_neuron_layer.cpp.s
+.PHONY : test_neuron_layer.s
+
+# target to generate assembly for a file
+test_neuron_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s
+.PHONY : test_neuron_layer.cpp.s
+
+test_platform.o: test_platform.cpp.o
+.PHONY : test_platform.o
+
+# target to build an object file
+test_platform.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o
+.PHONY : test_platform.cpp.o
+
+test_platform.i: test_platform.cpp.i
+.PHONY : test_platform.i
+
+# target to preprocess a source file
+test_platform.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.i
+.PHONY : test_platform.cpp.i
+
+test_platform.s: test_platform.cpp.s
+.PHONY : test_platform.s
+
+# target to generate assembly for a file
+test_platform.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.s
+.PHONY : test_platform.cpp.s
+
+test_pooling_layer.o: test_pooling_layer.cpp.o
+.PHONY : test_pooling_layer.o
+
+# target to build an object file
+test_pooling_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o
+.PHONY : test_pooling_layer.cpp.o
+
+test_pooling_layer.i: test_pooling_layer.cpp.i
+.PHONY : test_pooling_layer.i
+
+# target to preprocess a source file
+test_pooling_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i
+.PHONY : test_pooling_layer.cpp.i
+
+test_pooling_layer.s: test_pooling_layer.cpp.s
+.PHONY : test_pooling_layer.s
+
+# target to generate assembly for a file
+test_pooling_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s
+.PHONY : test_pooling_layer.cpp.s
+
+test_power_layer.o: test_power_layer.cpp.o
+.PHONY : test_power_layer.o
+
+# target to build an object file
+test_power_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o
+.PHONY : test_power_layer.cpp.o
+
+test_power_layer.i: test_power_layer.cpp.i
+.PHONY : test_power_layer.i
+
+# target to preprocess a source file
+test_power_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.i
+.PHONY : test_power_layer.cpp.i
+
+test_power_layer.s: test_power_layer.cpp.s
+.PHONY : test_power_layer.s
+
+# target to generate assembly for a file
+test_power_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.s
+.PHONY : test_power_layer.cpp.s
+
+test_protobuf.o: test_protobuf.cpp.o
+.PHONY : test_protobuf.o
+
+# target to build an object file
+test_protobuf.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o
+.PHONY : test_protobuf.cpp.o
+
+test_protobuf.i: test_protobuf.cpp.i
+.PHONY : test_protobuf.i
+
+# target to preprocess a source file
+test_protobuf.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.i
+.PHONY : test_protobuf.cpp.i
+
+test_protobuf.s: test_protobuf.cpp.s
+.PHONY : test_protobuf.s
+
+# target to generate assembly for a file
+test_protobuf.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.s
+.PHONY : test_protobuf.cpp.s
+
+test_random_number_generator.o: test_random_number_generator.cpp.o
+.PHONY : test_random_number_generator.o
+
+# target to build an object file
+test_random_number_generator.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o
+.PHONY : test_random_number_generator.cpp.o
+
+test_random_number_generator.i: test_random_number_generator.cpp.i
+.PHONY : test_random_number_generator.i
+
+# target to preprocess a source file
+test_random_number_generator.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i
+.PHONY : test_random_number_generator.cpp.i
+
+test_random_number_generator.s: test_random_number_generator.cpp.s
+.PHONY : test_random_number_generator.s
+
+# target to generate assembly for a file
+test_random_number_generator.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s
+.PHONY : test_random_number_generator.cpp.s
+
+test_reduction_layer.o: test_reduction_layer.cpp.o
+.PHONY : test_reduction_layer.o
+
+# target to build an object file
+test_reduction_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o
+.PHONY : test_reduction_layer.cpp.o
+
+test_reduction_layer.i: test_reduction_layer.cpp.i
+.PHONY : test_reduction_layer.i
+
+# target to preprocess a source file
+test_reduction_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i
+.PHONY : test_reduction_layer.cpp.i
+
+test_reduction_layer.s: test_reduction_layer.cpp.s
+.PHONY : test_reduction_layer.s
+
+# target to generate assembly for a file
+test_reduction_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s
+.PHONY : test_reduction_layer.cpp.s
+
+test_reshape_layer.o: test_reshape_layer.cpp.o
+.PHONY : test_reshape_layer.o
+
+# target to build an object file
+test_reshape_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o
+.PHONY : test_reshape_layer.cpp.o
+
+test_reshape_layer.i: test_reshape_layer.cpp.i
+.PHONY : test_reshape_layer.i
+
+# target to preprocess a source file
+test_reshape_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i
+.PHONY : test_reshape_layer.cpp.i
+
+test_reshape_layer.s: test_reshape_layer.cpp.s
+.PHONY : test_reshape_layer.s
+
+# target to generate assembly for a file
+test_reshape_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s
+.PHONY : test_reshape_layer.cpp.s
+
+test_sigmoid_cross_entropy_loss_layer.o: test_sigmoid_cross_entropy_loss_layer.cpp.o
+.PHONY : test_sigmoid_cross_entropy_loss_layer.o
+
+# target to build an object file
+test_sigmoid_cross_entropy_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o
+.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.o
+
+test_sigmoid_cross_entropy_loss_layer.i: test_sigmoid_cross_entropy_loss_layer.cpp.i
+.PHONY : test_sigmoid_cross_entropy_loss_layer.i
+
+# target to preprocess a source file
+test_sigmoid_cross_entropy_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i
+.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.i
+
+test_sigmoid_cross_entropy_loss_layer.s: test_sigmoid_cross_entropy_loss_layer.cpp.s
+.PHONY : test_sigmoid_cross_entropy_loss_layer.s
+
+# target to generate assembly for a file
+test_sigmoid_cross_entropy_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s
+.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.s
+
+test_slice_layer.o: test_slice_layer.cpp.o
+.PHONY : test_slice_layer.o
+
+# target to build an object file
+test_slice_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o
+.PHONY : test_slice_layer.cpp.o
+
+test_slice_layer.i: test_slice_layer.cpp.i
+.PHONY : test_slice_layer.i
+
+# target to preprocess a source file
+test_slice_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i
+.PHONY : test_slice_layer.cpp.i
+
+test_slice_layer.s: test_slice_layer.cpp.s
+.PHONY : test_slice_layer.s
+
+# target to generate assembly for a file
+test_slice_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s
+.PHONY : test_slice_layer.cpp.s
+
+test_softmax_layer.o: test_softmax_layer.cpp.o
+.PHONY : test_softmax_layer.o
+
+# target to build an object file
+test_softmax_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o
+.PHONY : test_softmax_layer.cpp.o
+
+test_softmax_layer.i: test_softmax_layer.cpp.i
+.PHONY : test_softmax_layer.i
+
+# target to preprocess a source file
+test_softmax_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i
+.PHONY : test_softmax_layer.cpp.i
+
+test_softmax_layer.s: test_softmax_layer.cpp.s
+.PHONY : test_softmax_layer.s
+
+# target to generate assembly for a file
+test_softmax_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s
+.PHONY : test_softmax_layer.cpp.s
+
+test_softmax_with_loss_layer.o: test_softmax_with_loss_layer.cpp.o
+.PHONY : test_softmax_with_loss_layer.o
+
+# target to build an object file
+test_softmax_with_loss_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o
+.PHONY : test_softmax_with_loss_layer.cpp.o
+
+test_softmax_with_loss_layer.i: test_softmax_with_loss_layer.cpp.i
+.PHONY : test_softmax_with_loss_layer.i
+
+# target to preprocess a source file
+test_softmax_with_loss_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i
+.PHONY : test_softmax_with_loss_layer.cpp.i
+
+test_softmax_with_loss_layer.s: test_softmax_with_loss_layer.cpp.s
+.PHONY : test_softmax_with_loss_layer.s
+
+# target to generate assembly for a file
+test_softmax_with_loss_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s
+.PHONY : test_softmax_with_loss_layer.cpp.s
+
+test_solver.o: test_solver.cpp.o
+.PHONY : test_solver.o
+
+# target to build an object file
+test_solver.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o
+.PHONY : test_solver.cpp.o
+
+test_solver.i: test_solver.cpp.i
+.PHONY : test_solver.i
+
+# target to preprocess a source file
+test_solver.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.i
+.PHONY : test_solver.cpp.i
+
+test_solver.s: test_solver.cpp.s
+.PHONY : test_solver.s
+
+# target to generate assembly for a file
+test_solver.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.s
+.PHONY : test_solver.cpp.s
+
+test_split_layer.o: test_split_layer.cpp.o
+.PHONY : test_split_layer.o
+
+# target to build an object file
+test_split_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o
+.PHONY : test_split_layer.cpp.o
+
+test_split_layer.i: test_split_layer.cpp.i
+.PHONY : test_split_layer.i
+
+# target to preprocess a source file
+test_split_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.i
+.PHONY : test_split_layer.cpp.i
+
+test_split_layer.s: test_split_layer.cpp.s
+.PHONY : test_split_layer.s
+
+# target to generate assembly for a file
+test_split_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.s
+.PHONY : test_split_layer.cpp.s
+
+test_spp_layer.o: test_spp_layer.cpp.o
+.PHONY : test_spp_layer.o
+
+# target to build an object file
+test_spp_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o
+.PHONY : test_spp_layer.cpp.o
+
+test_spp_layer.i: test_spp_layer.cpp.i
+.PHONY : test_spp_layer.i
+
+# target to preprocess a source file
+test_spp_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i
+.PHONY : test_spp_layer.cpp.i
+
+test_spp_layer.s: test_spp_layer.cpp.s
+.PHONY : test_spp_layer.s
+
+# target to generate assembly for a file
+test_spp_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s
+.PHONY : test_spp_layer.cpp.s
+
+test_stochastic_pooling.o: test_stochastic_pooling.cpp.o
+.PHONY : test_stochastic_pooling.o
+
+# target to build an object file
+test_stochastic_pooling.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o
+.PHONY : test_stochastic_pooling.cpp.o
+
+test_stochastic_pooling.i: test_stochastic_pooling.cpp.i
+.PHONY : test_stochastic_pooling.i
+
+# target to preprocess a source file
+test_stochastic_pooling.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i
+.PHONY : test_stochastic_pooling.cpp.i
+
+test_stochastic_pooling.s: test_stochastic_pooling.cpp.s
+.PHONY : test_stochastic_pooling.s
+
+# target to generate assembly for a file
+test_stochastic_pooling.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s
+.PHONY : test_stochastic_pooling.cpp.s
+
+test_syncedmem.o: test_syncedmem.cpp.o
+.PHONY : test_syncedmem.o
+
+# target to build an object file
+test_syncedmem.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o
+.PHONY : test_syncedmem.cpp.o
+
+test_syncedmem.i: test_syncedmem.cpp.i
+.PHONY : test_syncedmem.i
+
+# target to preprocess a source file
+test_syncedmem.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i
+.PHONY : test_syncedmem.cpp.i
+
+test_syncedmem.s: test_syncedmem.cpp.s
+.PHONY : test_syncedmem.s
+
+# target to generate assembly for a file
+test_syncedmem.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s
+.PHONY : test_syncedmem.cpp.s
+
+test_tanh_layer.o: test_tanh_layer.cpp.o
+.PHONY : test_tanh_layer.o
+
+# target to build an object file
+test_tanh_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o
+.PHONY : test_tanh_layer.cpp.o
+
+test_tanh_layer.i: test_tanh_layer.cpp.i
+.PHONY : test_tanh_layer.i
+
+# target to preprocess a source file
+test_tanh_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i
+.PHONY : test_tanh_layer.cpp.i
+
+test_tanh_layer.s: test_tanh_layer.cpp.s
+.PHONY : test_tanh_layer.s
+
+# target to generate assembly for a file
+test_tanh_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s
+.PHONY : test_tanh_layer.cpp.s
+
+test_threshold_layer.o: test_threshold_layer.cpp.o
+.PHONY : test_threshold_layer.o
+
+# target to build an object file
+test_threshold_layer.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o
+.PHONY : test_threshold_layer.cpp.o
+
+test_threshold_layer.i: test_threshold_layer.cpp.i
+.PHONY : test_threshold_layer.i
+
+# target to preprocess a source file
+test_threshold_layer.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i
+.PHONY : test_threshold_layer.cpp.i
+
+test_threshold_layer.s: test_threshold_layer.cpp.s
+.PHONY : test_threshold_layer.s
+
+# target to generate assembly for a file
+test_threshold_layer.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s
+.PHONY : test_threshold_layer.cpp.s
+
+test_upgrade_proto.o: test_upgrade_proto.cpp.o
+.PHONY : test_upgrade_proto.o
+
+# target to build an object file
+test_upgrade_proto.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o
+.PHONY : test_upgrade_proto.cpp.o
+
+test_upgrade_proto.i: test_upgrade_proto.cpp.i
+.PHONY : test_upgrade_proto.i
+
+# target to preprocess a source file
+test_upgrade_proto.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i
+.PHONY : test_upgrade_proto.cpp.i
+
+test_upgrade_proto.s: test_upgrade_proto.cpp.s
+.PHONY : test_upgrade_proto.s
+
+# target to generate assembly for a file
+test_upgrade_proto.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s
+.PHONY : test_upgrade_proto.cpp.s
+
+test_util_blas.o: test_util_blas.cpp.o
+.PHONY : test_util_blas.o
+
+# target to build an object file
+test_util_blas.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o
+.PHONY : test_util_blas.cpp.o
+
+test_util_blas.i: test_util_blas.cpp.i
+.PHONY : test_util_blas.i
+
+# target to preprocess a source file
+test_util_blas.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.i
+.PHONY : test_util_blas.cpp.i
+
+test_util_blas.s: test_util_blas.cpp.s
+.PHONY : test_util_blas.s
+
+# target to generate assembly for a file
+test_util_blas.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.s
+.PHONY : test_util_blas.cpp.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... runtest"
+	@echo "... test.testbin"
+	@echo "... test_accuracy_layer.o"
+	@echo "... test_accuracy_layer.i"
+	@echo "... test_accuracy_layer.s"
+	@echo "... test_argmax_layer.o"
+	@echo "... test_argmax_layer.i"
+	@echo "... test_argmax_layer.s"
+	@echo "... test_benchmark.o"
+	@echo "... test_benchmark.i"
+	@echo "... test_benchmark.s"
+	@echo "... test_blob.o"
+	@echo "... test_blob.i"
+	@echo "... test_blob.s"
+	@echo "... test_caffe_main.o"
+	@echo "... test_caffe_main.i"
+	@echo "... test_caffe_main.s"
+	@echo "... test_common.o"
+	@echo "... test_common.i"
+	@echo "... test_common.s"
+	@echo "... test_concat_layer.o"
+	@echo "... test_concat_layer.i"
+	@echo "... test_concat_layer.s"
+	@echo "... test_contrastive_loss_layer.o"
+	@echo "... test_contrastive_loss_layer.i"
+	@echo "... test_contrastive_loss_layer.s"
+	@echo "... test_convolution_layer.o"
+	@echo "... test_convolution_layer.i"
+	@echo "... test_convolution_layer.s"
+	@echo "... test_data_layer.o"
+	@echo "... test_data_layer.i"
+	@echo "... test_data_layer.s"
+	@echo "... test_data_transformer.o"
+	@echo "... test_data_transformer.i"
+	@echo "... test_data_transformer.s"
+	@echo "... test_db.o"
+	@echo "... test_db.i"
+	@echo "... test_db.s"
+	@echo "... test_deconvolution_layer.o"
+	@echo "... test_deconvolution_layer.i"
+	@echo "... test_deconvolution_layer.s"
+	@echo "... test_dummy_data_layer.o"
+	@echo "... test_dummy_data_layer.i"
+	@echo "... test_dummy_data_layer.s"
+	@echo "... test_eltwise_layer.o"
+	@echo "... test_eltwise_layer.i"
+	@echo "... test_eltwise_layer.s"
+	@echo "... test_euclidean_loss_layer.o"
+	@echo "... test_euclidean_loss_layer.i"
+	@echo "... test_euclidean_loss_layer.s"
+	@echo "... test_filler.o"
+	@echo "... test_filler.i"
+	@echo "... test_filler.s"
+	@echo "... test_filter_layer.o"
+	@echo "... test_filter_layer.i"
+	@echo "... test_filter_layer.s"
+	@echo "... test_flatten_layer.o"
+	@echo "... test_flatten_layer.i"
+	@echo "... test_flatten_layer.s"
+	@echo "... test_gradient_based_solver.o"
+	@echo "... test_gradient_based_solver.i"
+	@echo "... test_gradient_based_solver.s"
+	@echo "... test_hdf5_output_layer.o"
+	@echo "... test_hdf5_output_layer.i"
+	@echo "... test_hdf5_output_layer.s"
+	@echo "... test_hdf5data_layer.o"
+	@echo "... test_hdf5data_layer.i"
+	@echo "... test_hdf5data_layer.s"
+	@echo "... test_hinge_loss_layer.o"
+	@echo "... test_hinge_loss_layer.i"
+	@echo "... test_hinge_loss_layer.s"
+	@echo "... test_im2col_layer.o"
+	@echo "... test_im2col_layer.i"
+	@echo "... test_im2col_layer.s"
+	@echo "... test_image_data_layer.o"
+	@echo "... test_image_data_layer.i"
+	@echo "... test_image_data_layer.s"
+	@echo "... test_infogain_loss_layer.o"
+	@echo "... test_infogain_loss_layer.i"
+	@echo "... test_infogain_loss_layer.s"
+	@echo "... test_inner_product_layer.o"
+	@echo "... test_inner_product_layer.i"
+	@echo "... test_inner_product_layer.s"
+	@echo "... test_internal_thread.o"
+	@echo "... test_internal_thread.i"
+	@echo "... test_internal_thread.s"
+	@echo "... test_io.o"
+	@echo "... test_io.i"
+	@echo "... test_io.s"
+	@echo "... test_layer_factory.o"
+	@echo "... test_layer_factory.i"
+	@echo "... test_layer_factory.s"
+	@echo "... test_lrn_layer.o"
+	@echo "... test_lrn_layer.i"
+	@echo "... test_lrn_layer.s"
+	@echo "... test_math_functions.o"
+	@echo "... test_math_functions.i"
+	@echo "... test_math_functions.s"
+	@echo "... test_maxpool_dropout_layers.o"
+	@echo "... test_maxpool_dropout_layers.i"
+	@echo "... test_maxpool_dropout_layers.s"
+	@echo "... test_memory_data_layer.o"
+	@echo "... test_memory_data_layer.i"
+	@echo "... test_memory_data_layer.s"
+	@echo "... test_multinomial_logistic_loss_layer.o"
+	@echo "... test_multinomial_logistic_loss_layer.i"
+	@echo "... test_multinomial_logistic_loss_layer.s"
+	@echo "... test_mvn_layer.o"
+	@echo "... test_mvn_layer.i"
+	@echo "... test_mvn_layer.s"
+	@echo "... test_net.o"
+	@echo "... test_net.i"
+	@echo "... test_net.s"
+	@echo "... test_neuron_layer.o"
+	@echo "... test_neuron_layer.i"
+	@echo "... test_neuron_layer.s"
+	@echo "... test_platform.o"
+	@echo "... test_platform.i"
+	@echo "... test_platform.s"
+	@echo "... test_pooling_layer.o"
+	@echo "... test_pooling_layer.i"
+	@echo "... test_pooling_layer.s"
+	@echo "... test_power_layer.o"
+	@echo "... test_power_layer.i"
+	@echo "... test_power_layer.s"
+	@echo "... test_protobuf.o"
+	@echo "... test_protobuf.i"
+	@echo "... test_protobuf.s"
+	@echo "... test_random_number_generator.o"
+	@echo "... test_random_number_generator.i"
+	@echo "... test_random_number_generator.s"
+	@echo "... test_reduction_layer.o"
+	@echo "... test_reduction_layer.i"
+	@echo "... test_reduction_layer.s"
+	@echo "... test_reshape_layer.o"
+	@echo "... test_reshape_layer.i"
+	@echo "... test_reshape_layer.s"
+	@echo "... test_sigmoid_cross_entropy_loss_layer.o"
+	@echo "... test_sigmoid_cross_entropy_loss_layer.i"
+	@echo "... test_sigmoid_cross_entropy_loss_layer.s"
+	@echo "... test_slice_layer.o"
+	@echo "... test_slice_layer.i"
+	@echo "... test_slice_layer.s"
+	@echo "... test_softmax_layer.o"
+	@echo "... test_softmax_layer.i"
+	@echo "... test_softmax_layer.s"
+	@echo "... test_softmax_with_loss_layer.o"
+	@echo "... test_softmax_with_loss_layer.i"
+	@echo "... test_softmax_with_loss_layer.s"
+	@echo "... test_solver.o"
+	@echo "... test_solver.i"
+	@echo "... test_solver.s"
+	@echo "... test_split_layer.o"
+	@echo "... test_split_layer.i"
+	@echo "... test_split_layer.s"
+	@echo "... test_spp_layer.o"
+	@echo "... test_spp_layer.i"
+	@echo "... test_spp_layer.s"
+	@echo "... test_stochastic_pooling.o"
+	@echo "... test_stochastic_pooling.i"
+	@echo "... test_stochastic_pooling.s"
+	@echo "... test_syncedmem.o"
+	@echo "... test_syncedmem.i"
+	@echo "... test_syncedmem.s"
+	@echo "... test_tanh_layer.o"
+	@echo "... test_tanh_layer.i"
+	@echo "... test_tanh_layer.s"
+	@echo "... test_threshold_layer.o"
+	@echo "... test_threshold_layer.i"
+	@echo "... test_threshold_layer.s"
+	@echo "... test_upgrade_proto.o"
+	@echo "... test_upgrade_proto.i"
+	@echo "... test_upgrade_proto.s"
+	@echo "... test_util_blas.o"
+	@echo "... test_util_blas.i"
+	@echo "... test_util_blas.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/src/caffe/test/cmake_install.cmake b/src/caffe/test/cmake_install.cmake
new file mode 100644
index 00000000..fa890cd7
--- /dev/null
+++ b/src/caffe/test/cmake_install.cmake
@@ -0,0 +1,34 @@
+# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test
+
+# Set the install prefix
+IF(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install")
+ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX)
+STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  IF(BUILD_TYPE)
+    STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  ELSE(BUILD_TYPE)
+    SET(CMAKE_INSTALL_CONFIG_NAME "Release")
+  ENDIF(BUILD_TYPE)
+  MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+
+# Set the component getting installed.
+IF(NOT CMAKE_INSTALL_COMPONENT)
+  IF(COMPONENT)
+    MESSAGE(STATUS "Install component: \"${COMPONENT}\"")
+    SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  ELSE(COMPONENT)
+    SET(CMAKE_INSTALL_COMPONENT)
+  ENDIF(COMPONENT)
+ENDIF(NOT CMAKE_INSTALL_COMPONENT)
+
+# Install shared libraries without execute permission?
+IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  SET(CMAKE_INSTALL_SO_NO_EXE "1")
+ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+
diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp
index c8caf5ac..5f41d325 100644
--- a/src/caffe/test/test_caffe_main.cpp
+++ b/src/caffe/test/test_caffe_main.cpp
@@ -2,6 +2,7 @@
 // to allow a main function to be compiled into the binary.
 
 #include "caffe/caffe.hpp"
+#include "caffe/common.hpp"
 #include "caffe/test/test_caffe_main.hpp"
 
 namespace caffe {
@@ -12,6 +13,7 @@ namespace caffe {
 
 #ifndef CPU_ONLY
 using caffe::CAFFE_TEST_CUDA_PROP;
+
 #endif
 
 int main(int argc, char** argv) {
@@ -19,8 +21,8 @@ int main(int argc, char** argv) {
   caffe::GlobalInit(&argc, &argv);
 #ifndef CPU_ONLY
   // Before starting testing, let's first print out a few cuda defice info.
-  int device;
-  cudaGetDeviceCount(&device);
+  int device = 0;
+//  cudaGetDeviceCount(&device);
   cout << "Cuda number of devices: " << device << endl;
   if (argc > 1) {
     // Use the given device
@@ -31,9 +33,11 @@ int main(int argc, char** argv) {
     // Use the device assigned in build configuration; but with a lower priority
     device = CUDA_TEST_DEVICE;
   }
-  cudaGetDevice(&device);
+//  cudaGetDevice(&device);
   cout << "Current device id: " << device << endl;
-  cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device);
+ // cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device);
+//  caffe::set_mode(caffe::GPU);
+  caffe::amdDevice.Init();
 #endif
   // invoke the test.
   return RUN_ALL_TESTS();
diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp
index a8c5a83f..6942f8a3 100644
--- a/src/caffe/util/benchmark.cpp
+++ b/src/caffe/util/benchmark.cpp
@@ -25,15 +25,7 @@ Timer::~Timer() {
 
 void Timer::Start() {
   if (!running()) {
-    if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-      CUDA_CHECK(cudaEventRecord(start_gpu_, 0));
-#else
-      NO_GPU;
-#endif
-    } else {
-      start_cpu_ = boost::posix_time::microsec_clock::local_time();
-    }
+    start_cpu_ = boost::posix_time::microsec_clock::local_time();
     running_ = true;
     has_run_at_least_once_ = true;
   }
@@ -41,16 +33,7 @@ void Timer::Start() {
 
 void Timer::Stop() {
   if (running()) {
-    if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-      CUDA_CHECK(cudaEventRecord(stop_gpu_, 0));
-      CUDA_CHECK(cudaEventSynchronize(stop_gpu_));
-#else
-      NO_GPU;
-#endif
-    } else {
-      stop_cpu_ = boost::posix_time::microsec_clock::local_time();
-    }
+    stop_cpu_ = boost::posix_time::microsec_clock::local_time();
     running_ = false;
   }
 }
@@ -64,18 +47,8 @@ float Timer::MicroSeconds() {
   if (running()) {
     Stop();
   }
-  if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
-                                    stop_gpu_));
-    // Cuda only measure milliseconds
-    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
-#else
-      NO_GPU;
-#endif
-  } else {
-    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
-  }
+  
+  elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
   return elapsed_microseconds_;
 }
 
@@ -87,16 +60,8 @@ float Timer::MilliSeconds() {
   if (running()) {
     Stop();
   }
-  if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
-                                    stop_gpu_));
-#else
-      NO_GPU;
-#endif
-  } else {
-    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
-  }
+ 
+  elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
   return elapsed_milliseconds_;
 }
 
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 677afcdf..3bef8b63 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -214,6 +214,26 @@ void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
     CLBLAS_CHECK( clblasDaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) );
 }
 
+template<>
+void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y)
+{
+}
+
+template<>
+void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y)
+{
+}
+
+template<>
+void caffe_gpu_abs<float>(const int n, const float* x, float* y)
+{
+}
+
+template<>
+void caffe_gpu_abs<double>(const int n, const double* x, double* y)
+{
+}
+
 template <>
 void caffe_set(const int N, const float alpha, float* Y) {
   if (alpha == 0) {
@@ -260,6 +280,12 @@ void caffe_copy<double>(const int N, const double* X, double* Y) {
   cblas_dcopy(N, X, 1, Y, 1);
 }
 
+void caffe_gpu_memcpy(const size_t N, const void *X, void *Y)
+{
+   OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
+   clFinish(amdDevice.CommandQueue);
+}
+
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
   if(X != Y)
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 01c04711..e4fd42c6 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -51,6 +51,7 @@ void ocl_memset(Dtype* buffer, const Dtype value, const int count){
 }
 
 // Explicit instantiation
+template void ocl_memset<int>(int* buffer, const int value, const int count);
 template void ocl_memset<float>(float* buffer, const float value, const int count);
 template void ocl_memset<double>(double* buffer, const double value, const int count);
 
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index a9abda2e..9eab08ec 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -35,8 +35,8 @@
 namespace caffe {
 typedef unsigned int uint32_t;
 struct array4x32 {  uint32_t v[4]; };
-
-template <typename dtype> std::string get_dtype_suffix()
+/*
+template <typename dtype> inline std::string get_dtype_suffix()
 {
     dtype x;
     const char type = typeid(x).name()[0];
@@ -49,7 +49,7 @@ template <typename dtype> std::string get_dtype_suffix()
     }
     return suffix;
 }
-
+*/
 template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold)
 {
@@ -1083,5 +1083,75 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
 }
 template void DropoutBackward<float>(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
 template void DropoutBackward<double>(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
-}  // namespace caffe
 
+template <typename Dtype>
+void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz)
+{
+/*        std::string kernel_name = "Conv" + get_dtype_suffix<Dtype>();
+        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+
+        int weights_stride = kernel_w * kernel_h;//correct?
+        int bot_stride = width;
+        int bot_channel_stride = width * height; 
+        int bot_batch_stride = width * height * channel_in;
+
+        int top_stride = width_out;
+        int top_channel_stride = width_out * height_out;
+        int top_batch_stride = width_out * height_out * channel_out;
+
+        //int height_out = (int)top->getDim(ANN_TENSOR_HEIGHT);
+        //int width_out = (int)top->getDim(ANN_TENSOR_WIDTH);
+        int vis_height = height_out * stride - 2 * pad;
+        int vis_width = width_out * stride - 2 * pad;
+
+        int ocl_group_sz0_ = 8;
+        int ocl_group_sz1_ = 8;
+        int ocl_group_lg2sz1_ = (int)ceil(log((double)ocl_group_sz1_)/log(2.));
+        int ocl_group_lg2sz0_ = (int)ceil(log((double)ocl_group_sz0_)/log(2.));
+        
+        int outputs = channel_out;
+        int n_out_pix_horiz_ = (width_out < 2 * ocl_group_sz0_) ? 1 : (width_out < 4 * ocl_group_sz0_) ? 2 : 4;
+        int n_out_pix_vert_ = (height_out < 2 * ocl_group_sz1_) ? 1 : 2; // (height_out <= 192) ? 2 : 4;
+        int n_outs_ = ((outputs & 1) == 1) ? 1 : (kernel_w == 3) && ((outputs / 4) * 4 == outputs) ? 4 : 2; // (n_out_pix_horiz_ >= 4) ? 1 : 2;
+
+        int n_outputs = channel_out;
+        n_outputs /= n_outs_;
+        int i_n_group_horiz = (width_out + ocl_group_sz0_ * n_out_pix_horiz_ - 1) / (ocl_group_sz0_ * n_out_pix_horiz_);
+        int i_n_group_vert = (height_out + ocl_group_sz1_ * n_out_pix_vert_ - 1) / (ocl_group_sz1_ * n_out_pix_vert_);
+
+        cl_int ret;
+        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),   (void*)&bottom_data);
+        ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem),   (void*)&weights);
+        ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem),   (void*)&bias);
+        ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem),   (void*)&top_data);
+        ret |= clSetKernelArg(ker_rand, 1, sizeof(cl_int),   (void*)&kernel_w);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&channel_out);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&channel_in);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&pad);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&stride);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&n_out_pix_horiz_);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&n_out_pix_vert_);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&bot_batch_stride);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&bot_channel_stride);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&bot_stride);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&top_batch_stride);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&top_channel_stride);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&top_stride);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&vis_width);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&vis_height);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&weights_stride);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&width_out);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&height_out);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&n_outs_);
+        OCL_CHECK(ret);
+
+         size_t l_wk[3] = { ocl_group_sz0_, ocl_group_sz1_, 1};
+	 size_t g_wk[3] = { i_n_group_horiz * l_wk[0], i_n_group_vert * l_wk[1], batch_sz * n_outputs };
+
+        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );*/
+}
+template void ocl_conv<float>(float* bottom_data, float* top_data, float* weights, float* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz);
+template void ocl_conv<double>(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz);
+
+}  // namespace caffe
diff --git a/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake b/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 00000000..7bb0014c
--- /dev/null
+++ b/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# Relative path conversion top directories.
+SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
+SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
+
+# Force unix paths in dependencies.
+SET(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake b/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake
new file mode 100644
index 00000000..76e46409
--- /dev/null
+++ b/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake
@@ -0,0 +1,32 @@
+# The set of languages for which implicit dependencies are needed:
+SET(CMAKE_DEPENDS_LANGUAGES
+  "CXX"
+  )
+# The set of files for implicit dependencies of each language:
+SET(CMAKE_DEPENDS_CHECK_CXX
+  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o"
+  )
+SET(CMAKE_CXX_COMPILER_ID "GNU")
+
+# Preprocessor definitions for this target.
+SET(CMAKE_TARGET_DEFINITIONS
+  "GTEST_USE_OWN_TR1_TUPLE"
+  )
+
+# Targets to which this target links.
+SET(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# The include file search paths:
+SET(CMAKE_C_TARGET_INCLUDE_PATH
+  "src"
+  "/usr/local/include"
+  "include"
+  "/usr/local/cuda/include"
+  "/usr/local/include/opencv"
+  "/usr/include/atlas"
+  "."
+  )
+SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
+SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/gtest/CMakeFiles/gtest.dir/build.make b/src/gtest/CMakeFiles/gtest.dir/build.make
new file mode 100644
index 00000000..b41ed414
--- /dev/null
+++ b/src/gtest/CMakeFiles/gtest.dir/build.make
@@ -0,0 +1,106 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# Include any dependencies generated for this target.
+include src/gtest/CMakeFiles/gtest.dir/depend.make
+
+# Include the progress variables for this target.
+include src/gtest/CMakeFiles/gtest.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include src/gtest/CMakeFiles/gtest.dir/flags.make
+
+src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o: src/gtest/CMakeFiles/gtest.dir/flags.make
+src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o: src/gtest/gtest-all.cpp
+	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1)
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/gtest.dir/gtest-all.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp
+
+src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/gtest.dir/gtest-all.cpp.i"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp > CMakeFiles/gtest.dir/gtest-all.cpp.i
+
+src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/gtest.dir/gtest-all.cpp.s"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp -o CMakeFiles/gtest.dir/gtest-all.cpp.s
+
+src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires:
+.PHONY : src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires
+
+src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires
+	$(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides.build
+.PHONY : src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides
+
+src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides.build: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o
+
+# Object files for target gtest
+gtest_OBJECTS = \
+"CMakeFiles/gtest.dir/gtest-all.cpp.o"
+
+# External object files for target gtest
+gtest_EXTERNAL_OBJECTS =
+
+lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o
+lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/build.make
+lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX static library ../../lib/libgtest.a"
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -P CMakeFiles/gtest.dir/cmake_clean_target.cmake
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/gtest.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+src/gtest/CMakeFiles/gtest.dir/build: lib/libgtest.a
+.PHONY : src/gtest/CMakeFiles/gtest.dir/build
+
+src/gtest/CMakeFiles/gtest.dir/requires: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires
+.PHONY : src/gtest/CMakeFiles/gtest.dir/requires
+
+src/gtest/CMakeFiles/gtest.dir/clean:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -P CMakeFiles/gtest.dir/cmake_clean.cmake
+.PHONY : src/gtest/CMakeFiles/gtest.dir/clean
+
+src/gtest/CMakeFiles/gtest.dir/depend:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : src/gtest/CMakeFiles/gtest.dir/depend
+
diff --git a/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake b/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake
new file mode 100644
index 00000000..694feb83
--- /dev/null
+++ b/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake
@@ -0,0 +1,10 @@
+FILE(REMOVE_RECURSE
+  "CMakeFiles/gtest.dir/gtest-all.cpp.o"
+  "../../lib/libgtest.pdb"
+  "../../lib/libgtest.a"
+)
+
+# Per-language clean rules from dependency scanning.
+FOREACH(lang CXX)
+  INCLUDE(CMakeFiles/gtest.dir/cmake_clean_${lang}.cmake OPTIONAL)
+ENDFOREACH(lang)
diff --git a/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake b/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake
new file mode 100644
index 00000000..2c9ec14f
--- /dev/null
+++ b/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake
@@ -0,0 +1,3 @@
+FILE(REMOVE_RECURSE
+  "../../lib/libgtest.a"
+)
diff --git a/src/gtest/CMakeFiles/gtest.dir/depend.make b/src/gtest/CMakeFiles/gtest.dir/depend.make
new file mode 100644
index 00000000..37ac348d
--- /dev/null
+++ b/src/gtest/CMakeFiles/gtest.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for gtest.
+# This may be replaced when dependencies are built.
diff --git a/src/gtest/CMakeFiles/gtest.dir/flags.make b/src/gtest/CMakeFiles/gtest.dir/flags.make
new file mode 100644
index 00000000..8b4ef992
--- /dev/null
+++ b/src/gtest/CMakeFiles/gtest.dir/flags.make
@@ -0,0 +1,8 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# compile CXX with /usr/bin/c++
+CXX_FLAGS =  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe   
+
+CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE
+
diff --git a/src/gtest/CMakeFiles/gtest.dir/link.txt b/src/gtest/CMakeFiles/gtest.dir/link.txt
new file mode 100644
index 00000000..e5645cfb
--- /dev/null
+++ b/src/gtest/CMakeFiles/gtest.dir/link.txt
@@ -0,0 +1,2 @@
+/usr/bin/ar cr ../../lib/libgtest.a  CMakeFiles/gtest.dir/gtest-all.cpp.o
+/usr/bin/ranlib ../../lib/libgtest.a
diff --git a/src/gtest/CMakeFiles/gtest.dir/progress.make b/src/gtest/CMakeFiles/gtest.dir/progress.make
new file mode 100644
index 00000000..143c9b1b
--- /dev/null
+++ b/src/gtest/CMakeFiles/gtest.dir/progress.make
@@ -0,0 +1,2 @@
+CMAKE_PROGRESS_1 = 65
+
diff --git a/src/gtest/CMakeFiles/progress.marks b/src/gtest/CMakeFiles/progress.marks
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/src/gtest/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+0
diff --git a/src/gtest/Makefile b/src/gtest/Makefile
new file mode 100644
index 00000000..d1a96ceb
--- /dev/null
+++ b/src/gtest/Makefile
@@ -0,0 +1,212 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: install/local
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: install/strip
+.PHONY : install/strip/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/progress.marks
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+src/gtest/CMakeFiles/gtest.dir/rule:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/CMakeFiles/gtest.dir/rule
+.PHONY : src/gtest/CMakeFiles/gtest.dir/rule
+
+# Convenience name for target.
+gtest: src/gtest/CMakeFiles/gtest.dir/rule
+.PHONY : gtest
+
+# fast build rule for target.
+gtest/fast:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/build
+.PHONY : gtest/fast
+
+gtest-all.o: gtest-all.cpp.o
+.PHONY : gtest-all.o
+
+# target to build an object file
+gtest-all.cpp.o:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o
+.PHONY : gtest-all.cpp.o
+
+gtest-all.i: gtest-all.cpp.i
+.PHONY : gtest-all.i
+
+# target to preprocess a source file
+gtest-all.cpp.i:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.i
+.PHONY : gtest-all.cpp.i
+
+gtest-all.s: gtest-all.cpp.s
+.PHONY : gtest-all.s
+
+# target to generate assembly for a file
+gtest-all.cpp.s:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.s
+.PHONY : gtest-all.cpp.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... gtest"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... gtest-all.o"
+	@echo "... gtest-all.i"
+	@echo "... gtest-all.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/src/gtest/cmake_install.cmake b/src/gtest/cmake_install.cmake
new file mode 100644
index 00000000..14c33dd5
--- /dev/null
+++ b/src/gtest/cmake_install.cmake
@@ -0,0 +1,34 @@
+# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest
+
+# Set the install prefix
+IF(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install")
+ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX)
+STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  IF(BUILD_TYPE)
+    STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  ELSE(BUILD_TYPE)
+    SET(CMAKE_INSTALL_CONFIG_NAME "Release")
+  ENDIF(BUILD_TYPE)
+  MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+
+# Set the component getting installed.
+IF(NOT CMAKE_INSTALL_COMPONENT)
+  IF(COMPONENT)
+    MESSAGE(STATUS "Install component: \"${COMPONENT}\"")
+    SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  ELSE(COMPONENT)
+    SET(CMAKE_INSTALL_COMPONENT)
+  ENDIF(COMPONENT)
+ENDIF(NOT CMAKE_INSTALL_COMPONENT)
+
+# Install shared libraries without execute permission?
+IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  SET(CMAKE_INSTALL_SO_NO_EXE "1")
+ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+

From a45174ceee5506f935d4b0ac16e8b516440bea61 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 25 Aug 2015 14:36:34 +0800
Subject: [PATCH 036/124] modified the packing number

---
 include/caffe/common.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 4cd372a6..8113c181 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -84,7 +84,7 @@ private:\
 #define use_packing_scheme 1
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
-#define global_packing_N 32
+#define global_packing_N 16
 /*ifdef: use multi-command queues for groups in conv layer;
  ifndef: use single commane queue for groups*/
 //#define multiQ

From 5822b9357570bebe66f3ff69690c6280b5e782be Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 27 Aug 2015 15:00:54 +0800
Subject: [PATCH 037/124] remove all cuda related flags in Makefile

---
 Makefile        | 75 ++++++++-----------------------------------------
 Makefile.config |  4 +--
 2 files changed, 13 insertions(+), 66 deletions(-)

diff --git a/Makefile b/Makefile
index f0ac9e06..905a19c3 100644
--- a/Makefile
+++ b/Makefile
@@ -38,13 +38,10 @@ DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so
 ##############################
 # CXX_SRCS are the source files excluding the test ones.
 CXX_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cpp" -name "*.cpp")
-# CU_SRCS are the cuda source files
-#CU_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cu" -name "*.cu")
 # TEST_SRCS are the test source files
 TEST_MAIN_SRC := src/$(PROJECT)/test/test_caffe_main.cpp
 TEST_SRCS := $(shell find src/$(PROJECT) -name "test_*.cpp")
 TEST_SRCS := $(filter-out $(TEST_MAIN_SRC), $(TEST_SRCS))
-TEST_CU_SRCS := $(shell find src/$(PROJECT) -name "test_*.cu")
 GTEST_SRC := src/gtest/gtest-all.cpp
 # TOOL_SRCS are the source files for the tool binaries
 TOOL_SRCS := $(shell find tools -name "*.cpp")
@@ -68,7 +65,7 @@ NONGEN_CXX_SRCS := $(shell find \
 	matlab/+$(PROJECT)/private \
 	examples \
 	tools \
-	-name "*.cpp" -or -name "*.hpp" -or -name "*.cu" -or -name "*.cuh")
+	-name "*.cpp" -or -name "*.hpp")
 LINT_SCRIPT := scripts/cpp_lint.py
 LINT_OUTPUT_DIR := $(BUILD_DIR)/.lint
 LINT_EXT := lint.txt
@@ -103,22 +100,19 @@ PROTO_GEN_PY := $(foreach file,${PROTO_SRCS:.proto=_pb2.py}, \
 # These objects will be linked into the final shared library, so we
 # exclude the tool, example, and test objects.
 CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o})
-CU_OBJS := $(addprefix $(BUILD_DIR)/cuda/, ${CU_SRCS:.cu=.o})
 PROTO_OBJS := ${PROTO_GEN_CC:.cc=.o}
-OBJS := $(PROTO_OBJS) $(CXX_OBJS) $(CU_OBJS)
+OBJS := $(PROTO_OBJS) $(CXX_OBJS) 
 # tool, example, and test objects
 TOOL_OBJS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o})
 TOOL_BUILD_DIR := $(BUILD_DIR)/tools
 TEST_CXX_BUILD_DIR := $(BUILD_DIR)/src/$(PROJECT)/test
-TEST_CU_BUILD_DIR := $(BUILD_DIR)/cuda/src/$(PROJECT)/test
 TEST_CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o})
-TEST_CU_OBJS := $(addprefix $(BUILD_DIR)/cuda/, ${TEST_CU_SRCS:.cu=.o})
-TEST_OBJS := $(TEST_CXX_OBJS) $(TEST_CU_OBJS)
+TEST_OBJS := $(TEST_CXX_OBJS) 
 GTEST_OBJ := $(addprefix $(BUILD_DIR)/, ${GTEST_SRC:.cpp=.o})
 EXAMPLE_OBJS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o})
 # Output files for automatic dependency generation
-DEPS := ${CXX_OBJS:.o=.d} ${CU_OBJS:.o=.d} ${TEST_CXX_OBJS:.o=.d} \
-	${TEST_CU_OBJS:.o=.d} $(BUILD_DIR)/${MAT$(PROJECT)_SO:.$(MAT_SO_EXT)=.d}
+DEPS := ${CXX_OBJS:.o=.d} ${TEST_CXX_OBJS:.o=.d} \
+	 $(BUILD_DIR)/${MAT$(PROJECT)_SO:.$(MAT_SO_EXT)=.d}
 # tool, example, and test bins
 TOOL_BINS := ${TOOL_OBJS:.o=.bin}
 EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin}
@@ -126,11 +120,9 @@ EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin}
 TOOL_BIN_LINKS := ${TOOL_BINS:.bin=}
 # Put the test binaries in build/test for convenience.
 TEST_BIN_DIR := $(BUILD_DIR)/test
-TEST_CU_BINS := $(addsuffix .testbin,$(addprefix $(TEST_BIN_DIR)/, \
-		$(foreach obj,$(TEST_CU_OBJS),$(basename $(notdir $(obj))))))
 TEST_CXX_BINS := $(addsuffix .testbin,$(addprefix $(TEST_BIN_DIR)/, \
 		$(foreach obj,$(TEST_CXX_OBJS),$(basename $(notdir $(obj))))))
-TEST_BINS := $(TEST_CXX_BINS) $(TEST_CU_BINS)
+TEST_BINS := $(TEST_CXX_BINS) 
 # TEST_ALL_BIN is the test binary that links caffe dynamically.
 TEST_ALL_BIN := $(TEST_BIN_DIR)/test_all.testbin
 
@@ -139,30 +131,15 @@ TEST_ALL_BIN := $(TEST_BIN_DIR)/test_all.testbin
 ##############################
 WARNS_EXT := warnings.txt
 CXX_WARNS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o.$(WARNS_EXT)})
-CU_WARNS := $(addprefix $(BUILD_DIR)/cuda/, ${CU_SRCS:.cu=.o.$(WARNS_EXT)})
 TOOL_WARNS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o.$(WARNS_EXT)})
 EXAMPLE_WARNS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o.$(WARNS_EXT)})
 TEST_WARNS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o.$(WARNS_EXT)})
-TEST_CU_WARNS := $(addprefix $(BUILD_DIR)/cuda/, ${TEST_CU_SRCS:.cu=.o.$(WARNS_EXT)})
 ALL_CXX_WARNS := $(CXX_WARNS) $(TOOL_WARNS) $(EXAMPLE_WARNS) $(TEST_WARNS)
-ALL_CU_WARNS := $(CU_WARNS) $(TEST_CU_WARNS)
-ALL_WARNS := $(ALL_CXX_WARNS) $(ALL_CU_WARNS)
+ALL_WARNS := $(ALL_CXX_WARNS) 
 
 EMPTY_WARN_REPORT := $(BUILD_DIR)/.$(WARNS_EXT)
 NONEMPTY_WARN_REPORT := $(BUILD_DIR)/$(WARNS_EXT)
 
-##############################
-# Derive include and lib directories
-##############################
-CUDA_INCLUDE_DIR := $(CUDA_DIR)/include
-
-CUDA_LIB_DIR :=
-# add <cuda>/lib64 only if it exists
-ifneq ("$(wildcard $(CUDA_DIR)/lib64)","")
-	CUDA_LIB_DIR += $(CUDA_DIR)/lib64
-endif
-CUDA_LIB_DIR += $(CUDA_DIR)/lib
-
 #################################
 # OpenCL include and library 
 #################################
@@ -189,10 +166,6 @@ endif
 
 INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include
 ifneq ($(CPU_ONLY), 1)
-	INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
-	LIBRARY_DIRS += $(CUDA_LIB_DIR)
-	LIBRARIES := cudart cublas curand
-        
         INCLUDE_DIRS += $(OCL_INCLUDE_DIR) + $(CLBLAS_INCLUDE_DIR)
         LIBRARY_DIRS += $(OCL_LIB_DIR) + $(CLBLAS_LIB_DIR)
         LIBRARIES += OpenCL clBLAS
@@ -216,7 +189,6 @@ ifneq ($(strip $(DISTRIBUTE_DIR)),distribute)
 endif
 
 ALL_BUILD_DIRS := $(sort $(BUILD_DIR) $(addprefix $(BUILD_DIR)/, $(SRC_DIRS)) \
-	$(addprefix $(BUILD_DIR)/cuda/, $(SRC_DIRS)) \
 	$(LIB_BUILD_DIR) $(TEST_BIN_DIR) $(PY_PROTO_BUILD_DIR) $(LINT_OUTPUT_DIR) \
 	$(DISTRIBUTE_SUBDIRS) $(PROTO_BUILD_INCLUDE_DIR))
 
@@ -235,7 +207,7 @@ DOXYGEN_SOURCES := $(shell find \
 	matlab/ \
 	examples \
 	tools \
-	-name "*.cpp" -or -name "*.hpp" -or -name "*.cu" -or -name "*.cuh" -or \
+	-name "*.cpp" -or -name "*.hpp"  -or \
         -name "*.py" -or -name "*.m")
 DOXYGEN_SOURCES += $(DOXYGEN_CONFIG_FILE)
 
@@ -271,13 +243,8 @@ endif
 ifeq ($(OSX), 1)
 	CXX := /usr/bin/clang++
 	ifneq ($(CPU_ONLY), 1)
-		CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release \d' | grep -o '\d')
-		ifeq ($(shell echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1)
-			CXXFLAGS += -stdlib=libstdc++
-			LINKFLAGS += -stdlib=libstdc++
-		endif
-		# clang throws this warning for cuda headers
-		WARNINGS += -Wno-unneeded-internal-declaration
+	    # todo
+            #############
 	endif
 	# gtest needs to use its own tuple to not conflict with clang
 	COMMON_FLAGS += -DGTEST_USE_OWN_TR1_TUPLE=1
@@ -313,12 +280,6 @@ else
 	COMMON_FLAGS += -DNDEBUG -O2
 endif
 
-# cuDNN acceleration configuration.
-ifeq ($(USE_CUDNN), 1)
-	LIBRARIES += cudnn
-	COMMON_FLAGS += -DUSE_CUDNN
-endif
-
 # CPU-only configuration
 ifeq ($(CPU_ONLY), 1)
 	OBJS := $(PROTO_OBJS) $(CXX_OBJS)
@@ -403,7 +364,7 @@ PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library))
 #
 # * Recursive with the exception that symbolic links are never followed, per the
 # default behavior of 'find'.
-SUPERCLEAN_EXTS := .so .a .o .bin .testbin .pb.cc .pb.h _pb2.py .cuo
+SUPERCLEAN_EXTS := .so .a .o .bin .testbin .pb.cc .pb.h _pb2.py 
 
 # Set the sub-targets of the 'everything' target.
 EVERYTHING_TARGETS := all py$(PROJECT) test warn lint
@@ -554,26 +515,12 @@ $(PROTO_BUILD_DIR)/%.pb.o: $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_GEN_HEADER) \
 		|| (cat $@.$(WARNS_EXT); exit 1)
 	@ cat $@.$(WARNS_EXT)
 
-#$(BUILD_DIR)/cuda/%.o: %.cu | $(ALL_BUILD_DIRS)
-#	@ echo NVCC $<
-#	$(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
-#		-odir $(@D)
-#	$(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 2> $@.$(WARNS_EXT) \
-#		|| (cat $@.$(WARNS_EXT); exit 1)
-#	@ cat $@.$(WARNS_EXT)
-
 $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \
 		| $(DYNAMIC_NAME) $(TEST_BIN_DIR)
 	@ echo CXX/LD -o $@ $<
 	$(Q)$(CXX) $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \
 		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib
 
-$(TEST_CU_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CU_BUILD_DIR)/%.o \
-	$(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR)
-	@ echo LD $<
-	$(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \
-		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib
-
 $(TEST_CXX_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CXX_BUILD_DIR)/%.o \
 	$(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR)
 	@ echo LD $<
diff --git a/Makefile.config b/Makefile.config
index 2d8124d6..829e2732 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -12,14 +12,14 @@
 # CUSTOM_CXX := g++
 
 # CUDA directory contains bin/ and lib/ directories that we need.
-CUDA_DIR := /usr/local/cuda
+#CUDA_DIR := /usr/local/cuda
 # On Ubuntu 14.04, if cuda tools are installed via
 # "sudo apt-get install nvidia-cuda-toolkit" then use this instead:
 # CUDA_DIR := /usr
 
 # CUDA architecture setting: going with all of them.
 # For CUDA < 6.0, comment the *_50 lines for compatibility.
-CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
+#CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
 		-gencode arch=compute_20,code=sm_21 \
 		-gencode arch=compute_30,code=sm_30 \
 		-gencode arch=compute_35,code=sm_35 \

From 4e424b45014446459e2142ffb9a0dd24512e56be Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 25 Aug 2015 14:36:34 +0800
Subject: [PATCH 038/124] modified the packing number

---
 include/caffe/common.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 4cd372a6..8113c181 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -84,7 +84,7 @@ private:\
 #define use_packing_scheme 1
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
-#define global_packing_N 32
+#define global_packing_N 16
 /*ifdef: use multi-command queues for groups in conv layer;
  ifndef: use single commane queue for groups*/
 //#define multiQ

From 02762d4d22125a59673e30e50da2fb5da07b6927 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Thu, 27 Aug 2015 23:22:31 +0800
Subject: [PATCH 039/124] add clFinish in test

---
 tools/caffe.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index e350866f..d7953bdd 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -7,6 +7,7 @@
 
 #include "boost/algorithm/string.hpp"
 #include "caffe/caffe.hpp"
+#include "caffe/device.hpp"
 
 using caffe::Blob;
 using caffe::Caffe;
@@ -15,7 +16,7 @@ using caffe::Layer;
 using caffe::shared_ptr;
 using caffe::Timer;
 using caffe::vector;
-
+using caffe::amdDevice;
 
 DEFINE_int32(gpu, -1,
     "Run in GPU mode on given device ID.");
@@ -117,7 +118,7 @@ int train() {
     LOG(INFO) << "Use CPU.";
     Caffe::set_mode(Caffe::CPU);
   }
-
+  
   LOG(INFO) << "Starting Optimization";
   shared_ptr<caffe::Solver<float> >
     solver(caffe::GetSolver<float>(solver_param));
@@ -246,6 +247,9 @@ int time() {
   std::vector<double> backward_time_per_layer(layers.size(), 0.0);
   double forward_time = 0.0;
   double backward_time = 0.0;
+
+  clFinish(amdDevice.CommandQueue);
+
   for (int j = 0; j < FLAGS_iterations; ++j) {
     Timer iter_timer;
     iter_timer.Start();
@@ -253,6 +257,9 @@ int time() {
     for (int i = 0; i < layers.size(); ++i) {
       timer.Start();
       layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
+
+      clFinish(amdDevice.CommandQueue);
+
       forward_time_per_layer[i] += timer.MicroSeconds();
     }
     forward_time += forward_timer.MicroSeconds();
@@ -261,6 +268,9 @@ int time() {
       timer.Start();
       layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
                           bottom_vecs[i]);
+      
+      clFinish(amdDevice.CommandQueue);
+      
       backward_time_per_layer[i] += timer.MicroSeconds();
     }
     backward_time += backward_timer.MicroSeconds();

From 34401f6b35b45ecfd985c7754e2344a9f0526556 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Thu, 27 Aug 2015 23:34:56 +0800
Subject: [PATCH 040/124] fix cmake

---
 cmake/CaffeConfig.cmake           | 61 +++++++++++++++++++++++++++++++
 cmake/Dependencies.cmake          | 23 +++++++-----
 cmake/OpenCL.cmake                | 26 +++++++++++++
 cmake/Summary.cmake               |  2 +
 cmake/Templates/caffe_config.h.in |  4 ++
 5 files changed, 107 insertions(+), 9 deletions(-)
 create mode 100644 cmake/CaffeConfig.cmake
 create mode 100644 cmake/OpenCL.cmake

diff --git a/cmake/CaffeConfig.cmake b/cmake/CaffeConfig.cmake
new file mode 100644
index 00000000..076edc5d
--- /dev/null
+++ b/cmake/CaffeConfig.cmake
@@ -0,0 +1,61 @@
+# Config file for the Caffe package.
+#
+# Note:
+#   Caffe and this config file depends on opencv,
+#   so put `find_package(OpenCV)` before searching Caffe
+#   via `find_package(Caffe)`. All other lib/includes
+#   dependencies are hard coded in the file
+#
+# After successful configuration the following variables
+# will be defined:
+#
+#   Caffe_INCLUDE_DIRS - Caffe include directories
+#   Caffe_LIBRARIES    - libraries to link against
+#   Caffe_DEFINITIONS  - a list of definitions to pass to compiler
+#
+#   Caffe_HAVE_CUDA    - signals about CUDA support
+#   Caffe_HAVE_CUDNN   - signals about cuDNN support
+
+
+# OpenCV dependency
+
+if(NOT OpenCV_FOUND)
+  set(Caffe_OpenCV_CONFIG_PATH "/usr/local/share/OpenCV")
+  if(Caffe_OpenCV_CONFIG_PATH)
+    get_filename_component(Caffe_OpenCV_CONFIG_PATH ${Caffe_OpenCV_CONFIG_PATH} ABSOLUTE)
+
+    if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core)
+      message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}")
+      include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake)
+    endif()
+
+  else()
+    find_package(OpenCV REQUIRED)
+  endif()
+  unset(Caffe_OpenCV_CONFIG_PATH)
+endif()
+
+# Compute paths
+get_filename_component(Caffe_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+set(Caffe_INCLUDE_DIRS "/usr/local/include;/usr/include;/opt/AMDAPPSDK-2.9-1/include;/opt/clBLAS-2.1/include;/usr/local/include/opencv;/usr/include/atlas")
+
+get_filename_component(__caffe_include "${Caffe_CMAKE_DIR}/../../include" ABSOLUTE)
+list(APPEND Caffe_INCLUDE_DIRS ${__caffe_include})
+unset(__caffe_include)
+
+
+# Our library dependencies
+if(NOT TARGET caffe AND NOT caffe_BINARY_DIR)
+  include("${Caffe_CMAKE_DIR}/CaffeTargets.cmake")
+endif()
+
+# List of IMPORTED libs created by CaffeTargets.cmake
+set(Caffe_LIBRARIES caffe)
+
+# Definitions
+set(Caffe_DEFINITIONS "-DCPU_ONLY")
+
+# Cuda support variables
+set(Caffe_CPU_ONLY OFF)
+set(Caffe_HAVE_CUDA FALSE)
+set(Caffe_HAVE_CUDNN FALSE)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 7c86dd55..c4026084 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -44,17 +44,22 @@ include_directories(SYSTEM ${Snappy_INCLUDE_DIR})
 list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES})
 
 # ---[ CUDA
-include(cmake/Cuda.cmake)
-if(NOT HAVE_CUDA)
-  if(CPU_ONLY)
-    message("-- CUDA is disabled. Building without it...")
-  else()
-    message("-- CUDA is not detected by cmake. Building without it...")
-  endif()
+#include(cmake/Cuda.cmake)
+#if(NOT HAVE_CUDA)
+#  if(CPU_ONLY)
+#    message("-- CUDA is disabled. Building without it...")
+#  else()
+#    message("-- CUDA is not detected by cmake. Building without it...")
+#  endif()
 
   # TODO: remove this not cross platform define in future. Use caffe_config.h instead.
-  add_definitions(-DCPU_ONLY)
-endif()
+#  add_definitions(-DCPU_ONLY)
+#endif()
+
+# ---[ OpenCL
+include(cmake/OpenCL.cmake)
+include_directories(SYSTEM ${OCL_INCLUDE_DIR} ${CLBLAS_INCLUDE_DIR})
+list(APPEND Caffe_LINKER_LIBS ${OCL_LIBRARIES} ${CLBLAS_LIBRARIES})
 
 # ---[ OpenCV
 find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
diff --git a/cmake/OpenCL.cmake b/cmake/OpenCL.cmake
new file mode 100644
index 00000000..e6d94642
--- /dev/null
+++ b/cmake/OpenCL.cmake
@@ -0,0 +1,26 @@
+if(CPU_ONLY)
+  return()
+endif()
+
+#find_path(OCL_INCLUDE_DIR  NAMES  CL/cl.h PATHS "$ENV{AMDAPPSDKROOT}/include")
+#find_library(OCL_LIBRARIES NAMES libOpenCL.so PATHS "$ENV{AMDAPPSDKROOT}/lib/x86_64")
+
+#find_path(CLBLAS_INCLUDE_DIR  NAMES clBLAS.h     PATHS /opt/clBLAS-2.1/include  $ENV{C_INCLUDE_PATH} $ENV{CPLUS_INCLUDE_PATH})
+#find_library(CLBLAS_LIBRARIES NAMES libclBLAS.so PATHS $ENV{LD_LIBRARY_PATH})
+
+#if(OCL_INCLUDE_DIR AND OCL_LIBRARIES)
+#    set(OCL_FOUND  TRUE PARENT_SCOPE)
+#    message(STATUS "Found OpenCL (include: ${OCL_INCLUDE_DIR}, library: ${OCL_LIBRARIES})")
+#endif()
+
+#if(CLBLAS_INCLUDE_DIR AND CLBLAS_LIBRARIES)
+#    set(CLBLAS_FOUND  TRUE PARENT_SCOPE)
+#endif()
+
+set(OCL_INCLUDE_DIR /opt/AMDAPPSDK-2.9-1/include)
+set(OCL_LIBRARIES /opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so)
+set(CLBLAS_INCLUDE_DIR /opt/clBLAS-2.1/include)
+set(CLBLAS_LIBRARIES /opt/clBLAS-2.1/lib64/libclBLAS.so)
+
+
+
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index e094ac00..19782add 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -126,6 +126,8 @@ function(caffe_print_configuration_summary)
   caffe_status("  LevelDB           : " LEVELDB_FOUND THEN  "Yes (ver. ${LEVELDB_VERSION})" ELSE "No")
   caffe_status("  OpenCV            :   Yes (ver. ${OpenCV_VERSION})")
   caffe_status("  CUDA              : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" )
+#  caffe_status("  OpenCL            : " OCL_FOUND THEN "Yes" ELSE "No")
+#  caffe_status("  clBLAS            : " CLBLAS_FOUND THEN "Yes" ELSE "No")
   caffe_status("")
   if(HAVE_CUDA)
     caffe_status("NVIDIA CUDA:")
diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in
index 6039e8f6..ca9a3a9a 100644
--- a/cmake/Templates/caffe_config.h.in
+++ b/cmake/Templates/caffe_config.h.in
@@ -14,6 +14,10 @@
 /* NVIDA cuDNN */
 #cmakedefine CPU_ONLY
 
+/* OpenCL & clBLAS*/
+#cmakedefine OCL_FOUND
+#cmakedefine CLBLAS_FOUND
+
 /* Test device */
 #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE}
 

From dfa3955728b3edcb29034e7dd36ba4590fc78eea Mon Sep 17 00:00:00 2001
From: Noplz <yuan.gao@noplz.name>
Date: Fri, 28 Aug 2015 14:58:45 +0800
Subject: [PATCH 041/124] Remove cuda related code

---
 include/caffe/common.hpp                | 12 ++++++------
 include/caffe/util/benchmark.hpp        |  4 ++--
 include/caffe/util/device_alternate.hpp | 18 +++++++++---------
 src/caffe/common.cpp                    | 12 ++++++------
 src/caffe/util/math_functions.cpp       |  2 +-
 5 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 8113c181..b93e0d6d 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -218,10 +218,10 @@ class Caffe {
     return *(Get().random_generator_);
   }
 #ifndef CPU_ONLY
-  inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
-  inline static curandGenerator_t curand_generator() {
-    return Get().curand_generator_;
-  }
+  //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
+  //inline static curandGenerator_t curand_generator() {
+  //  return Get().curand_generator_;
+  //}
 #endif
 
   // Returns the mode: running on CPU or GPU.
@@ -245,8 +245,8 @@ class Caffe {
 
  protected:
 #ifndef CPU_ONLY
-  cublasHandle_t cublas_handle_;
-  curandGenerator_t curand_generator_;
+  //cublasHandle_t cublas_handle_;
+  //curandGenerator_t curand_generator_;
 #endif
   shared_ptr<RNG> random_generator_;
 
diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp
index d6358277..890f31bf 100644
--- a/include/caffe/util/benchmark.hpp
+++ b/include/caffe/util/benchmark.hpp
@@ -28,8 +28,8 @@ class Timer {
   bool running_;
   bool has_run_at_least_once_;
 #ifndef CPU_ONLY
-  cudaEvent_t start_gpu_;
-  cudaEvent_t stop_gpu_;
+  //cudaEvent_t start_gpu_;
+  //cudaEvent_t stop_gpu_;
 #endif
   boost::posix_time::ptime start_cpu_;
   boost::posix_time::ptime stop_cpu_;
diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp
index 6ea595db..9184f4f9 100644
--- a/include/caffe/util/device_alternate.hpp
+++ b/include/caffe/util/device_alternate.hpp
@@ -31,11 +31,11 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
 
 #else  // Normal GPU + CPU Caffe.
 
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <curand.h>
-#include <driver_types.h>  // cuda driver types
+//#include <cublas_v2.h>
+//#include <cuda.h>
+//#include <cuda_runtime.h>
+//#include <curand.h>
+//#include <driver_types.h>  // cuda driver types
 #ifdef USE_CUDNN  // cuDNN acceleration library.
 #include "caffe/util/cudnn.hpp"
 #endif
@@ -45,8 +45,8 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
 //
 
 // CUDA: various checks for different function calls.
+/*
 #define CUDA_CHECK(condition) \
-  /* Code block avoids redefinition of cudaError_t error */ \
   do { \
     cudaError_t error = condition; \
     CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
@@ -74,12 +74,12 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
 
 // CUDA: check for error after kernel execution and exit loudly if there is one.
 #define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())
-
+*/
 namespace caffe {
 
 // CUDA: library error reporting.
-const char* cublasGetErrorString(cublasStatus_t error);
-const char* curandGetErrorString(curandStatus_t error);
+//const char* cublasGetErrorString(cublasStatus_t error);
+//const char* curandGetErrorString(curandStatus_t error);
 
 // CUDA: thread number configuration.
 // Use 1024 threads per block, which requires cuda sm_2x or above,
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 5d56493b..3891852a 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -223,7 +223,7 @@ void* Caffe::RNG::generator() {
   return static_cast<void*>(generator_->rng());
 }
 
-const char* cublasGetErrorString(cublasStatus_t error) {
+//const char* cublasGetErrorString(cublasStatus_t error) {
  /* switch (error) {
   case CUBLAS_STATUS_SUCCESS:
     return "CUBLAS_STATUS_SUCCESS";
@@ -251,10 +251,10 @@ const char* cublasGetErrorString(cublasStatus_t error) {
 #endif
   }
 */
-  return "Unknown cublas status";
-}
+//  return "Unknown cublas status";
+//}
 
-const char* curandGetErrorString(curandStatus_t error) {
+//const char* curandGetErrorString(curandStatus_t error) {
   /*switch (error) {
   case CURAND_STATUS_SUCCESS:
     return "CURAND_STATUS_SUCCESS";
@@ -284,8 +284,8 @@ const char* curandGetErrorString(curandStatus_t error) {
     return "CURAND_STATUS_INTERNAL_ERROR";
   }
 */
-  return "Unknown curand status";
-}
+ // return "Unknown curand status";
+//}
 
 #endif  // CPU_ONLY
 
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 3bef8b63..d48ec01a 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -765,7 +765,7 @@ void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
     if (Caffe::mode() == Caffe::GPU) {
 #ifndef CPU_ONLY
       // NOLINT_NEXT_LINE(caffe/alt_fn)
-      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
+      //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
 #else
       NO_GPU;
 #endif

From 415f603ec57b7f1f35cfe361fba8f0ff09ba1023 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 30 Aug 2015 16:36:53 +0800
Subject: [PATCH 042/124] add FindOpenCL and FindclBLAS in cmake/

---
 cmake/Dependencies.cmake       |  11 +++-
 cmake/Modules/FindOpenCL.cmake | 108 +++++++++++++++++++++++++++++++++
 cmake/Modules/FindclBLAS.cmake |  98 ++++++++++++++++++++++++++++++
 cmake/OpenCL.cmake             |   4 +-
 cmake/Summary.cmake            |   6 +-
 5 files changed, 219 insertions(+), 8 deletions(-)
 create mode 100644 cmake/Modules/FindOpenCL.cmake
 create mode 100644 cmake/Modules/FindclBLAS.cmake

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index c4026084..eb72e89f 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -57,9 +57,14 @@ list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES})
 #endif()
 
 # ---[ OpenCL
-include(cmake/OpenCL.cmake)
-include_directories(SYSTEM ${OCL_INCLUDE_DIR} ${CLBLAS_INCLUDE_DIR})
-list(APPEND Caffe_LINKER_LIBS ${OCL_LIBRARIES} ${CLBLAS_LIBRARIES})
+find_package(OpenCL REQUIRED)
+include_directories(SYSTEM ${OPENCL_INCLUDE_DIRS})
+list(APPEND Caffe_LINKER_LIBS ${OPENCL_LIBRARIES})
+
+# ---[ clBLAS
+find_package(clBLAS REQUIRED)
+include_directories(SYSTEM ${CLBLAS_INCLUDE_DIRS})
+list(APPEND Caffe_LINKER_LIBS ${CLBLAS_LIBRARIES})
 
 # ---[ OpenCV
 find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake
new file mode 100644
index 00000000..7c23701d
--- /dev/null
+++ b/cmake/Modules/FindOpenCL.cmake
@@ -0,0 +1,108 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# Locate an OpenCL implementation.
+# Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/)
+#
+# Defines the following variables:
+#
+#   OPENCL_FOUND - Found the OPENCL framework
+#   OPENCL_INCLUDE_DIRS - Include directories
+#
+# Also defines the library variables below as normal
+# variables.  These contain debug/optimized keywords when
+# a debugging library is found.
+#
+#   OPENCL_LIBRARIES - libopencl
+#
+# Accepts the following variables as input:
+#
+#   OPENCL_ROOT - (as a CMake or environment variable)
+#                The root directory of the OpenCL implementation found
+#
+#   FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findOpenCL should search for
+#                              64bit or 32bit libs
+#-----------------------
+# Example Usage:
+#
+#    find_package(OPENCL REQUIRED)
+#    include_directories(${OPENCL_INCLUDE_DIRS})
+#
+#    add_executable(foo foo.cc)
+#    target_link_libraries(foo ${OPENCL_LIBRARIES})
+#
+#-----------------------
+
+set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON)
+
+find_path(OPENCL_INCLUDE_DIRS
+    NAMES OpenCL/cl.h CL/cl.h
+    HINTS
+        ${OPENCL_ROOT}/include
+        $ENV{AMDAPPSDKROOT}/include
+        $ENV{CUDA_PATH}/include
+    PATHS
+        /usr/include
+        /usr/local/include
+        /usr/local/cuda/include
+        /opt/cuda/include
+    DOC "OpenCL header file path"
+)
+mark_as_advanced( OPENCL_INCLUDE_DIRS )
+
+# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else
+get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
+
+if( LIB64 )
+    find_library( OPENCL_LIBRARIES
+        NAMES OpenCL
+        HINTS
+            ${OPENCL_ROOT}/lib
+            $ENV{AMDAPPSDKROOT}/lib
+            $ENV{CUDA_PATH}/lib
+        DOC "OpenCL dynamic library path"
+        PATH_SUFFIXES x86_64 x64
+        PATHS
+            /usr/lib
+            /usr/local/cuda/lib
+            /opt/cuda/lib
+    )
+else( )
+    find_library( OPENCL_LIBRARIES
+        NAMES OpenCL
+        HINTS
+            ${OPENCL_ROOT}/lib
+            $ENV{AMDAPPSDKROOT}/lib
+            $ENV{CUDA_PATH}/lib
+        DOC "OpenCL dynamic library path"
+        PATH_SUFFIXES x86 Win32
+        PATHS
+            /usr/lib
+            /usr/local/cuda/lib
+            /opt/cuda/lib
+    )
+endif( )
+mark_as_advanced( OPENCL_LIBRARIES )
+
+include( FindPackageHandleStandardArgs )
+FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )
+
+if( NOT OPENCL_FOUND )
+    message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
+else ()
+    message( STATUS "Found OpenCL  (include: ${OPENCL_INCLUDE_DIRS}, library: ${OPENCL_LIBRARIES})")
+endif()
diff --git a/cmake/Modules/FindclBLAS.cmake b/cmake/Modules/FindclBLAS.cmake
new file mode 100644
index 00000000..1fa28762
--- /dev/null
+++ b/cmake/Modules/FindclBLAS.cmake
@@ -0,0 +1,98 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# Locate an clBLAS library.
+#
+# Defines the following variables:
+#
+#   CLBLAS_FOUND - Found the CLBLAS library
+#   CLBLAS_INCLUDE_DIRS - Include directories
+#
+# Also defines the library variables below as normal
+# variables.  These contain debug/optimized keywords when
+# a debugging library is found.
+#
+#   CLBLAS_LIBRARIES - libclBLAS
+#
+# Accepts the following variables as input:
+#
+#   CLBLAS_ROOT - (as a CMake or environment variable)
+#                The root directory of the clBLAS library found
+#
+#   FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findclBLAS should search for
+#                              64bit or 32bit libs
+#-----------------------
+# Example Usage:
+#
+#    find_package(clBLAS REQUIRED)
+#    include_directories(${CLBLAS_INCLUDE_DIRS})
+#
+#    add_executable(foo foo.cc)
+#    target_link_libraries(foo ${CLBLAS_LIBRARIES})
+#
+#-----------------------
+
+set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON)
+
+find_path(CLBLAS_INCLUDE_DIRS  NAMES clBLAS.h  
+    HINTS
+        $ENV{CLBLAS_ROOT}/include
+    PATHS
+        /usr/include
+        /usr/local/include
+    DOC "clBLAS header file path"
+)
+mark_as_advanced( CLBLAS_INCLUDE_DIRS )
+
+# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else
+get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
+
+if( LIB64 )
+    find_library( CLBLAS_LIBRARIES
+        NAMES clBLAS
+        HINTS
+            $ENV{CLBLAS_ROOT}/lib64
+        DOC "clBLAS dynamic library path"
+        PATHS
+            /usr/lib
+            /usr/local/lib
+    )
+else( )
+    find_library( CLBLAS_LIBRARIES
+        NAMES clBLAS
+        HINTS
+            $ENV{CLBLAS_ROOT}/lib
+        DOC "clBLAS dynamic library path"
+        PATHS
+            /usr/lib
+            /usr/local/lib
+    )
+endif( )
+mark_as_advanced( CLBLAS_LIBRARIES )
+
+if (NOT CLBLAS_INCLUDE_DIRS)
+   set(CLBLAS_FOUND ON)
+endif()
+
+include( FindPackageHandleStandardArgs )
+FIND_PACKAGE_HANDLE_STANDARD_ARGS( CLBLAS DEFAULT_MSG CLBLAS_LIBRARIES CLBLAS_INCLUDE_DIRS )
+
+if( NOT CLBLAS_FOUND )
+    message( STATUS "FindclBLAS looked for libraries named: clBLAS" )
+else ()
+    message( STATUS "Found clBLAS  (include: ${CLBLAS_INCLUDE_DIRS}, library: ${CLBLAS_LIBRARIES})")
+endif()
diff --git a/cmake/OpenCL.cmake b/cmake/OpenCL.cmake
index e6d94642..c83ce7eb 100644
--- a/cmake/OpenCL.cmake
+++ b/cmake/OpenCL.cmake
@@ -17,8 +17,8 @@ endif()
 #    set(CLBLAS_FOUND  TRUE PARENT_SCOPE)
 #endif()
 
-set(OCL_INCLUDE_DIR /opt/AMDAPPSDK-2.9-1/include)
-set(OCL_LIBRARIES /opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so)
+#set(OCL_INCLUDE_DIR /opt/AMDAPPSDK-2.9-1/include)
+#set(OCL_LIBRARIES /opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so)
 set(CLBLAS_INCLUDE_DIR /opt/clBLAS-2.1/include)
 set(CLBLAS_LIBRARIES /opt/clBLAS-2.1/lib64/libclBLAS.so)
 
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 19782add..2d95b0a9 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -125,9 +125,9 @@ function(caffe_print_configuration_summary)
   caffe_status("  Snappy            : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" )
   caffe_status("  LevelDB           : " LEVELDB_FOUND THEN  "Yes (ver. ${LEVELDB_VERSION})" ELSE "No")
   caffe_status("  OpenCV            :   Yes (ver. ${OpenCV_VERSION})")
-  caffe_status("  CUDA              : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" )
-#  caffe_status("  OpenCL            : " OCL_FOUND THEN "Yes" ELSE "No")
-#  caffe_status("  clBLAS            : " CLBLAS_FOUND THEN "Yes" ELSE "No")
+#  caffe_status("  CUDA              : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" )
+  caffe_status("  OpenCL            : " OPENCL_FOUND THEN "Yes" ELSE "No")
+  caffe_status("  clBLAS            : " CLBLAS_FOUND THEN "Yes" ELSE "No")
   caffe_status("")
   if(HAVE_CUDA)
     caffe_status("NVIDIA CUDA:")

From 17104ed502ae133bab908aa967c4dc1e395ca26f Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Tue, 1 Sep 2015 14:50:21 +0800
Subject: [PATCH 043/124] Fixed conv layers opt2 bug

---
 include/caffe/common.hpp             |   2 +-
 include/caffe/vision_layers.hpp      |   8 ++-
 src/caffe/device.cpp                 |  42 +----------
 src/caffe/layers/base_conv_layer.cpp | 101 ++++++++-------------------
 src/caffe/layers/conv_layer.cpp      |  47 ++++---------
 5 files changed, 52 insertions(+), 148 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index b93e0d6d..97d1a985 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -81,7 +81,7 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-#define use_packing_scheme 1
+#define use_packing_scheme 1 
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
 #define global_packing_N 16
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 237e9cbf..2f2d7eef 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -105,6 +105,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
     col2im_gpu(col_buff, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_);
   }
+ protected:
   inline void conv_im2col_gpu_opt(const Dtype* data) {
      im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
            kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2);
@@ -113,11 +114,12 @@ class BaseConvolutionLayer : public Layer<Dtype> {
     col2im_gpu_opt((Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
 }
+ private:
   inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
-    transform_gpu((Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2);
+    transform_gpu((Dtype*)temp_buffer, top_data, top_offset_, N_, M_*opt_num2, opt_num2);
 }
  inline void conv_transpose_gpu(const Dtype* data){
-    opttrans(data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
+    opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
 }
 protected:
   inline void gpu_memset(Dtype* data, Dtype value, int count) {
@@ -147,7 +149,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   int weight_offset_;
   int col_offset_;
   int output_offset_;
-  int top_offset_, top_offset_n, bottom_offset_;
+  int top_offset_, top_offset_opt, bottom_offset_;
 public:
   static cl_mem subTopMem, transMem;
   static size_t subtop_mem_size, trans_mem_size;
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 7a866c11..960d8bf1 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -75,9 +75,7 @@ cl_int Device::Init(){
     GetDeviceInfo();
     cl_uint uiNumDevices;
     cl_bool unified_memory = false;
-/*    switch(Caffe::mode()) {
-    case Caffe::GPU:
-         //choose_gpu();
+
       clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
       uiNumDevices = numDevices;
       if(0 == uiNumDevices){
@@ -95,44 +93,6 @@ cl_int Device::Init(){
             }
          }
        }
-         LOG(INFO) << "picked device type: GPU";
-         break;
-    case Caffe::CPU:
-         //choose_cpu();
-         clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);
-         uiNumDevices = numDevices;
-        if(0 == uiNumDevices){
-          LOG(FATAL) << "Err: No CPU devices";
-          }
-         pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
-         OCL_CHECK( clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 1, pDevices, NULL) );
-         LOG(INFO) << "picked device type: CPU";
-         break;
-*/  
-//  case Caffe::APU:
-        clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
-        uiNumDevices = numDevices;
-        if(0 == uiNumDevices){
-          LOG(FATAL) << "Err: No GPU devices";
-         }
-         else{
-          pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
-          OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices));
-          for (int i = 0; i < (int)uiNumDevices; i++){
-            clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL);
-             if(unified_memory) //we pick the first GPU we found
-              pDevices[0] = pDevices[i];
-             else {//skip dGPU
-               continue;
-               }
-         }
-       }
-         LOG(INFO) << "picked device type: APU";
-  //       break;
-  //  default:
-  //       LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  //  }
-
     //Create Context
     Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
     if(NULL == Context){
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 1c1379b3..faa7b63c 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -33,19 +33,9 @@ void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size)
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::ocl_setup() {
-/*  im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL);
-  col2im_gpu_kernel = clCreateKernel(amdDevice.Program,"col2im_gpu_float_kernel", NULL);
-  oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL);
-  im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL);
-  col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL);
-  opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL);
-  ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL);
-  ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL);
-*/
-  M_ = conv_out_channels_ / group_;
-  K_ = kernel_dim_ / group_;
-  N_ =  conv_out_spatial_dim_;
-
+  M_ = num_output_ / group_;
+  K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_;
+  N_ = height_out_ * width_out_;
 #ifdef use_packing_scheme
   size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
   size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
@@ -56,15 +46,6 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
 
 template <typename Dtype>
  BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer(){
- /*
-  OCL_CHECK( clReleaseKernel(im2col_gpu_kernel) );
-  OCL_CHECK( clReleaseKernel(col2im_gpu_kernel) );
-  OCL_CHECK( clReleaseKernel(oclmem_kernel) );
-  OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) );
-  OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) );
-  OCL_CHECK( clReleaseKernel(im2col_opt_kernel) );
-  OCL_CHECK( clReleaseKernel(col2im_opt_kernel) );
-*/
 }
 
 
@@ -314,9 +295,10 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
   cl_event prof_event;
   if (!is_1x1_) {
     if (!skip_im2col) {
-      conv_im2col_gpu_opt(input);
+      //conv_im2col_gpu_opt(input);
+      im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_,
+                 (Dtype*)transMem, 0, opt_num2);
     }   
-    //col_buff = col_buffer_.gpu_data();
   }
 #ifdef multiQ
     for (int g = 0; g < group_; ++g) {
@@ -324,7 +306,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
        else Queue =  amdDevice.CommandQueue_helper;
        prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset_ * g);
+          (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
        }
      if(group_ == 2){
        clFinish(amdDevice.CommandQueue);
@@ -335,10 +317,11 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
     for (int g = 0; g < group_; ++g) {
        prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset_ * g);
+          (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
        }
 #endif
-   conv_transform_gpu((Dtype*)subTopMem, output);
+   //conv_transform_gpu((Dtype*)subTopMem, output);
+   transform_gpu((Dtype*)subTopMem, output, top_offset_, N_, M_*group_, opt_num2);
 }
 
 
@@ -358,7 +341,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_bias_opt(Dtype* output,
       caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
           N_, 1, (Dtype)1., bias, 0,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-          (Dtype)1., output, top_offset_n + num_output_ * N_ * z);
+          (Dtype)1., output, top_offset_ + num_output_ * N_ * z);
 }
 
 template <typename Dtype>
@@ -371,7 +354,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
   for (int g = 0; g < group_; ++g) {
         caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
           (Dtype)1., weights,  weight_offset_ * g,
-          output, top_offset_+output_offset_ * g,
+          output, top_offset_ + output_offset_ * g,
           (Dtype)0., col_buff, col_offset_ * g);
   }
   if (!is_1x1_) {
@@ -382,7 +365,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
     const Dtype* weights, Dtype* input) {
-  //Dtype* col_buff = col_buffer_.mutable_gpu_data();
   cl_command_queue Queue;
   if (is_1x1_) {
     int count = height_ * width_ * conv_in_channels_ * opt_num2;
@@ -395,9 +377,9 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 #else
        Queue =  amdDevice.CommandQueue;
 #endif
-       caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, conv_out_channels_ / group_,
+       caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_,
           (Dtype)1., weights,  weight_offset_ * g,
-          (Dtype*)subTopMem, top_offset_ * g,
+          (Dtype*)subTopMem, top_offset_opt * g,
           (Dtype)0., (Dtype*)transMem, col_offset_ * g);
       }
 #ifdef multiQ
@@ -408,8 +390,10 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 #endif
 
   if (!is_1x1_) {
-      conv_col2im_gpu_opt(input);
-  }
+      //conv_col2im_gpu_opt(input);
+      col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
+                  stride_w_, input, bottom_offset_, opt_num2);
+   }
 }
 
 template <typename Dtype>
@@ -433,10 +417,14 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
   const Dtype* col_buff = input;
   cl_command_queue Queue;
   if (!is_1x1_) {
-    conv_im2col_gpu_opt(input);
-    //col_buff = col_buffer_.gpu_data();
+    //conv_im2col_gpu_opt(input);
+   im2col_gpu_opt(input, bottom_offset_, channels_, height_,
+                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
   }
-    conv_transpose_gpu(output);
+    //conv_transpose_gpu(output);
+    int height_top = M_ * group_, width_top = N_;
+    opttrans(output, top_offset_, 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
+
 
   for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
@@ -445,8 +433,8 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
 #else
        Queue =  amdDevice.CommandQueue;
 #endif
-       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2,
-        (Dtype)1., (Dtype*)subTopMem, top_offset_ * g,
+       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
+        (Dtype)1., (Dtype*)subTopMem, top_offset_opt * g,
         (Dtype*)transMem, col_offset_ * g, (Dtype)1.,
         (Dtype*)weights, weight_offset_ * g);
 #ifdef multiQ
@@ -461,10 +449,8 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
     const Dtype* input) {
- /* caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
-      input, bias_multiplier_.gpu_data(), 1., bias);*/
-      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_*width_out_,
-          (Dtype)1., input, top_offset_, height_out_*width_out_,
+      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_, 
+          (Dtype)1., input, top_offset_, N_,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
           bias, (size_t)0, 1);
 }
@@ -475,12 +461,9 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
 
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
-     //CHECK_BLOB_DATA(bottom[i],10,"bottom");
     Dtype* top_data = top[i]->mutable_gpu_data();
 
   Dtype* col_data = col_buffer_.mutable_gpu_data();
-  /*in the packing schme, M, K stay the same. N multiplies by opt_num becomes much bigger N'. 
-   N' is the M in sgemm call.*/
   int M_org = M_ * group_;
   int col_offset = K_ * N_;
   int top_offset = M_ * N_;
@@ -488,19 +471,13 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
   int opt_num2 = global_packing_N;
   cl_command_queue Queue;
   cl_event prof_event;
-  //LOG(INFO) << "conv_fp optimized scheme";
   for (int n = 0; n < num_; n += opt_num2) {
     opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
-    /*col_offset is the offset for sgemm, including packing and groups
-    for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/
     top_offset = M_ * N_ * opt_num2;
     col_offset = K_ * N_ * opt_num2;
-    //step1: packed im2col, col_size = (K_ * group_ ) * N_
-    //this should be opt_num2 images packing together.
     im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
                        width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
 
-    //step 2: sgemm: Top (subTopMem) = weight * col_data
 #ifdef multiQ
     for (int g = 0; g < group_; ++g) {
        if(g == 0) Queue = amdDevice.CommandQueue;
@@ -521,10 +498,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
           (Dtype)0., (Dtype*)subTopMem, top_offset * g);
        }
 #endif
-    //step 3: tranform
     transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2);
-    //step 4: add bias
-    /*note: this sgemm has to use num_output_ instead of M, because M = M /group, in setup*/
 
    for (int z = 0; z < opt_num2; z++)
       if (bias_term_) {
@@ -551,7 +525,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
       ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
     for (int n = 0; n < num_; ++n) {
-      caffe_gpu_gemv<Dtype>(CblasNoTrans, M_, N_,
+      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
           (Dtype)1., top_diff, top[i]->offset(n), N_,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
           bias_diff, (size_t)0, 1);
@@ -570,25 +544,17 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
   int g = 0;
   cl_command_queue Queue;
   cl_event prof_event;
-  //LOG(INFO) << "conv_bp optimized scheme";
 
   for (int n = 0; n < num_; n += opt_num2) {
     opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
-    /*col_offset is the offset for sgemm, including packing and groups
-    for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/
     top_offset = M_ * (N_ * opt_num2);
     col_offset = K_ * (N_ * opt_num2);
-    //step1: packed im2col, col_size = (K_ * group_ ) * N_
-    //this should be opt_num2 images packing together.
     im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
                        width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
 
-    //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize
     int height_top = M_ * group_, width_top = N_;
-    //if (opt_num2 >1)
     opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
 
-    //step 3: sgemm: Top (subTopMem) = weight * col_data
     for(g = 0; g < group_; ++g) {
 #ifdef multiQ
        if(g == 0) Queue = amdDevice.CommandQueue;
@@ -602,7 +568,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
         (Dtype*)weight_diff, weight_offset * g);
     }
 
-   //step4:
    if (propagate_down[i]) {
       for (g = 0; g < group_; ++g) {
 #ifdef multiQ
@@ -624,14 +589,8 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
       clFinish(amdDevice.CommandQueue_helper);
     }
 #endif
-
-    //step5: col2im
        col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
                   stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2);
-#ifdef Track_layer
-    LOG(WARNING) << "conv bp done";
-#endif
-
    }
   }
  }
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 8f7d8f82..369fbacd 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -1,5 +1,4 @@
 #include <vector>
-
 #include "caffe/filler.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/util/im2col.hpp"
@@ -33,7 +32,7 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     }
   }
 
-//  CHECK_BLOB_DATA(top[0],20, "top[0]");
+ // CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -67,9 +66,6 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       }
     }
   }
-  //CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
-  //CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-  //CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff");
 
 }
 
@@ -80,7 +76,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    Forward_gpu_opt(bottom, top);
   else
    Forward_gpu_org(bottom, top);
-// CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -97,11 +92,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom
       const vector<Blob<Dtype>*>& top) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   this->forward_gpu_opt(bottom, weight, top);
-
-#ifdef Track_layer
-  LOG(WARNING) << "conv fp done";
-#endif
-
 }
 
 template <typename Dtype>
@@ -114,14 +104,14 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
 
     Dtype* top_data = top[i]->mutable_gpu_data();
     this->opt_num2 = global_packing_N;
+    this->weight_offset_ = this->M_ * this->K_;
     for (int n = 0; n < this->num_; n += this->opt_num2) {
       this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
        //intermediate variables to pass offset
-      this->top_offset_ = this->M_ * this->N_ * this->opt_num2;
-      this->top_offset_n = top[i]->offset(n);
+      this->top_offset_opt = this->M_ * this->N_ * this->opt_num2;
+      this->top_offset_ = top[i]->offset(n);
       this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
       this->bottom_offset_ = bottom[i]->offset(n);
-      this->weight_offset_ = this->M_ * this->K_;
       this->forward_gpu_gemm_opt(bottom_data, weight,
             top_data);
       if (this->bias_term_) {
@@ -131,8 +121,8 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
    }
   }
 
-  CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
-  CHECK_BLOB_DATA(top[0],20, "top[0]");
+  //CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+  //CHECK_BLOB_DATA(top[0],20, "top[0]");
 
 }
 
@@ -160,7 +150,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
   }
 
   // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
- // CHECK_BLOB_DATA(top[0],20, "top[0]");
+  //CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -180,30 +170,31 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
     // Bias gradient, if necessary.
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      this->gpu_memset(bias_diff, 0., this->blobs_[1]->count());
+      ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
       for (int n = 0; n < this->num_; ++n) {
-       //
         this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
         this->backward_gpu_bias(bias_diff, top_diff);
       }
-    }
+     }
     if (this->param_propagate_down_[0] || propagate_down[i]) {
       const Dtype* bottom_data = bottom[i]->gpu_data();
       Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
       this->weight_offset_ = this->M_ * this->K_;
       this->opt_num2 = global_packing_N;
-      for (int n = 0; n < this->num_; ++n) {
+      for (int n = 0; n < this->num_; n += this->opt_num2) {
         this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
-        this->top_offset_n = top[i]->offset(n);
+        this->top_offset_ = top[i]->offset(n);
         this->bottom_offset_ = bottom[i]->offset(n);
         this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
-        this->top_offset_ = this->M_ * (this->N_ * this->opt_num2);
+        this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
           this->weight_gpu_gemm_opt(bottom_data,
               top_diff, weight_diff);
         }
+        this->bottom_offset_ = bottom[i]->offset(n);
+        this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
+        this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
           this->backward_gpu_gemm_opt(top_diff, weight,
@@ -213,10 +204,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
     }
   }
 
-  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
-  CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
-  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
@@ -256,10 +243,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
     }
   }
   
-//  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
-//  CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-//  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
- // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 #ifdef CPU_ONLY

From 33b8282220b218da52e8b4c738ca680a887e7dcc Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Tue, 1 Sep 2015 15:29:44 +0800
Subject: [PATCH 044/124] conv clean up

---
 src/caffe/layers/base_conv_layer.cpp | 23 +++++++++--------------
 src/caffe/layers/conv_layer.cpp      |  7 ++-----
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index faa7b63c..6071c49b 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -295,9 +295,9 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
   cl_event prof_event;
   if (!is_1x1_) {
     if (!skip_im2col) {
-      //conv_im2col_gpu_opt(input);
-      im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_,
-                 (Dtype*)transMem, 0, opt_num2);
+      conv_im2col_gpu_opt(input);
+     // im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_,
+        //         (Dtype*)transMem, 0, opt_num2);
     }   
   }
 #ifdef multiQ
@@ -390,9 +390,9 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 #endif
 
   if (!is_1x1_) {
-      //conv_col2im_gpu_opt(input);
-      col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
-                  stride_w_, input, bottom_offset_, opt_num2);
+      conv_col2im_gpu_opt(input);
+     // col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
+       //           stride_w_, input, bottom_offset_, opt_num2);
    }
 }
 
@@ -414,12 +414,11 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
     const Dtype* output, Dtype* weights) {
-  const Dtype* col_buff = input;
   cl_command_queue Queue;
   if (!is_1x1_) {
-    //conv_im2col_gpu_opt(input);
-   im2col_gpu_opt(input, bottom_offset_, channels_, height_,
-                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
+    conv_im2col_gpu_opt(input);
+   //im2col_gpu_opt(input, bottom_offset_, channels_, height_,
+     //                  width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
   }
     //conv_transpose_gpu(output);
     int height_top = M_ * group_, width_top = N_;
@@ -462,8 +461,6 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();
-
-  Dtype* col_data = col_buffer_.mutable_gpu_data();
   int M_org = M_ * group_;
   int col_offset = K_ * N_;
   int top_offset = M_ * N_;
@@ -535,8 +532,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
  if (this->param_propagate_down_[0] || propagate_down[i]) {
   const Dtype* bottom_data = bottom[i]->gpu_data();
   Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-  Dtype* col_data = col_buffer_.mutable_gpu_data();
-  Dtype* col_diff = col_buffer_.mutable_gpu_diff();
   int col_offset = K_ * N_;
   int top_offset = M_ * N_;
   int weight_offset = M_ * K_;
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 369fbacd..020098aa 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -73,7 +73,7 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const  vector<Blob<Dtype>*>& top) {
   if (use_packing_scheme && global_packing_N >1)
-   Forward_gpu_opt(bottom, top);
+   Forward_gpu_opt2(bottom, top);
   else
    Forward_gpu_org(bottom, top);
 }
@@ -82,7 +82,7 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
     if (use_packing_scheme && global_packing_N >1)
-      Backward_gpu_opt(top, propagate_down, bottom);
+      Backward_gpu_opt2(top, propagate_down, bottom);
     else
       Backward_gpu_org(top, propagate_down, bottom);
 }
@@ -192,9 +192,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
           this->weight_gpu_gemm_opt(bottom_data,
               top_diff, weight_diff);
         }
-        this->bottom_offset_ = bottom[i]->offset(n);
-        this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
-        this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
           this->backward_gpu_gemm_opt(top_diff, weight,

From 40cdc3e41abcde36b8816f3ce3556c230633992f Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Tue, 1 Sep 2015 22:52:23 +0800
Subject: [PATCH 045/124] removed all cuDNN files

---
 src/caffe/layers/base_conv_layer.cpp          |   6 -
 src/caffe/layers/cudnn_conv_layer.cpp         | 130 ------
 src/caffe/layers/cudnn_pooling_layer.cpp      |  50 ---
 src/caffe/layers/cudnn_relu_layer.cpp         |  46 ---
 src/caffe/layers/cudnn_sigmoid_layer.cpp      |  46 ---
 src/caffe/layers/cudnn_softmax_layer.cpp      |  50 ---
 src/caffe/layers/cudnn_tanh_layer.cpp         |  46 ---
 src/caffe/layers/cufiles/absval_layer.cu      |  33 --
 src/caffe/layers/cufiles/base_data_layer.cu   |  30 --
 src/caffe/layers/cufiles/bnll_layer.cu        |  60 ---
 src/caffe/layers/cufiles/concat_layer.cu      |  71 ----
 .../layers/cufiles/contrastive_loss_layer.cu  | 111 -----
 src/caffe/layers/cufiles/conv_layer.cu        |  64 ---
 src/caffe/layers/cufiles/cudnn_conv_layer.cu  | 160 --------
 .../layers/cufiles/cudnn_pooling_layer.cu     |  45 --
 src/caffe/layers/cufiles/cudnn_relu_layer.cu  |  57 ---
 .../layers/cufiles/cudnn_sigmoid_layer.cu     |  47 ---
 .../layers/cufiles/cudnn_softmax_layer.cu     |  48 ---
 src/caffe/layers/cufiles/cudnn_tanh_layer.cu  |  48 ---
 src/caffe/layers/cufiles/deconv_layer.cu      |  64 ---
 src/caffe/layers/cufiles/dropout_layer.cu     |  77 ----
 src/caffe/layers/cufiles/eltwise_layer.cu     | 135 ------
 .../layers/cufiles/euclidean_loss_layer.cu    |  44 --
 src/caffe/layers/cufiles/exp_layer.cu         |  44 --
 src/caffe/layers/cufiles/filter_layer.cu      |  70 ----
 src/caffe/layers/cufiles/hdf5_data_layer.cu   |  53 ---
 src/caffe/layers/cufiles/hdf5_output_layer.cu |  43 --
 src/caffe/layers/cufiles/im2col_layer.cu      |  37 --
 .../layers/cufiles/inner_product_layer.cu     |  57 ---
 src/caffe/layers/cufiles/log_layer.cu         |  57 ---
 src/caffe/layers/cufiles/lrn_layer.cu         | 203 ---------
 src/caffe/layers/cufiles/mvn_layer.cu         | 124 ------
 src/caffe/layers/cufiles/pooling_layer.cu     | 387 ------------------
 src/caffe/layers/cufiles/power_layer.cu       |  87 ----
 src/caffe/layers/cufiles/prelu_layer.cu       | 128 ------
 src/caffe/layers/cufiles/reduction_layer.cu   |  93 -----
 src/caffe/layers/cufiles/relu_layer.cu        |  65 ---
 .../sigmoid_cross_entropy_loss_layer.cu       |  37 --
 src/caffe/layers/cufiles/sigmoid_layer.cu     |  62 ---
 src/caffe/layers/cufiles/silence_layer.cu     |  28 --
 src/caffe/layers/cufiles/slice_layer.cu       |  71 ----
 src/caffe/layers/cufiles/softmax_layer.cu     | 149 -------
 .../layers/cufiles/softmax_loss_layer.cu      | 125 ------
 src/caffe/layers/cufiles/split_layer.cu       |  38 --
 src/caffe/layers/cufiles/tanh_layer.cu        |  59 ---
 src/caffe/layers/cufiles/threshold_layer.cu   |  33 --
 46 files changed, 3518 deletions(-)
 delete mode 100644 src/caffe/layers/cudnn_conv_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_pooling_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_relu_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_sigmoid_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_softmax_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_tanh_layer.cpp
 delete mode 100644 src/caffe/layers/cufiles/absval_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/base_data_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/bnll_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/concat_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/contrastive_loss_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/conv_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_conv_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_pooling_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_relu_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_softmax_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_tanh_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/deconv_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/dropout_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/eltwise_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/euclidean_loss_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/exp_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/filter_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/hdf5_data_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/hdf5_output_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/im2col_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/inner_product_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/log_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/lrn_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/mvn_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/pooling_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/power_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/prelu_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/reduction_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/relu_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/sigmoid_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/silence_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/slice_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/softmax_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/softmax_loss_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/split_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/tanh_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/threshold_layer.cu

diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 6071c49b..19458185 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -296,8 +296,6 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
   if (!is_1x1_) {
     if (!skip_im2col) {
       conv_im2col_gpu_opt(input);
-     // im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_,
-        //         (Dtype*)transMem, 0, opt_num2);
     }   
   }
 #ifdef multiQ
@@ -391,8 +389,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 
   if (!is_1x1_) {
       conv_col2im_gpu_opt(input);
-     // col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
-       //           stride_w_, input, bottom_offset_, opt_num2);
    }
 }
 
@@ -417,8 +413,6 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
   cl_command_queue Queue;
   if (!is_1x1_) {
     conv_im2col_gpu_opt(input);
-   //im2col_gpu_opt(input, bottom_offset_, channels_, height_,
-     //                  width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
   }
     //conv_transpose_gpu(output);
     int height_top = M_ * group_, width_top = N_;
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
deleted file mode 100644
index 104d2b9d..00000000
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-// Set to three for the benefit of the backward pass, which
-// can use separate streams for calculating the gradient w.r.t.
-// bias, filter weights, and bottom data for each group independently
-#define CUDNN_STREAMS_PER_GROUP 3
-
-/**
- * TODO(dox) explain cuDNN interface
- */
-template <typename Dtype>
-void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
-  // Initialize CUDA streams and cuDNN.
-  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
-  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
-  workspaceSizeInBytes = 0;
-  workspace = NULL;
-
-  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
-    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
-    CUDNN_CHECK(cudnnCreate(&handle_[g]));
-    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
-  }
-
-  // Set the indexing parameters.
-  weight_offset_ = (this->num_output_ / this->group_)
-      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
-  bias_offset_ = (this->num_output_ / this->group_);
-
-  // Create filter descriptor.
-  cudnn::createFilterDesc<Dtype>(&filter_desc_,
-      this->num_output_ / this->group_, this->channels_ / this->group_,
-      this->kernel_h_, this->kernel_w_);
-
-  // Create tensor descriptor(s) for data and corresponding convolution(s).
-  for (int i = 0; i < bottom.size(); i++) {
-    cudnnTensorDescriptor_t bottom_desc;
-    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
-    bottom_descs_.push_back(bottom_desc);
-    cudnnTensorDescriptor_t top_desc;
-    cudnn::createTensor4dDesc<Dtype>(&top_desc);
-    top_descs_.push_back(top_desc);
-    cudnnConvolutionDescriptor_t conv_desc;
-    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
-    conv_descs_.push_back(conv_desc);
-  }
-
-  // Tensor descriptor for bias.
-  if (this->bias_term_) {
-    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
-  }
-
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNConvolutionLayer<Dtype>::Reshape(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  ConvolutionLayer<Dtype>::Reshape(bottom, top);
-  bottom_offset_ = (this->channels_ / this->group_)
-      * this->height_ * this->width_;
-  top_offset_ = (this->num_output_ / this->group_)
-      * this->height_out_ * this->width_out_;
-
-  for (int i = 0; i < bottom.size(); i++) {
-    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
-        this->num_,
-        this->channels_ / this->group_,
-        this->height_, this->width_,
-        this->channels_ * this->height_ * this->width_,
-        this->height_ * this->width_,
-        this->width_, 1);
-    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
-        this->num_,
-        this->num_output_ / this->group_,
-        this->height_out_, this->width_out_,
-        this->num_output_ * this->height_out_ * this->width_out_,
-        this->height_out_ * this->width_out_,
-        this->width_out_, 1);
-    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
-        filter_desc_, this->pad_h_, this->pad_w_,
-        this->stride_h_, this->stride_w_);
-  }
-
-  // Tensor descriptor for bias.
-  if (this->bias_term_) {
-    cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
-        1, this->num_output_ / this->group_, 1, 1);
-  }
-}
-
-template <typename Dtype>
-CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  for (int i = 0; i < bottom_descs_.size(); i++) {
-    cudnnDestroyTensorDescriptor(bottom_descs_[i]);
-    cudnnDestroyTensorDescriptor(top_descs_[i]);
-    cudnnDestroyConvolutionDescriptor(conv_descs_[i]);
-  }
-  if (this->bias_term_) {
-    cudnnDestroyTensorDescriptor(bias_desc_);
-  }
-  cudnnDestroyFilterDescriptor(filter_desc_);
-
-  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
-    cudaStreamDestroy(stream_[g]);
-    cudnnDestroy(handle_[g]);
-  }
-
-  delete [] stream_;
-  delete [] handle_;
-}
-
-INSTANTIATE_CLASS(CuDNNConvolutionLayer);
-
-}   // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp
deleted file mode 100644
index c92c4e47..00000000
--- a/src/caffe/layers/cudnn_pooling_layer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  cudnn::createPoolingDesc<Dtype>(&pooling_desc_,
-      this->layer_param_.pooling_param().pool(), &mode_,
-      this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_,
-      this->stride_h_, this->stride_w_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  PoolingLayer<Dtype>::Reshape(bottom, top);
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, bottom[0]->num(),
-      this->channels_, this->height_, this->width_);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, bottom[0]->num(),
-      this->channels_, this->pooled_height_, this->pooled_width_);
-}
-
-template <typename Dtype>
-CuDNNPoolingLayer<Dtype>::~CuDNNPoolingLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(bottom_desc_);
-  cudnnDestroyTensorDescriptor(top_desc_);
-  cudnnDestroyPoolingDescriptor(pooling_desc_);
-  cudnnDestroy(handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNPoolingLayer);
-
-}   // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp
deleted file mode 100644
index 759d8398..00000000
--- a/src/caffe/layers/cudnn_relu_layer.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  ReLULayer<Dtype>::LayerSetUp(bottom, top);
-  // initialize cuDNN
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  ReLULayer<Dtype>::Reshape(bottom, top);
-  const int N = bottom[0]->num();
-  const int K = bottom[0]->channels();
-  const int H = bottom[0]->height();
-  const int W = bottom[0]->width();
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNReLULayer<Dtype>::~CuDNNReLULayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(this->bottom_desc_);
-  cudnnDestroyTensorDescriptor(this->top_desc_);
-  cudnnDestroy(this->handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNReLULayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp
deleted file mode 100644
index 32637873..00000000
--- a/src/caffe/layers/cudnn_sigmoid_layer.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SigmoidLayer<Dtype>::LayerSetUp(bottom, top);
-  // initialize cuDNN
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SigmoidLayer<Dtype>::Reshape(bottom, top);
-  const int N = bottom[0]->num();
-  const int K = bottom[0]->channels();
-  const int H = bottom[0]->height();
-  const int W = bottom[0]->width();
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNSigmoidLayer<Dtype>::~CuDNNSigmoidLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(this->bottom_desc_);
-  cudnnDestroyTensorDescriptor(this->top_desc_);
-  cudnnDestroy(this->handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNSigmoidLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp
deleted file mode 100644
index 77a3225a..00000000
--- a/src/caffe/layers/cudnn_softmax_layer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "thrust/device_vector.h"
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);
-  // Initialize CUDNN.
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SoftmaxLayer<Dtype>::Reshape(bottom, top);
-  int N = this->outer_num_;
-  int K = bottom[0]->shape(this->softmax_axis_);
-  int H = this->inner_num_;
-  int W = 1;
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNSoftmaxLayer<Dtype>::~CuDNNSoftmaxLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(bottom_desc_);
-  cudnnDestroyTensorDescriptor(top_desc_);
-  cudnnDestroy(handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNSoftmaxLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp
deleted file mode 100644
index 376faad3..00000000
--- a/src/caffe/layers/cudnn_tanh_layer.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  TanHLayer<Dtype>::LayerSetUp(bottom, top);
-  // initialize cuDNN
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNTanHLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  TanHLayer<Dtype>::Reshape(bottom, top);
-  const int N = bottom[0]->num();
-  const int K = bottom[0]->channels();
-  const int H = bottom[0]->height();
-  const int W = bottom[0]->width();
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNTanHLayer<Dtype>::~CuDNNTanHLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(this->bottom_desc_);
-  cudnnDestroyTensorDescriptor(this->top_desc_);
-  cudnnDestroy(this->handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNTanHLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/absval_layer.cu b/src/caffe/layers/cufiles/absval_layer.cu
deleted file mode 100644
index bb310e1a..00000000
--- a/src/caffe/layers/cufiles/absval_layer.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void AbsValLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const int count = top[0]->count();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
-}
-
-template <typename Dtype>
-void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const int count = top[0]->count();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_gpu_sign(count, bottom_data, bottom_diff);
-    caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(AbsValLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/base_data_layer.cu b/src/caffe/layers/cufiles/base_data_layer.cu
deleted file mode 100644
index 9335a5bc..00000000
--- a/src/caffe/layers/cufiles/base_data_layer.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-#include <vector>
-
-#include "caffe/data_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, join the thread
-  JoinPrefetchThread();
-  // Reshape to loaded data.
-  top[0]->ReshapeLike(this->prefetch_data_);
-  // Copy the data
-  caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
-      top[0]->mutable_gpu_data());
-  if (this->output_labels_) {
-    // Reshape to loaded labels.
-    top[1]->ReshapeLike(prefetch_label_);
-    // Copy the labels.
-    caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
-        top[1]->mutable_gpu_data());
-  }
-  // Start a new prefetch thread
-  CreatePrefetchThread();
-}
-
-INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/bnll_layer.cu b/src/caffe/layers/cufiles/bnll_layer.cu
deleted file mode 100644
index d963d068..00000000
--- a/src/caffe/layers/cufiles/bnll_layer.cu
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-const float kBNLL_THRESHOLD = 50.;
-
-template <typename Dtype>
-__global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] > 0 ?
-        in[index] + log(1. + exp(-in[index])) :
-        log(1. + exp(in[index]));
-  }
-}
-
-template <typename Dtype>
-void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  BNLLForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-template <typename Dtype>
-__global__ void BNLLBackward(const int n, const Dtype* in_diff,
-    const Dtype* in_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    Dtype expval = exp(min(in_data[index], Dtype(kBNLL_THRESHOLD)));
-    out_diff[index] = in_diff[index] * expval / (expval + 1.);
-  }
-}
-
-template <typename Dtype>
-void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    BNLLBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, bottom_data, bottom_diff);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(BNLLLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/concat_layer.cu b/src/caffe/layers/cufiles/concat_layer.cu
deleted file mode 100644
index 8f2e85d8..00000000
--- a/src/caffe/layers/cufiles/concat_layer.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void Concat(const int nthreads, const Dtype* in_data,
-    const bool forward, const int num_concats, const int concat_size,
-    const int top_concat_axis, const int bottom_concat_axis,
-    const int offset_concat_axis, Dtype* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int total_concat_size = concat_size * bottom_concat_axis;
-    const int concat_num = index / total_concat_size;
-    const int concat_index = index % total_concat_size;
-    const int top_index = concat_index +
-        (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
-    if (forward) {
-      out_data[top_index] = in_data[index];
-    } else {
-      out_data[index] = in_data[top_index];
-    }
-  }
-}
-
-template <typename Dtype>
-void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int offset_concat_axis = 0;
-  const int top_concat_axis = top[0]->shape(concat_axis_);
-  const bool kForward = true;
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-    const int nthreads = bottom_concat_size * num_concats_;
-    Concat<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, bottom_data, kForward, num_concats_, concat_input_size_,
-        top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
-    offset_concat_axis += bottom_concat_axis;
-  }
-}
-
-template <typename Dtype>
-void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  int offset_concat_axis = 0;
-  const int top_concat_axis = top[0]->shape(concat_axis_);
-  const bool kForward = false;
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (!propagate_down[i]) { continue; }
-    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-    const int nthreads = bottom_concat_size * num_concats_;
-    Concat<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, top_diff, kForward, num_concats_, concat_input_size_,
-        top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
-    offset_concat_axis += bottom_concat_axis;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ConcatLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/contrastive_loss_layer.cu b/src/caffe/layers/cufiles/contrastive_loss_layer.cu
deleted file mode 100644
index 93123931..00000000
--- a/src/caffe/layers/cufiles/contrastive_loss_layer.cu
+++ /dev/null
@@ -1,111 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ContrastiveLossLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  caffe_gpu_sub(
-      count,
-      bottom[0]->gpu_data(),  // a
-      bottom[1]->gpu_data(),  // b
-      diff_.mutable_gpu_data());  // a_i-b_i
-  caffe_gpu_powx(
-      count,
-      diff_.mutable_gpu_data(),  // a_i-b_i
-      Dtype(2),
-      diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
-  caffe_gpu_gemv(
-      CblasNoTrans,
-      bottom[0]->num(),
-      bottom[0]->channels(),
-      Dtype(1.0),
-      diff_sq_.gpu_data(),  // (a_i-b_i)^2
-      summer_vec_.gpu_data(),
-      Dtype(0.0),
-      dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
-  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-  bool legacy_version =
-      this->layer_param_.contrastive_loss_param().legacy_version();
-  Dtype loss(0.0);
-  for (int i = 0; i < bottom[0]->num(); ++i) {
-    if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
-      loss += dist_sq_.cpu_data()[i];
-    } else {  // dissimilar pairs
-      if (legacy_version) {
-        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
-      } else {
-        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]),
-                              Dtype(0.0));
-        loss += dist*dist;
-      }
-    }
-  }
-  loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
-  top[0]->mutable_cpu_data()[0] = loss;
-}
-
-template <typename Dtype>
-__global__ void CLLBackward(const int count, const int channels,
-    const Dtype margin, const bool legacy_version, const Dtype alpha,
-    const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
-    Dtype *bottom_diff) {
-  CUDA_KERNEL_LOOP(i, count) {
-    int n = i / channels;  // the num index, to access y and dist_sq
-    if (static_cast<int>(y[n])) {  // similar pairs
-      bottom_diff[i] = alpha * diff[i];
-    } else {  // dissimilar pairs
-      Dtype mdist(0.0);
-      Dtype beta(0.0);
-      if (legacy_version) {
-        mdist = (margin - dist_sq[n]);
-        beta = -alpha;
-      } else {
-        Dtype dist = sqrt(dist_sq[n]);
-        mdist = (margin - dist);
-        beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
-      }
-      if (mdist > 0.0) {
-        bottom_diff[i] = beta;
-      } else {
-        bottom_diff[i] = 0;
-      }
-    }
-  }
-}
-
-template <typename Dtype>
-void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < 2; ++i) {
-    if (propagate_down[i]) {
-      const int count = bottom[0]->count();
-      const int channels = bottom[0]->channels();
-      Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-      const bool legacy_version =
-          this->layer_param_.contrastive_loss_param().legacy_version();
-      const Dtype sign = (i == 0) ? 1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
-          static_cast<Dtype>(bottom[0]->num());
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      CLLBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-          count, channels, margin, legacy_version, alpha,
-          bottom[2]->gpu_data(),  // pair similarity 0 or 1
-          diff_.gpu_data(),  // the cached eltwise difference between a and b
-          dist_sq_.gpu_data(),  // the cached square distance between a and b
-          bottom[i]->mutable_gpu_diff());
-      CUDA_POST_KERNEL_CHECK;
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ContrastiveLossLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/conv_layer.cu b/src/caffe/layers/cufiles/conv_layer.cu
deleted file mode 100644
index b8a98ff7..00000000
--- a/src/caffe/layers/cufiles/conv_layer.cu
+++ /dev/null
@@ -1,64 +0,0 @@
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    for (int n = 0; n < this->num_; ++n) {
-      this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->gpu_data();
-        this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
-      }
-    }
-  }
-}
-
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      const Dtype* bottom_data = bottom[i]->gpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n),
-              top_diff + top[i]->offset(n), weight_diff);
-        }
-        // gradient w.r.t. bottom data, if necessary.
-        if (propagate_down[i]) {
-          this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/cudnn_conv_layer.cu b/src/caffe/layers/cufiles/cudnn_conv_layer.cu
deleted file mode 100644
index b4e802e1..00000000
--- a/src/caffe/layers/cufiles/cudnn_conv_layer.cu
+++ /dev/null
@@ -1,160 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-__global__ void sync_conv_groups() { }
-
-template <typename Dtype>
-void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    const Dtype* weight = this->blobs_[0]->gpu_data();
-
-    size_t workspace_limit_bytes = this->kernel_h_ *
-                                   this->kernel_w_ *
-                                   this->channels_ *
-                                   sizeof(int) + 1;
-
-    // Forward through cuDNN in parallel over groups.
-    for (int g = 0; g < this->group_; g++) {
-      cudnnConvolutionFwdAlgo_t algo;
-
-      // pick the convolution algorithm
-      // TODO(shelhamer) this should be done during reshape
-      // TODO(shelhamer) the choice of automatic or manual algorithm picking
-      // should be exposed in proto
-      CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g],
-        bottom_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        top_descs_[i],
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_limit_bytes,  // memoryLimitInBytes,
-        &algo));
-
-      // get minimum size of the workspace needed for the desired algorithm
-      size_t workspaceSizeInBytes_temp = 0;
-
-      CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g],
-        bottom_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        top_descs_[i],
-        algo,
-        &workspaceSizeInBytes_temp));
-
-      if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
-        workspaceSizeInBytes = workspaceSizeInBytes_temp;
-        // free the existing workspace and allocate a new (larger) one
-        cudaFree(this->workspace);
-        cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes);
-        if (err != cudaSuccess) {
-          // force zero memory path
-          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-          workspace = NULL;
-          workspaceSizeInBytes = 0;
-        }
-      }
-
-      // Filters.
-      CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
-            cudnn::dataType<Dtype>::one,
-            bottom_descs_[i], bottom_data + bottom_offset_ * g,
-            filter_desc_, weight + weight_offset_ * g,
-            conv_descs_[i],
-            algo, workspace, workspaceSizeInBytes,
-            cudnn::dataType<Dtype>::zero,
-            top_descs_[i], top_data + top_offset_ * g));
-
-      // Bias.
-      if (this->bias_term_) {
-        const Dtype* bias_data = this->blobs_[1]->gpu_data();
-        CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C,
-              cudnn::dataType<Dtype>::one,
-              bias_desc_, bias_data + bias_offset_ * g,
-              cudnn::dataType<Dtype>::one,
-              top_descs_[i], top_data + top_offset_ * g));
-      }
-    }
-
-    // Synchronize the work across groups, each of which went into its own
-    // stream, by launching an empty kernel into the default (null) stream.
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    sync_conv_groups<<<1, 1>>>();
-  }
-}
-
-template <typename Dtype>
-void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = NULL;
-  Dtype* weight_diff = NULL;
-  if (this->param_propagate_down_[0]) {
-    weight = this->blobs_[0]->gpu_data();
-    weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  }
-  Dtype* bias_diff = NULL;
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    bias_diff = this->blobs_[1]->mutable_gpu_diff();
-  }
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    // Backward through cuDNN in parallel over groups and gradients.
-    for (int g = 0; g < this->group_; g++) {
-      // Gradient w.r.t. bias.
-      if (this->bias_term_ && this->param_propagate_down_[1]) {
-        CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g],
-              cudnn::dataType<Dtype>::one,
-              top_descs_[i],  top_diff + top_offset_ * g,
-              cudnn::dataType<Dtype>::one,
-              bias_desc_, bias_diff + bias_offset_ * g));
-      }
-
-      // Gradient w.r.t. weights.
-      if (this->param_propagate_down_[0]) {
-        const Dtype* bottom_data = bottom[i]->gpu_data();
-        CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g],
-              cudnn::dataType<Dtype>::one,
-              bottom_descs_[i], bottom_data + bottom_offset_ * g,
-              top_descs_[i],    top_diff + top_offset_ * g,
-              conv_descs_[i],
-              cudnn::dataType<Dtype>::one,
-              filter_desc_, weight_diff + weight_offset_ * g));
-      }
-
-      // Gradient w.r.t. bottom data.
-      if (propagate_down[i]) {
-        if (weight == NULL) {
-          weight = this->blobs_[0]->gpu_data();
-        }
-        Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-        CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g],
-              cudnn::dataType<Dtype>::one,
-              filter_desc_, weight + weight_offset_ * g,
-              top_descs_[i], top_diff + top_offset_ * g,
-              conv_descs_[i],
-              cudnn::dataType<Dtype>::zero,
-              bottom_descs_[i], bottom_diff + bottom_offset_ * g));
-      }
-    }
-
-    // Synchronize the work across groups, each of which went into its own
-    // stream, by launching an empty kernel into the default (null) stream.
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    sync_conv_groups<<<1, 1>>>();
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_pooling_layer.cu b/src/caffe/layers/cufiles/cudnn_pooling_layer.cu
deleted file mode 100644
index a952b855..00000000
--- a/src/caffe/layers/cufiles/cudnn_pooling_layer.cu
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_,
-        cudnn::dataType<Dtype>::one,
-        bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_,
-        cudnn::dataType<Dtype>::one,
-        top_desc_, top_data, top_desc_, top_diff,
-        bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNPoolingLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_relu_layer.cu b/src/caffe/layers/cufiles/cudnn_relu_layer.cu
deleted file mode 100644
index 21d14857..00000000
--- a/src/caffe/layers/cufiles/cudnn_relu_layer.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  // Fallback to standard Caffe for leaky ReLU.
-  if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
-    return ReLULayer<Dtype>::Forward_gpu(bottom, top);
-  }
-
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(this->handle_,
-        CUDNN_ACTIVATION_RELU,
-        cudnn::dataType<Dtype>::one,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-
-  // Fallback to standard Caffe for leaky ReLU.
-  if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
-    return ReLULayer<Dtype>::Backward_gpu(top, propagate_down, bottom);
-  }
-
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
-        CUDNN_ACTIVATION_RELU,
-        cudnn::dataType<Dtype>::one,
-        this->top_desc_, top_data, this->top_desc_, top_diff,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu b/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
deleted file mode 100644
index 7a06cf72..00000000
--- a/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(this->handle_,
-        CUDNN_ACTIVATION_SIGMOID,
-        cudnn::dataType<Dtype>::one,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
-        CUDNN_ACTIVATION_SIGMOID,
-        cudnn::dataType<Dtype>::one,
-        this->top_desc_, top_data, this->top_desc_, top_diff,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_softmax_layer.cu b/src/caffe/layers/cufiles/cudnn_softmax_layer.cu
deleted file mode 100644
index a9e2fcef..00000000
--- a/src/caffe/layers/cufiles/cudnn_softmax_layer.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "thrust/device_vector.h"
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE,
-        CUDNN_SOFTMAX_MODE_CHANNEL,
-        cudnn::dataType<Dtype>::one,
-        bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-    CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE,
-          CUDNN_SOFTMAX_MODE_CHANNEL,
-          cudnn::dataType<Dtype>::one,
-          top_desc_, top_data, top_desc_, top_diff,
-          cudnn::dataType<Dtype>::zero,
-          bottom_desc_, bottom_diff));
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSoftmaxLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_tanh_layer.cu b/src/caffe/layers/cufiles/cudnn_tanh_layer.cu
deleted file mode 100644
index d287f6fe..00000000
--- a/src/caffe/layers/cufiles/cudnn_tanh_layer.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(this->handle_,
-        CUDNN_ACTIVATION_TANH,
-        cudnn::dataType<Dtype>::one,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNTanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
-        CUDNN_ACTIVATION_TANH,
-        cudnn::dataType<Dtype>::one,
-        this->top_desc_, top_data, this->top_desc_, top_diff,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/deconv_layer.cu b/src/caffe/layers/cufiles/deconv_layer.cu
deleted file mode 100644
index 39bc4de8..00000000
--- a/src/caffe/layers/cufiles/deconv_layer.cu
+++ /dev/null
@@ -1,64 +0,0 @@
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    for (int n = 0; n < this->num_; ++n) {
-      this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->gpu_data();
-        this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
-      }
-    }
-  }
-}
-
-template <typename Dtype>
-void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      for (int n = 0; n < this->num_; ++n) {
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(top_diff + top[i]->offset(n),
-              bottom_data + bottom[i]->offset(n), weight_diff);
-        }
-        // gradient w.r.t. bottom data, if necessary.
-        if (propagate_down[i]) {
-          this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(DeconvolutionLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/dropout_layer.cu b/src/caffe/layers/cufiles/dropout_layer.cu
deleted file mode 100644
index f9ea04f4..00000000
--- a/src/caffe/layers/cufiles/dropout_layer.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <algorithm>
-#include <limits>
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/syncedmem.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-
-template <typename Dtype>
-__global__ void DropoutForward(const int n, const Dtype* in,
-    const unsigned int* mask, const unsigned int threshold, const float scale,
-    Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] * (mask[index] > threshold) * scale;
-  }
-}
-
-template <typename Dtype>
-void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  if (this->phase_ == TRAIN) {
-    unsigned int* mask =
-        static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
-    caffe_gpu_rng_uniform(count, mask);
-    // set thresholds
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, mask, uint_thres_, scale_, top_data);
-    CUDA_POST_KERNEL_CHECK;
-  } else {
-    caffe_copy(count, bottom_data, top_data);
-  }
-}
-
-template <typename Dtype>
-__global__ void DropoutBackward(const int n, const Dtype* in_diff,
-    const unsigned int* mask, const unsigned int threshold, const float scale,
-    Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out_diff[index] = in_diff[index] * scale * (mask[index] > threshold);
-  }
-}
-
-template <typename Dtype>
-void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    if (this->phase_ == TRAIN) {
-      const unsigned int* mask =
-          static_cast<const unsigned int*>(rand_vec_.gpu_data());
-      const int count = bottom[0]->count();
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      DropoutBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
-        CAFFE_CUDA_NUM_THREADS>>>(
-          count, top_diff, mask, uint_thres_, scale_, bottom_diff);
-      CUDA_POST_KERNEL_CHECK;
-    } else {
-      caffe_copy(top[0]->count(), top_diff, bottom_diff);
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(DropoutLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/eltwise_layer.cu b/src/caffe/layers/cufiles/eltwise_layer.cu
deleted file mode 100644
index 2247870d..00000000
--- a/src/caffe/layers/cufiles/eltwise_layer.cu
+++ /dev/null
@@ -1,135 +0,0 @@
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a,
-    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
-    int* mask) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    Dtype maxval = -FLT_MAX;
-    int maxidx = -1;
-    if (bottom_data_a[index] > bottom_data_b[index]) {
-      // only update for very first bottom_data blob (blob_idx == 0)
-      if (blob_idx == 0) {
-        maxval = bottom_data_a[index];
-        top_data[index] = maxval;
-        maxidx = blob_idx;
-        mask[index] = maxidx;
-      }
-    } else {
-      maxval = bottom_data_b[index];
-      top_data[index] = maxval;
-      maxidx = blob_idx + 1;
-      mask[index] = maxidx;
-    }
-  }
-}
-
-template <typename Dtype>
-void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int* mask = NULL;
-  const int count = top[0]->count();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  switch (op_) {
-  case EltwiseParameter_EltwiseOp_PROD:
-    caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
-        top_data);
-    for (int i = 2; i < bottom.size(); ++i) {
-      caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data);
-    }
-    break;
-  case EltwiseParameter_EltwiseOp_SUM:
-    caffe_gpu_set(count, Dtype(0.), top_data);
-    // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
-    for (int i = 0; i < bottom.size(); ++i) {
-      caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data);
-    }
-    break;
-  case EltwiseParameter_EltwiseOp_MAX:
-    mask = max_idx_.mutable_gpu_data();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxForward<Dtype> <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
-    for (int i = 2; i < bottom.size(); ++i) {
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-          count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask);
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown elementwise operation.";
-  }
-}
-
-template <typename Dtype>
-__global__ void MaxBackward(const int nthreads, const Dtype* top_diff,
-    const int blob_idx, const int* mask, Dtype* bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    Dtype gradient = 0;
-    if (mask[index] == blob_idx) {
-      gradient += top_diff[index];
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-
-template <typename Dtype>
-void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const int* mask = NULL;
-  const int count = top[0]->count();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (propagate_down[i]) {
-      const Dtype* bottom_data = bottom[i]->gpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      switch (op_) {
-      case EltwiseParameter_EltwiseOp_PROD:
-        if (stable_prod_grad_) {
-          bool initialized = false;
-          for (int j = 0; j < bottom.size(); ++j) {
-            if (i == j) { continue; }
-            if (!initialized) {
-              caffe_copy(count, bottom[j]->gpu_data(), bottom_diff);
-              initialized = true;
-            } else {
-              caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
-                            bottom_diff);
-            }
-          }
-        } else {
-          caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
-        }
-        caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-        break;
-      case EltwiseParameter_EltwiseOp_SUM:
-        if (coeffs_[i] == Dtype(1.)) {
-          caffe_copy(count, top_diff, bottom_diff);
-        } else {
-          caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
-        }
-        break;
-      case EltwiseParameter_EltwiseOp_MAX:
-        mask = max_idx_.gpu_data();
-        MaxBackward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-            <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-            count, top_diff, i, mask, bottom_diff);
-        break;
-      default:
-        LOG(FATAL) << "Unknown elementwise operation.";
-      }
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(EltwiseLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/euclidean_loss_layer.cu b/src/caffe/layers/cufiles/euclidean_loss_layer.cu
deleted file mode 100644
index 5b1de3ad..00000000
--- a/src/caffe/layers/cufiles/euclidean_loss_layer.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int count = bottom[0]->count();
-  caffe_gpu_sub(
-      count,
-      bottom[0]->gpu_data(),
-      bottom[1]->gpu_data(),
-      diff_.mutable_gpu_data());
-  Dtype dot;
-  caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
-  Dtype loss = dot / bottom[0]->num() / Dtype(2);
-  top[0]->mutable_cpu_data()[0] = loss;
-}
-
-template <typename Dtype>
-void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < 2; ++i) {
-    if (propagate_down[i]) {
-      const Dtype sign = (i == 0) ? 1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
-      caffe_gpu_axpby(
-          bottom[i]->count(),              // count
-          alpha,                              // alpha
-          diff_.gpu_data(),                   // a
-          Dtype(0),                           // beta
-          bottom[i]->mutable_gpu_diff());  // b
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(EuclideanLossLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/exp_layer.cu b/src/caffe/layers/cufiles/exp_layer.cu
deleted file mode 100644
index 2d75d8dd..00000000
--- a/src/caffe/layers/cufiles/exp_layer.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  if (inner_scale_ == Dtype(1)) {
-    caffe_gpu_exp(count, bottom_data, top_data);
-  } else {
-    caffe_gpu_scale(count, inner_scale_, bottom_data, top_data);
-    caffe_gpu_exp(count, top_data, top_data);
-  }
-  if (outer_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, outer_scale_, top_data);
-  }
-}
-
-template <typename Dtype>
-void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  const int count = bottom[0]->count();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  caffe_gpu_mul(count, top_data, top_diff, bottom_diff);
-  if (inner_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, inner_scale_, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ExpLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/filter_layer.cu b/src/caffe/layers/cufiles/filter_layer.cu
deleted file mode 100644
index cf929eee..00000000
--- a/src/caffe/layers/cufiles/filter_layer.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  int new_tops_num = indices_to_forward_.size();
-  // forward all filtered items for all bottoms but the Selector (bottom[last])
-  for (int t = 0; t < top.size(); ++t) {
-    const Dtype* bottom_data = bottom[t]->gpu_data();
-    Dtype* top_data = top[t]->mutable_gpu_data();
-    int dim = bottom[t]->count() / bottom[t]->shape(0);
-    for (int n = 0; n < new_tops_num; ++n) {
-      int data_offset_top = n * dim;
-      int data_offset_bottom = indices_to_forward_[n] * dim;
-      caffe_copy(dim, bottom_data + data_offset_bottom,
-          top_data + data_offset_top);
-    }
-  }
-}
-
-template <typename Dtype>
-void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[bottom.size() - 1]) {
-    LOG(FATAL) << this->type()
-               << "Layer cannot backpropagate to filter index inputs";
-  }
-  for (int i = 0; i < top.size(); ++i) {
-    // bottom[last] is the selector and never needs backpropagation
-    // so we can iterate over top vector because top.size() == bottom.size() -1
-    if (propagate_down[i]) {
-      const int dim = top[i]->count() / top[i]->shape(0);
-      int next_to_backward_offset = 0;
-      int batch_offset = 0;
-      int data_offset_bottom = 0;
-      int data_offset_top = 0;
-      for (int n = 0; n < bottom[i]->shape(0); ++n) {
-        if (next_to_backward_offset >= indices_to_forward_.size()) {
-          // we already visited all items that were been forwarded, so
-          // just set to zero remaining ones
-          data_offset_bottom = n * dim;
-          caffe_gpu_set(dim, Dtype(0),
-              bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-        } else {
-          batch_offset = indices_to_forward_[next_to_backward_offset];
-          data_offset_bottom = n * dim;
-          if (n != batch_offset) {  // this data was not been forwarded
-            caffe_gpu_set(dim, Dtype(0),
-                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-          } else {  // this data was been forwarded
-            data_offset_top = next_to_backward_offset * dim;
-            ++next_to_backward_offset;  // point to next forwarded item index
-            caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top,
-                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-          }
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(FilterLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/hdf5_data_layer.cu b/src/caffe/layers/cufiles/hdf5_data_layer.cu
deleted file mode 100644
index 5e3e4ced..00000000
--- a/src/caffe/layers/cufiles/hdf5_data_layer.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-TODO:
-- only load parts of the file, in accordance with a prototxt param "max_mem"
-*/
-
-#include <stdint.h>
-#include <string>
-#include <vector>
-
-#include "hdf5.h"
-#include "hdf5_hl.h"
-
-#include "caffe/data_layers.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-  for (int i = 0; i < batch_size; ++i, ++current_row_) {
-    if (current_row_ == hdf_blobs_[0]->shape(0)) {
-      if (num_files_ > 1) {
-        current_file_ += 1;
-        if (current_file_ == num_files_) {
-          current_file_ = 0;
-          if (this->layer_param_.hdf5_data_param().shuffle()) {
-            std::random_shuffle(file_permutation_.begin(),
-                                file_permutation_.end());
-          }
-          DLOG(INFO) << "Looping around to first file.";
-        }
-        LoadHDF5FileData(
-            hdf_filenames_[file_permutation_[current_file_]].c_str());
-      }
-      current_row_ = 0;
-      if (this->layer_param_.hdf5_data_param().shuffle())
-        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-    }
-    for (int j = 0; j < this->layer_param_.top_size(); ++j) {
-      int data_dim = top[j]->count() / top[j]->shape(0);
-      caffe_copy(data_dim,
-          &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-            * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]);
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/hdf5_output_layer.cu b/src/caffe/layers/cufiles/hdf5_output_layer.cu
deleted file mode 100644
index ae497c34..00000000
--- a/src/caffe/layers/cufiles/hdf5_output_layer.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-#include <vector>
-
-#include "hdf5.h"
-#include "hdf5_hl.h"
-
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_GE(bottom.size(), 2);
-  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
-  data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-                     bottom[0]->height(), bottom[0]->width());
-  label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
-                     bottom[1]->height(), bottom[1]->width());
-  const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
-  const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
-
-  for (int i = 0; i < bottom[0]->num(); ++i) {
-    caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim],
-        &data_blob_.mutable_cpu_data()[i * data_datum_dim]);
-    caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim],
-        &label_blob_.mutable_cpu_data()[i * label_datum_dim]);
-  }
-  SaveBlobs();
-}
-
-template <typename Dtype>
-void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  return;
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/im2col_layer.cu b/src/caffe/layers/cufiles/im2col_layer.cu
deleted file mode 100644
index 9c338b14..00000000
--- a/src/caffe/layers/cufiles/im2col_layer.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  for (int n = 0; n < bottom[0]->num(); ++n) {
-    im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, top_data + top[0]->offset(n));
-  }
-}
-
-template <typename Dtype>
-void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  for (int n = 0; n < top[0]->num(); ++n) {
-    col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(Im2colLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/inner_product_layer.cu b/src/caffe/layers/cufiles/inner_product_layer.cu
deleted file mode 100644
index d93560a0..00000000
--- a/src/caffe/layers/cufiles/inner_product_layer.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-#include <vector>
-
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1.,
-      bottom_data, 0, weight, 0, (Dtype)0., top_data, 0);
-  if (bias_term_) {
-    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1.,
-        bias_multiplier_.gpu_data(),0,
-        this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0);
-  }
-}
-
-template <typename Dtype>
-void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (this->param_propagate_down_[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    // Gradient with respect to weight
-    caffe_gpu_gemm_ex<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0);
-  }
-  if (bias_term_ && this->param_propagate_down_[1]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    // Gradient with respect to bias
-    caffe_gpu_gemvv<Dtype>(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff,
-        (size_t)0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_->gpu_data()),
-         (size_t)0, (Dtype)0., 1,
-        this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1);
-  }
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    // Gradient with respect to bottom data
-    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
-        top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0.,
-        bottom[0]->mutable_gpu_diff(), 0);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(InnerProductLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/log_layer.cu b/src/caffe/layers/cufiles/log_layer.cu
deleted file mode 100644
index 847c86cd..00000000
--- a/src/caffe/layers/cufiles/log_layer.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/neuron_layers.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
-    caffe_gpu_log(count, bottom_data, top_data);
-  } else {
-    caffe_copy(count, bottom_data, top_data);
-    if (input_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, input_scale_, top_data);
-    }
-    if (input_shift_ != Dtype(0)) {
-      caffe_gpu_add_scalar(count, input_shift_, top_data);
-    }
-    caffe_gpu_log(count, top_data, top_data);
-  }
-  if (base_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, base_scale_, top_data);
-  }
-}
-
-template <typename Dtype>
-void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-    const int count = bottom[0]->count();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_copy(count, bottom_data, bottom_diff);
-    if (input_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, input_scale_, bottom_diff);
-    }
-    if (input_shift_ != Dtype(0)) {
-      caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
-    }
-    caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
-    if (backward_num_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
-    }
-    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(LogLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/lrn_layer.cu b/src/caffe/layers/cufiles/lrn_layer.cu
deleted file mode 100644
index 001b3c34..00000000
--- a/src/caffe/layers/cufiles/lrn_layer.cu
+++ /dev/null
@@ -1,203 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void LRNFillScale(const int nthreads, const Dtype* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype alpha_over_size,
-    const Dtype k, Dtype* const scale) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    const Dtype* const in_off = in + offset;
-    Dtype* const scale_off = scale + offset;
-    int head = 0;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    Dtype accum_scale = 0;
-    // fill the scale at [n, :, h, w]
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_scale += in_off[head * step] * in_off[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_scale += in_off[head * step] * in_off[head * step];
-      if (head - size >= 0) {
-        accum_scale -= in_off[(head - size) * step]
-                       * in_off[(head - size) * step];
-      }
-      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_scale -= in_off[(head - size) * step]
-                       * in_off[(head - size) * step];
-      }
-      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-  }
-}
-
-
-template <typename Dtype>
-void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelForward_gpu(bottom, top);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelForward(bottom, top);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
-}
-
-// TODO: check if it would be faster to just put it into the previous kernel.
-template <typename Dtype>
-__global__ void LRNComputeOutput(const int nthreads, const Dtype* const in,
-    const Dtype* const scale, const Dtype negative_beta, Dtype* const out) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    out[index] = in[index] * pow(scale[index], negative_beta);
-  }
-}
-
-template <typename Dtype>
-void LRNLayer<Dtype>::CrossChannelForward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, compute scale
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  // We will launch one kernel for each pixel location, and have the kernel
-  // go through all the channels.
-  int n_threads = num_ * height_ * width_;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNFillScale<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
-      n_threads, bottom_data, num_, channels_, height_, width_, size_,
-      alpha_ / size_, k_, scale_data);
-  CUDA_POST_KERNEL_CHECK;
-  n_threads = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNComputeOutput<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
-      n_threads, bottom_data, scale_data, -beta_, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-template void LRNLayer<float>::CrossChannelForward_gpu(
-    const vector<Blob<float>*>& bottom, const vector<Blob<float>*>& top);
-template void LRNLayer<double>::CrossChannelForward_gpu(
-    const vector<Blob<double>*>& bottom, const vector<Blob<double>*>& top);
-
-
-template <typename Dtype>
-void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelBackward_gpu(top, propagate_down, bottom);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelBackward(top, propagate_down, bottom);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
-}
-
-template <typename Dtype>
-__global__ void LRNComputeDiff(const int nthreads,
-    const Dtype* const bottom_data, const Dtype* const top_data,
-    const Dtype* const scale, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype negative_beta,
-    const Dtype cache_ratio, Dtype* const bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    const Dtype* const bottom_off = bottom_data + offset;
-    const Dtype* const top_off = top_data + offset;
-    const Dtype* const scale_off = scale + offset;
-    const Dtype* const top_diff_off = top_diff + offset;
-    Dtype* const bottom_diff_off = bottom_diff + offset;
-    int head = 0;
-    const int pre_pad = size - (size + 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    Dtype accum_ratio = 0;
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
-          scale_off[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
-          scale_off[head * step];
-      if (head - size >= 0) {
-        accum_ratio -= top_diff_off[(head - size) * step] *
-            top_off[(head - size) * step] / scale_off[(head - size) * step];
-      }
-      bottom_diff_off[(head - post_pad) * step] =
-          top_diff_off[(head - post_pad) * step]
-            * pow(scale_off[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_ratio -= top_diff_off[(head - size) * step] *
-            top_off[(head - size) * step] / scale_off[(head - size) * step];
-      }
-      bottom_diff_off[(head - post_pad) * step] =
-          top_diff_off[(head - post_pad) * step]
-            * pow(scale_off[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-  }
-}
-
-template <typename Dtype>
-void LRNLayer<Dtype>::CrossChannelBackward_gpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  int n_threads = num_ * height_ * width_;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNComputeDiff<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
-      n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
-      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
-      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
-      bottom[0]->mutable_gpu_diff());
-}
-template void LRNLayer<float>::CrossChannelBackward_gpu(
-    const vector<Blob<float>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<float>*>& bottom);
-template void LRNLayer<double>::CrossChannelBackward_gpu(
-    const vector<Blob<double>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<double>*>& bottom);
-
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/mvn_layer.cu b/src/caffe/layers/cufiles/mvn_layer.cu
deleted file mode 100644
index 3888a0c7..00000000
--- a/src/caffe/layers/cufiles/mvn_layer.cu
+++ /dev/null
@@ -1,124 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/common_layers.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int num;
-  if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
-  else
-    num = bottom[0]->num() * bottom[0]->channels();
-
-  int dim = bottom[0]->count() / num;
-
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    // put the squares of bottom into temp_
-    caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
-        temp_.mutable_gpu_data());
-
-    // computes variance using var(X) = E(X^2) - (EX)^2
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-        sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
-        sum_multiplier_.gpu_data(), 0.,
-        variance_.mutable_gpu_data());  // E(X^2)
-    caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
-        temp_.mutable_gpu_data());  // (EX)^2
-    caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
-        variance_.mutable_gpu_data());  // variance
-
-    // do mean and variance normalization
-    // subtract mean
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-
-    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
-
-    // normalize variance
-    caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
-          variance_.mutable_gpu_data());
-
-    caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
-
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-          variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-          temp_.mutable_gpu_data());
-
-    caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
-  } else {
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-
-    // subtract mean
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-
-    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
-  }
-}
-
-template <typename Dtype>
-void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-  int num;
-  if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
-  else
-    num = bottom[0]->num() * bottom[0]->channels();
-
-  int dim = bottom[0]->count() / num;
-
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., bottom_diff,
-          sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-          mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-          bottom_diff);
-    caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
-
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., top_diff,
-            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 1.,
-            bottom_diff);
-
-    caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
-        bottom_diff);
-
-    // put the squares of bottom into temp_
-    caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2),
-        temp_.mutable_gpu_data());
-
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-        variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-        temp_.mutable_gpu_data());
-
-    caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
-  } else {
-    caffe_copy(temp_.count(), top_diff, bottom_diff);
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(MVNLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/pooling_layer.cu b/src/caffe/layers/cufiles/pooling_layer.cu
deleted file mode 100644
index ca4b13f7..00000000
--- a/src/caffe/layers/cufiles/pooling_layer.cu
+++ /dev/null
@@ -1,387 +0,0 @@
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void MaxPoolForward(const int nthreads,
-    const Dtype* const bottom_data, const int num, const int channels,
-    const int height, const int width, const int pooled_height,
-    const int pooled_width, const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-    Dtype* const top_data, int* mask, Dtype* top_mask) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    const int hend = min(hstart + kernel_h, height);
-    const int wend = min(wstart + kernel_w, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    Dtype maxval = -FLT_MAX;
-    int maxidx = -1;
-    const Dtype* const bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        if (bottom_slice[h * width + w] > maxval) {
-          maxidx = h * width + w;
-          maxval = bottom_slice[maxidx];
-        }
-      }
-    }
-    top_data[index] = maxval;
-    if (mask) {
-      mask[index] = maxidx;
-    } else {
-      top_mask[index] = maxidx;
-    }
-  }
-}
-
-template <typename Dtype>
-__global__ void AvePoolForward(const int nthreads,
-    const Dtype* const bottom_data, const int num, const int channels,
-    const int height, const int width, const int pooled_height,
-    const int pooled_width, const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-    Dtype* const top_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    int hend = min(hstart + kernel_h, height + pad_h);
-    int wend = min(wstart + kernel_w, width + pad_w);
-    const int pool_size = (hend - hstart) * (wend - wstart);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    hend = min(hend, height);
-    wend = min(wend, width);
-    Dtype aveval = 0;
-    const Dtype* const bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        aveval += bottom_slice[h * width + w];
-      }
-    }
-    top_data[index] = aveval / pool_size;
-  }
-}
-
-template <typename Dtype>
-__global__ void StoPoolForwardTrain(const int nthreads,
-    const Dtype* const bottom_data,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, Dtype* const rand_idx, Dtype* const top_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
-    Dtype cumsum = 0.;
-    const Dtype* const bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
-    // First pass: get sum
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-      }
-    }
-    const float thres = rand_idx[index] * cumsum;
-    // Second pass: get value, and set index.
-    cumsum = 0;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-        if (cumsum >= thres) {
-          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
-          top_data[index] = bottom_slice[h * width + w];
-          return;
-        }
-      }
-    }
-  }
-}
-
-
-template <typename Dtype>
-__global__ void StoPoolForwardTest(const int nthreads,
-    const Dtype* const bottom_data,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, Dtype* const top_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
-    // We set cumsum to be 0 to avoid divide-by-zero problems
-    Dtype cumsum = FLT_MIN;
-    Dtype cumvalues = 0.;
-    const Dtype* const bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
-    // First pass: get sum
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
-      }
-    }
-    top_data[index] = cumvalues / cumsum;
-  }
-}
-
-
-template <typename Dtype>
-void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int count = top[0]->count();
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  int* mask = NULL;
-  Dtype* top_mask = NULL;
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    if (use_top_mask) {
-      top_mask = top[1]->mutable_gpu_data();
-    } else {
-      mask = max_idx_.mutable_gpu_data();
-    }
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
-        mask, top_mask);
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    if (this->phase_ == TRAIN) {
-      // We need to create the random index as well.
-      caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
-                            rand_idx_.mutable_gpu_data());
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      StoPoolForwardTrain<Dtype><<<CAFFE_GET_BLOCKS(count),
-                                   CAFFE_CUDA_NUM_THREADS>>>(
-          count, bottom_data, bottom[0]->num(), channels_,
-          height_, width_, pooled_height_, pooled_width_, kernel_h_,
-          kernel_w_, stride_h_, stride_w_,
-          rand_idx_.mutable_gpu_data(), top_data);
-    } else {
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      StoPoolForwardTest<Dtype><<<CAFFE_GET_BLOCKS(count),
-                                  CAFFE_CUDA_NUM_THREADS>>>(
-          count, bottom_data, bottom[0]->num(), channels_,
-          height_, width_, pooled_height_, pooled_width_, kernel_h_,
-          kernel_w_, stride_h_, stride_w_, top_data);
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
-  CUDA_POST_KERNEL_CHECK;
-}
-
-
-template <typename Dtype>
-__global__ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
-    const int* const mask, const Dtype* const top_mask, const int num,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w, Dtype* const bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local index
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int c = (index / width / height) % channels;
-    const int n = index / width / height / channels;
-    const int phstart =
-         (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
-    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
-    const int pwstart =
-         (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
-    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
-    Dtype gradient = 0;
-    const int offset = (n * channels + c) * pooled_height * pooled_width;
-    const Dtype* const top_diff_slice = top_diff + offset;
-    if (mask) {
-      const int* const mask_slice = mask + offset;
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (mask_slice[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff_slice[ph * pooled_width + pw];
-          }
-        }
-      }
-    } else {
-      const Dtype* const top_mask_slice = top_mask + offset;
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff_slice[ph * pooled_width + pw];
-          }
-        }
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-
-template <typename Dtype>
-__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, const int pad_h, const int pad_w,
-    Dtype* const bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local index
-    // find out the local offset
-    const int w = index % width + pad_w;
-    const int h = (index / width) % height + pad_h;
-    const int c = (index / width / height) % channels;
-    const int n = index / width / height / channels;
-    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-    const int phend = min(h / stride_h + 1, pooled_height);
-    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-    const int pwend = min(w / stride_w + 1, pooled_width);
-    Dtype gradient = 0;
-    const Dtype* const top_diff_slice =
-        top_diff + (n * channels + c) * pooled_height * pooled_width;
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        // figure out the pooling size
-        int hstart = ph * stride_h - pad_h;
-        int wstart = pw * stride_w - pad_w;
-        int hend = min(hstart + kernel_h, height + pad_h);
-        int wend = min(wstart + kernel_w, width + pad_w);
-        int pool_size = (hend - hstart) * (wend - wstart);
-        gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-
-
-template <typename Dtype>
-__global__ void StoPoolBackward(const int nthreads,
-    const Dtype* const rand_idx, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, Dtype* const bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local index
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int c = (index / width / height) % channels;
-    const int n = index / width / height / channels;
-    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-    const int phend = min(h / stride_h + 1, pooled_height);
-    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-    const int pwend = min(w / stride_w + 1, pooled_width);
-    Dtype gradient = 0;
-    const Dtype* const rand_idx_slice =
-        rand_idx + (n * channels + c) * pooled_height * pooled_width;
-    const Dtype* const top_diff_slice =
-        top_diff + (n * channels + c) * pooled_height * pooled_width;
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        gradient += top_diff_slice[ph * pooled_width + pw] *
-            (index == static_cast<int>(rand_idx_slice[ph * pooled_width + pw]));
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-
-
-template <typename Dtype>
-void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  const int count = bottom[0]->count();
-  caffe_gpu_set(count, Dtype(0.), bottom_diff);
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  const int* mask = NULL;
-  const Dtype* top_mask = NULL;
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    if (use_top_mask) {
-      top_mask = top[1]->gpu_data();
-    } else {
-      mask = max_idx_.gpu_data();
-    }
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, mask, top_mask, top[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_,
-        kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
-        bottom_diff);
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, top[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    StoPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, rand_idx_.gpu_data(), top_diff,
-        top[0]->num(), channels_, height_, width_, pooled_height_,
-        pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
-        bottom_diff);
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
-  CUDA_POST_KERNEL_CHECK;
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(PoolingLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/power_layer.cu b/src/caffe/layers/cufiles/power_layer.cu
deleted file mode 100644
index 90d94405..00000000
--- a/src/caffe/layers/cufiles/power_layer.cu
+++ /dev/null
@@ -1,87 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // Special case where we can ignore the input: scale or power is 0.
-  if (diff_scale_ == Dtype(0)) {
-    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
-    caffe_gpu_set(count, value, top_data);
-    return;
-  }
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  caffe_copy(count, bottom_data, top_data);
-  if (scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, scale_, top_data);
-  }
-  if (shift_ != Dtype(0)) {
-    caffe_gpu_add_scalar(count, shift_, top_data);
-  }
-  if (power_ != Dtype(1)) {
-    caffe_gpu_powx(count, top_data, power_, top_data);
-  }
-}
-
-template <typename Dtype>
-void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
-      caffe_gpu_set(count, diff_scale_, bottom_diff);
-    } else {
-      const Dtype* bottom_data = bottom[0]->gpu_data();
-      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
-      //               = diff_scale * y / (shift + scale * x)
-      if (power_ == Dtype(2)) {
-        // Special case for y = (shift + scale * x)^2
-        //     -> dy/dx = 2 * scale * (shift + scale * x)
-        //              = diff_scale * shift + diff_scale * scale * x
-        caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
-            Dtype(0), bottom_diff);
-        if (shift_ != Dtype(0)) {
-          caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
-        }
-      } else if (shift_ == Dtype(0)) {
-        // Special case for y = (scale * x)^power
-        //     -> dy/dx = scale * power * (scale * x)^(power - 1)
-        //              = scale * power * (scale * x)^power * (scale * x)^(-1)
-        //              = power * y / x
-        const Dtype* top_data = top[0]->gpu_data();
-        caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
-        caffe_gpu_scal(count, power_, bottom_diff);
-      } else {
-        caffe_copy(count, bottom_data, bottom_diff);
-        if (scale_ != Dtype(1)) {
-          caffe_gpu_scal(count, scale_, bottom_diff);
-        }
-        if (shift_ != Dtype(0)) {
-          caffe_gpu_add_scalar(count, shift_, bottom_diff);
-        }
-        const Dtype* top_data = top[0]->gpu_data();
-        caffe_gpu_div<Dtype>(count, top_data, bottom_diff, bottom_diff);
-        if (diff_scale_ != Dtype(1)) {
-          caffe_gpu_scal(count, diff_scale_, bottom_diff);
-        }
-      }
-    }
-    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(PowerLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/prelu_layer.cu b/src/caffe/layers/cufiles/prelu_layer.cu
deleted file mode 100644
index e1f20048..00000000
--- a/src/caffe/layers/cufiles/prelu_layer.cu
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-// CUDA kernele for forward
-template <typename Dtype>
-__global__ void PReLUForward(const int n, const int channels, const int dim,
-    const Dtype* in, Dtype* out, const Dtype* slope_data,
-    const int div_factor) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int c = (index / dim) % channels / div_factor;
-    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
-  }
-}
-
-// CUDA kernel for bottom backward
-template <typename Dtype>
-__global__ void PReLUBackward(const int n, const int channels, const int dim,
-    const Dtype* in_diff, const Dtype* in_data, Dtype* out_diff,
-    const Dtype* slope_data, const int div_factor) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int c = (index / dim) % channels / div_factor;
-    out_diff[index] = in_diff[index] * ((in_data[index] > 0)
-        + (in_data[index] <= 0) * slope_data[c]);
-  }
-}
-
-// CUDA kernel for element-wise parameter backward
-template <typename Dtype>
-__global__ void PReLUParamBackward(const int n, const Dtype* in_diff,
-    const Dtype* in_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
-  }
-}
-
-template <typename Dtype>
-void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-  const Dtype* slope_data = this->blobs_[0]->gpu_data();
-  const int div_factor = channel_shared_ ? channels : 1;
-
-  // For in-place computation
-  if (top[0] == bottom[0]) {
-    caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
-  }
-
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  PReLUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, channels, dim, bottom_data, top_data, slope_data, div_factor);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-template <typename Dtype>
-void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-
-  // For in-place computation
-  if (top[0] == bottom[0]) {
-    bottom_data = bottom_memory_.gpu_data();
-  }
-
-  // Propagate to param
-  // Since to write bottom diff will affect top diff if top and bottom blobs
-  // are identical (in-place computaion), we first compute param backward to
-  // keep top_diff unchanged.
-  if (this->param_propagate_down_[0]) {
-    Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
-    int cdim = channels * dim;
-    Dtype dsum = 0.;
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      // compute element-wise diff
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
-          CAFFE_CUDA_NUM_THREADS>>>(
-          cdim, top_diff + top[0]->offset(n),
-          bottom_data + bottom[0]->offset(n),
-          backward_buff_.mutable_gpu_diff());
-      CUDA_POST_KERNEL_CHECK;
-      if (channel_shared_) {
-        Dtype d;
-        caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
-            multiplier_.gpu_data(), &d);
-        dsum += d;
-      } else {
-        caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
-            backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
-            slope_diff);
-      }
-    }
-    if (channel_shared_) {
-      caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
-    }
-  }
-  // Propagate to bottom
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const Dtype* slope_data = this->blobs_[0]->gpu_data();
-    int div_factor = channel_shared_ ? channels : 1;
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    PReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
-        CAFFE_CUDA_NUM_THREADS>>>(
-        count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data,
-        div_factor);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(PReLULayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/reduction_layer.cu b/src/caffe/layers/cufiles/reduction_layer.cu
deleted file mode 100644
index 2dbd3bc9..00000000
--- a/src/caffe/layers/cufiles/reduction_layer.cu
+++ /dev/null
@@ -1,93 +0,0 @@
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ReductionLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const Dtype* mult_data = NULL;
-  if (sum_multiplier_.count() > 0) {
-    mult_data = sum_multiplier_.gpu_data();
-  }
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  for (int i = 0; i < num_; ++i) {
-    switch (op_) {
-    case ReductionParameter_ReductionOp_SUM:
-    case ReductionParameter_ReductionOp_MEAN:
-      caffe_gpu_dot(dim_, mult_data, bottom_data, top_data);
-      break;
-    case ReductionParameter_ReductionOp_ASUM:
-      caffe_gpu_asum(dim_, bottom_data, top_data);
-      break;
-    case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data);
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduction op: "
-          << ReductionParameter_ReductionOp_Name(op_);
-    }
-    bottom_data += dim_;
-    ++top_data;
-  }
-  if (coeff_ != Dtype(1)) {
-    // Reset the top_data pointer.
-    top_data = top[0]->mutable_gpu_data();
-    caffe_gpu_scal(num_, coeff_, top_data);
-  }
-}
-
-template <typename Dtype>
-void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  // Get bottom_data, if needed.
-  const Dtype* bottom_data = NULL;
-  switch (op_) {
-  // Operations that don't need bottom_data
-  case ReductionParameter_ReductionOp_SUM:
-  case ReductionParameter_ReductionOp_MEAN:
-    break;
-  // Operations that need bottom_data
-  case ReductionParameter_ReductionOp_ASUM:
-  case ReductionParameter_ReductionOp_SUMSQ:
-    bottom_data = bottom[0]->gpu_data();
-    break;
-  default:
-    LOG(FATAL) << "Unknown reduction op: "
-        << ReductionParameter_ReductionOp_Name(op_);
-  }
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  for (int i = 0; i < num_; ++i) {
-    const Dtype bottom_coeff = (*top_diff) * coeff_;
-    switch (op_) {
-    case ReductionParameter_ReductionOp_SUM:
-    case ReductionParameter_ReductionOp_MEAN:
-      caffe_gpu_set(dim_, bottom_coeff, bottom_diff);
-      break;
-    case ReductionParameter_ReductionOp_ASUM:
-      caffe_gpu_sign(dim_, bottom_data, bottom_diff);
-      caffe_gpu_scal(dim_, bottom_coeff, bottom_diff);
-      break;
-    case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduction op: "
-          << ReductionParameter_ReductionOp_Name(op_);
-    }
-    bottom_data += dim_;
-    bottom_diff += dim_;
-    ++top_diff;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ReductionLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/relu_layer.cu b/src/caffe/layers/cufiles/relu_layer.cu
deleted file mode 100644
index b8924c85..00000000
--- a/src/caffe/layers/cufiles/relu_layer.cu
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void ReLUForward(const int n, const Dtype* in, Dtype* out,
-    Dtype negative_slope) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
-  }
-}
-
-template <typename Dtype>
-void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  ReLUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data, negative_slope);
-  CUDA_POST_KERNEL_CHECK;
-  // << " count: " << count << " bottom_data: "
-  //     << (unsigned long)bottom_data
-  //     << " top_data: " << (unsigned long)top_data
-  //     << " blocks: " << CAFFE_GET_BLOCKS(count)
-  //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
-}
-
-template <typename Dtype>
-__global__ void ReLUBackward(const int n, const Dtype* in_diff,
-    const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out_diff[index] = in_diff[index] * ((in_data[index] > 0)
-        + (in_data[index] <= 0) * negative_slope);
-  }
-}
-
-template <typename Dtype>
-void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    ReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, bottom_data, bottom_diff, negative_slope);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
deleted file mode 100644
index 547fa80c..00000000
--- a/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    // First, compute the diff
-    const int count = bottom[0]->count();
-    const int num = bottom[0]->num();
-    const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
-    const Dtype* target = bottom[1]->gpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_copy(count, sigmoid_output_data, bottom_diff);
-    caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
-    // Scale down gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    caffe_gpu_scal(count, loss_weight / num, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/sigmoid_layer.cu b/src/caffe/layers/cufiles/sigmoid_layer.cu
deleted file mode 100644
index e1af0657..00000000
--- a/src/caffe/layers/cufiles/sigmoid_layer.cu
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <algorithm>
-#include <cmath>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = 1. / (1. + exp(-in[index]));
-  }
-}
-
-template <typename Dtype>
-void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  SigmoidForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-  // << " count: " << count << " bottom_data: "
-  //     << (unsigned long)bottom_data
-  //     << " top_data: " << (unsigned long)top_data
-  //     << " blocks: " << CAFFE_GET_BLOCKS(count)
-  //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
-}
-
-template <typename Dtype>
-__global__ void SigmoidBackward(const int n, const Dtype* in_diff,
-    const Dtype* out_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    const Dtype sigmoid_x = out_data[index];
-    out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
-  }
-}
-
-template <typename Dtype>
-void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    SigmoidBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, top_data, bottom_diff);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SigmoidLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/silence_layer.cu b/src/caffe/layers/cufiles/silence_layer.cu
deleted file mode 100644
index 8d044ee7..00000000
--- a/src/caffe/layers/cufiles/silence_layer.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <vector>
-
-#include "caffe/common_layers.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  // Do nothing.
-}
-
-template <typename Dtype>
-void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (propagate_down[i]) {
-      caffe_gpu_set(bottom[i]->count(), Dtype(0),
-                    bottom[i]->mutable_gpu_data());
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SilenceLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/slice_layer.cu b/src/caffe/layers/cufiles/slice_layer.cu
deleted file mode 100644
index 796841d3..00000000
--- a/src/caffe/layers/cufiles/slice_layer.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void Slice(const int nthreads, const Dtype* in_data,
-    const bool forward, const int num_slices, const int slice_size,
-    const int bottom_slice_axis, const int top_slice_axis,
-    const int offset_slice_axis, Dtype* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int total_slice_size = slice_size * top_slice_axis;
-    const int slice_num = index / total_slice_size;
-    const int slice_index = index % total_slice_size;
-    const int bottom_index = slice_index +
-        (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
-    if (forward) {
-      out_data[index] = in_data[bottom_index];
-    } else {
-      out_data[bottom_index] = in_data[index];
-    }
-  }
-}
-
-template <typename Dtype>
-void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  int offset_slice_axis = 0;
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-  const bool kForward = true;
-  for (int i = 0; i < top.size(); ++i) {
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    const int top_slice_axis = top[i]->shape(slice_axis_);
-    const int top_slice_size = top_slice_axis * slice_size_;
-    const int nthreads = top_slice_size * num_slices_;
-    Slice<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, bottom_data, kForward, num_slices_, slice_size_,
-        bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data);
-    offset_slice_axis += top_slice_axis;
-  }
-}
-
-template <typename Dtype>
-void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  int offset_slice_axis = 0;
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-  const bool kForward = false;
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    const int top_slice_axis = top[i]->shape(slice_axis_);
-    const int top_slice_size = top_slice_axis * slice_size_;
-    const int nthreads = top_slice_size * num_slices_;
-    Slice<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, top_diff, kForward, num_slices_, slice_size_,
-        bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff);
-    offset_slice_axis += top_slice_axis;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SliceLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/softmax_layer.cu b/src/caffe/layers/cufiles/softmax_layer.cu
deleted file mode 100644
index 1f9c3a41..00000000
--- a/src/caffe/layers/cufiles/softmax_layer.cu
+++ /dev/null
@@ -1,149 +0,0 @@
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "thrust/device_vector.h"
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, const Dtype* data, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    Dtype maxval = -FLT_MAX;
-    for (int c = 0; c < channels; ++c) {
-      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
-    }
-    out[index] = maxval;
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_channel_subtract(const int count,
-    const int num, const int channels,
-    const int spatial_dim, const Dtype* channel_max, Dtype* data) {
-  CUDA_KERNEL_LOOP(index, count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] -= channel_max[n * spatial_dim + s];
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_exp(const int count, const Dtype* data, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, count) {
-    out[index] = exp(data[index]);
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, const Dtype* data, Dtype* channel_sum) {
-  CUDA_KERNEL_LOOP(index, num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    Dtype sum = 0;
-    for (int c = 0; c < channels; ++c) {
-      sum += data[(n * channels + c) * spatial_dim + s];
-    }
-    channel_sum[index] = sum;
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, const Dtype* channel_sum, Dtype* data) {
-  CUDA_KERNEL_LOOP(index, count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] /= channel_sum[n * spatial_dim + s];
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
-    Dtype* channel_dot) {
-  CUDA_KERNEL_LOOP(index, num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    Dtype dot = 0;
-    for (int c = 0; c < channels; ++c) {
-      dot += (data_1[(n * channels + c) * spatial_dim + s]
-          * data_2[(n * channels + c) * spatial_dim + s]);
-    }
-    channel_dot[index] = dot;
-  }
-}
-
-template <typename Dtype>
-void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  int count = bottom[0]->count();
-  int channels = top[0]->shape(softmax_axis_);
-  caffe_copy(count, bottom_data, top_data);
-  // We need to subtract the max to avoid numerical issues, compute the exp,
-  // and then normalize.
-  // compute max
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_max<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
-      CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data,
-      scale_data);
-  // subtract
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
-      CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
-      scale_data, top_data);
-  // exponentiate
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_exp<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, top_data, top_data);
-  // sum after exp
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_sum<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
-      CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data,
-      scale_data);
-  // divide
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(count),
-      CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
-      scale_data, top_data);
-}
-
-template <typename Dtype>
-void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  int count = top[0]->count();
-  int channels = top[0]->shape(softmax_axis_);
-  caffe_copy(count, top_diff, bottom_diff);
-  // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_dot<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
-      CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_,
-      top_diff, top_data, scale_data);
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
-      CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
-      scale_data, bottom_diff);
-  // elementwise multiplication
-  caffe_gpu_mul<Dtype>(top[0]->count(), bottom_diff, top_data, bottom_diff);
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/softmax_loss_layer.cu b/src/caffe/layers/cufiles/softmax_loss_layer.cu
deleted file mode 100644
index 7e0f3da4..00000000
--- a/src/caffe/layers/cufiles/softmax_loss_layer.cu
+++ /dev/null
@@ -1,125 +0,0 @@
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void SoftmaxLossForwardGPU(const int nthreads,
-          const Dtype* prob_data, const Dtype* label, Dtype* loss,
-          const int num, const int dim, const int spatial_dim,
-          const bool has_ignore_label_, const int ignore_label_,
-          Dtype* counts) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int n = index / spatial_dim;
-    const int s = index % spatial_dim;
-    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
-    if (has_ignore_label_ && label_value == ignore_label_) {
-      loss[index] = 0;
-      counts[index] = 0;
-    } else {
-      loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
-                      Dtype(FLT_MIN)));
-      counts[index] = 1;
-    }
-  }
-}
-
-template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
-  const Dtype* prob_data = prob_.gpu_data();
-  const Dtype* label = bottom[1]->gpu_data();
-  const int dim = prob_.count() / outer_num_;
-  const int nthreads = outer_num_ * inner_num_;
-  // Since this memory is not used for anything until it is overwritten
-  // on the backward pass, we use it here to avoid having to allocate new GPU
-  // memory to accumulate intermediate results in the kernel.
-  Dtype* loss_data = bottom[0]->mutable_gpu_diff();
-  // Similarly, this memory is never used elsewhere, and thus we can use it
-  // to avoid having to allocate additional GPU memory.
-  Dtype* counts = prob_.mutable_gpu_diff();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  SoftmaxLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
-      CAFFE_CUDA_NUM_THREADS>>>(nthreads, prob_data, label, loss_data,
-      outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-  Dtype loss;
-  caffe_gpu_asum(nthreads, loss_data, &loss);
-  if (normalize_) {
-    Dtype count;
-    caffe_gpu_asum(nthreads, counts, &count);
-    loss /= count;
-  } else {
-    loss /= outer_num_;
-  }
-  top[0]->mutable_cpu_data()[0] = loss;
-  if (top.size() == 2) {
-    top[1]->ShareData(prob_);
-  }
-}
-
-template <typename Dtype>
-__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
-          const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
-          const int spatial_dim, const bool has_ignore_label_,
-          const int ignore_label_, Dtype* counts) {
-  const int channels = dim / spatial_dim;
-
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int n = index / spatial_dim;
-    const int s = index % spatial_dim;
-    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
-
-    if (has_ignore_label_ && label_value == ignore_label_) {
-      for (int c = 0; c < channels; ++c) {
-        bottom_diff[n * dim + c * spatial_dim + s] = 0;
-      }
-      counts[index] = 0;
-    } else {
-      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
-      counts[index] = 1;
-    }
-  }
-}
-
-template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const Dtype* prob_data = prob_.gpu_data();
-    const Dtype* top_data = top[0]->gpu_data();
-    caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
-    const Dtype* label = bottom[1]->gpu_data();
-    const int dim = prob_.count() / outer_num_;
-    const int nthreads = outer_num_ * inner_num_;
-    // Since this memory is never used for anything else,
-    // we use to to avoid allocating new GPU memory.
-    Dtype* counts = prob_.mutable_gpu_diff();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
-        CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
-        outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    if (normalize_) {
-      Dtype count;
-      caffe_gpu_asum(nthreads, counts, &count);
-      caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
-    } else {
-      caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/split_layer.cu b/src/caffe/layers/cufiles/split_layer.cu
deleted file mode 100644
index a4f5df26..00000000
--- a/src/caffe/layers/cufiles/split_layer.cu
+++ /dev/null
@@ -1,38 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  for (int i = 0; i < top.size(); ++i) {
-    top[i]->ShareData(*bottom[0]);
-  }
-}
-
-template <typename Dtype>
-void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  if (top.size() == 1) {
-    caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
-    return;
-  }
-  caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
-                bottom[0]->mutable_gpu_diff());
-  // Add remaining top blob diffs.
-  for (int i = 2; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(SplitLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/tanh_layer.cu b/src/caffe/layers/cufiles/tanh_layer.cu
deleted file mode 100644
index ccd6e63e..00000000
--- a/src/caffe/layers/cufiles/tanh_layer.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-// TanH neuron activation function layer.
-// Adapted from ReLU layer code written by Yangqing Jia
-
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void TanHForward(const int n, const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = tanh(in[index]);
-  }
-}
-
-template <typename Dtype>
-void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  TanHForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-template <typename Dtype>
-__global__ void TanHBackward(const int n, const Dtype* in_diff,
-    const Dtype* out_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    Dtype tanhx = out_data[index];
-    out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
-  }
-}
-
-template <typename Dtype>
-void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    TanHBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, top_data, bottom_diff);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(TanHLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/threshold_layer.cu b/src/caffe/layers/cufiles/threshold_layer.cu
deleted file mode 100644
index bfa7f159..00000000
--- a/src/caffe/layers/cufiles/threshold_layer.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void ThresholdForward(const int n, const Dtype threshold,
-    const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] > threshold ? 1 : 0;
-  }
-}
-
-template <typename Dtype>
-void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  ThresholdForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, threshold_, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-
-INSTANTIATE_LAYER_GPU_FORWARD(ThresholdLayer);
-
-
-}  // namespace caffe

From b6b96a7471e7b9d1db132044c421bf1452e4d314 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Wed, 2 Sep 2015 06:12:20 +0800
Subject: [PATCH 046/124] Removed forward_opt and backward_opt functions in
 conv layer

---
 include/caffe/vision_layers.hpp      |   8 --
 src/caffe/layers/base_conv_layer.cpp | 137 ---------------------------
 src/caffe/layers/conv_layer.cpp      |  13 ---
 3 files changed, 158 deletions(-)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 2f2d7eef..3ee5a779 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -140,10 +140,6 @@ class BaseConvolutionLayer : public Layer<Dtype> {
 
 //opencl related data structures
 protected:
-  void forward_gpu_opt(const vector<Blob<Dtype>*>& bottom, const Dtype* weight, 
-      const vector<Blob<Dtype>*>& top,  bool skip_im2col = false) ;
-  void backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   int opt_num2;
   int M_, N_, K_;
   int weight_offset_;
@@ -223,12 +219,8 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
       const vector<Blob<Dtype>*>& top);
   virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 19458185..fc541ef9 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -448,143 +448,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
           bias, (size_t)0, 1);
 }
 
-
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bottom, const Dtype* weight, const vector<Blob<Dtype>*>& top, bool skip_im2col){
-
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-  int M_org = M_ * group_;
-  int col_offset = K_ * N_;
-  int top_offset = M_ * N_;
-  int weight_offset = M_ * K_;
-  int opt_num2 = global_packing_N;
-  cl_command_queue Queue;
-  cl_event prof_event;
-  for (int n = 0; n < num_; n += opt_num2) {
-    opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
-    top_offset = M_ * N_ * opt_num2;
-    col_offset = K_ * N_ * opt_num2;
-    im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
-                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
-
-#ifdef multiQ
-    for (int g = 0; g < group_; ++g) {
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
-       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-          (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset * g);
-       }
-     if(group_ == 2){
-       clFinish(amdDevice.CommandQueue);
-       clFinish(amdDevice.CommandQueue_helper);
-     }
-#else
-    Queue = amdDevice.CommandQueue;
-    for (int g = 0; g < group_; ++g) {
-       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-          (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset * g);
-       }
-#endif
-    transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2);
-
-   for (int z = 0; z < opt_num2; z++)
-      if (bias_term_) {
-      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
-          N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), 0,
-          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-          (Dtype)1., top_data, top[i]->offset(n) + num_output_ * N_ * z);
-    }
-  }
-}
-}
-
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
-    for (int n = 0; n < num_; ++n) {
-      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
-          (Dtype)1., top_diff, top[i]->offset(n), N_,
-          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
-          bias_diff, (size_t)0, 1);
-     }
-   }
-
- if (this->param_propagate_down_[0] || propagate_down[i]) {
-  const Dtype* bottom_data = bottom[i]->gpu_data();
-  Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-  int col_offset = K_ * N_;
-  int top_offset = M_ * N_;
-  int weight_offset = M_ * K_;
-  int opt_num2 = global_packing_N;
-  int g = 0;
-  cl_command_queue Queue;
-  cl_event prof_event;
-
-  for (int n = 0; n < num_; n += opt_num2) {
-    opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
-    top_offset = M_ * (N_ * opt_num2);
-    col_offset = K_ * (N_ * opt_num2);
-    im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
-                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
-
-    int height_top = M_ * group_, width_top = N_;
-    opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
-
-    for(g = 0; g < group_; ++g) {
-#ifdef multiQ
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
-#else
-       Queue =  amdDevice.CommandQueue;
-#endif
-       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
-        (Dtype)1., (Dtype*)subTopMem, top_offset * g,
-        (Dtype*)transMem, col_offset * g, (Dtype)1.,
-        (Dtype*)weight_diff, weight_offset * g);
-    }
-
-   if (propagate_down[i]) {
-      for (g = 0; g < group_; ++g) {
-#ifdef multiQ
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
-#else
-       Queue =  amdDevice.CommandQueue;
-#endif
-       prof_event =  caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_,
-          (Dtype)1., weight,  weight_offset * g,
-          (Dtype*)subTopMem, top_offset * g,
-          (Dtype)0., (Dtype*)transMem, col_offset * g);
-      }
-    }
-
-#ifdef multiQ
-   if(group_ ==2){
-      clFinish(amdDevice.CommandQueue);
-      clFinish(amdDevice.CommandQueue_helper);
-    }
-#endif
-       col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
-                  stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2);
-   }
-  }
- }
-}
-
 #endif  // !CPU_ONLY
 
 INSTANTIATE_CLASS(BaseConvolutionLayer);
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 020098aa..c829dbd7 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -87,13 +87,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       Backward_gpu_org(top, propagate_down, bottom);
 }
 
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  this->forward_gpu_opt(bottom, weight, top);
-}
-
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -153,12 +146,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
   //CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-      this->backward_gpu_opt(top, propagate_down, bottom);
-}
-
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

From 20142c4a4bf5f74c37a5253468f2b8de33b4e5d0 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Fri, 4 Sep 2015 02:11:04 +0800
Subject: [PATCH 047/124] Enable SetDevice function; clean the code in
 device.cpp

---
 include/caffe/common.hpp        |   1 -
 include/caffe/device.hpp        |  10 +-
 src/caffe/common.cpp            | 119 +---------------
 src/caffe/device.cpp            | 242 +++++++++-----------------------
 src/caffe/layers/conv_layer.cpp |  29 ++--
 5 files changed, 95 insertions(+), 306 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index b93e0d6d..ac954a0e 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -233,7 +233,6 @@ class Caffe {
   // it personally but better to note it here in the header file.
   inline static void set_mode(Brew mode) { 
     Get().mode_ = mode;
-    amdDevice.Init();
   }
   // Sets the random seed of both boost and curand
   static void set_random_seed(const unsigned int seed);
diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 31adcb5f..697e2391 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -34,7 +34,7 @@ namespace caffe {
 
 class Device{
 public:
-    Device():numPlatforms(0),numDevices(0){ }
+    Device():numPlatforms(0),numDevices(0),device_id(INT_MIN){}
     ~Device();
     cl_uint numPlatforms;
     cl_platform_id * platformIDs;
@@ -42,22 +42,26 @@ class Device{
     char openclVersion[64];
     cl_uint numDevices;
     cl_device_id * DeviceIDs;
+   
     cl_context Context;
     cl_command_queue CommandQueue;
     cl_command_queue CommandQueue_helper;
     cl_program Program; 
     cl_device_id * pDevices;
+    int device_id;
+
     clblasOrder col;
     clblasOrder row;
     std::map<std::string, cl_kernel> Kernels;    
-     
-    cl_int Init(); 
+         
+    cl_int Init(int device_id = -1); 
     cl_int ConvertToString(std::string pFileName,std::string &Str);
     void DisplayPlatformInfo();
     void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
 
     void GetDeviceInfo();
     void DeviceQuery();    
+    int GetDevice(){return device_id;};
     void BuildProgram(std::string kernel_dir);    
 
     template <typename T>
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 3891852a..83afe272 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -108,10 +108,12 @@ Caffe::Caffe()
     LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
   }
 */
+#ifndef CPU_ONLY
    cl_int err =  clblasSetup();
    if(err != CL_SUCCESS){
        LOG(ERROR) << "clBLAS setup failed "<<err;
    }
+#endif
 }
 
 Caffe::~Caffe() {
@@ -120,7 +122,9 @@ Caffe::~Caffe() {
     CURAND_CHECK(curandDestroyGenerator(curand_generator_));
   }
 */
+#ifndef CPU_ONLY
    clblasTeardown();
+#endif
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
@@ -143,60 +147,13 @@ void Caffe::set_random_seed(const unsigned int seed) {
 }
 
 void Caffe::SetDevice(const int device_id) {
- /* int current_device;
-  CUDA_CHECK(cudaGetDevice(&current_device));
-  if (current_device == device_id) {
+  if (amdDevice.GetDevice() == device_id) {
     return;
   }
-  // The call to cudaSetDevice must come before any calls to Get, which
-  // may perform initialization using the GPU.
-  CUDA_CHECK(cudaSetDevice(device_id));
-  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
-  if (Get().curand_generator_) {
-    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
-  }
-  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
-  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
-      CURAND_RNG_PSEUDO_DEFAULT));
-  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
-      cluster_seedgen()));
-*/
+  amdDevice.Init(device_id);
 }
 
 void Caffe::DeviceQuery() {
-  /*cudaDeviceProp prop;
-  int device;
-  if (cudaSuccess != cudaGetDevice(&device)) {
-    printf("No cuda device present.\n");
-    return;
-  }
-  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-  LOG(INFO) << "Device id:                     " << device;
-  LOG(INFO) << "Major revision number:         " << prop.major;
-  LOG(INFO) << "Minor revision number:         " << prop.minor;
-  LOG(INFO) << "Name:                          " << prop.name;
-  LOG(INFO) << "Total global memory:           " << prop.totalGlobalMem;
-  LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock;
-  LOG(INFO) << "Total registers per block:     " << prop.regsPerBlock;
-  LOG(INFO) << "Warp size:                     " << prop.warpSize;
-  LOG(INFO) << "Maximum memory pitch:          " << prop.memPitch;
-  LOG(INFO) << "Maximum threads per block:     " << prop.maxThreadsPerBlock;
-  LOG(INFO) << "Maximum dimension of block:    "
-      << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
-      << prop.maxThreadsDim[2];
-  LOG(INFO) << "Maximum dimension of grid:     "
-      << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
-      << prop.maxGridSize[2];
-  LOG(INFO) << "Clock rate:                    " << prop.clockRate;
-  LOG(INFO) << "Total constant memory:         " << prop.totalConstMem;
-  LOG(INFO) << "Texture alignment:             " << prop.textureAlignment;
-  LOG(INFO) << "Concurrent copy and execution: "
-      << (prop.deviceOverlap ? "Yes" : "No");
-  LOG(INFO) << "Number of multiprocessors:     " << prop.multiProcessorCount;
-  LOG(INFO) << "Kernel execution timeout:      "
-      << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
-  return;
-*/
   amdDevice.DeviceQuery();
 }
 
@@ -223,70 +180,6 @@ void* Caffe::RNG::generator() {
   return static_cast<void*>(generator_->rng());
 }
 
-//const char* cublasGetErrorString(cublasStatus_t error) {
- /* switch (error) {
-  case CUBLAS_STATUS_SUCCESS:
-    return "CUBLAS_STATUS_SUCCESS";
-  case CUBLAS_STATUS_NOT_INITIALIZED:
-    return "CUBLAS_STATUS_NOT_INITIALIZED";
-  case CUBLAS_STATUS_ALLOC_FAILED:
-    return "CUBLAS_STATUS_ALLOC_FAILED";
-  case CUBLAS_STATUS_INVALID_VALUE:
-    return "CUBLAS_STATUS_INVALID_VALUE";
-  case CUBLAS_STATUS_ARCH_MISMATCH:
-    return "CUBLAS_STATUS_ARCH_MISMATCH";
-  case CUBLAS_STATUS_MAPPING_ERROR:
-    return "CUBLAS_STATUS_MAPPING_ERROR";
-  case CUBLAS_STATUS_EXECUTION_FAILED:
-    return "CUBLAS_STATUS_EXECUTION_FAILED";
-  case CUBLAS_STATUS_INTERNAL_ERROR:
-    return "CUBLAS_STATUS_INTERNAL_ERROR";
-#if CUDA_VERSION >= 6000
-  case CUBLAS_STATUS_NOT_SUPPORTED:
-    return "CUBLAS_STATUS_NOT_SUPPORTED";
-#endif
-#if CUDA_VERSION >= 6050
-  case CUBLAS_STATUS_LICENSE_ERROR:
-    return "CUBLAS_STATUS_LICENSE_ERROR";
-#endif
-  }
-*/
-//  return "Unknown cublas status";
-//}
-
-//const char* curandGetErrorString(curandStatus_t error) {
-  /*switch (error) {
-  case CURAND_STATUS_SUCCESS:
-    return "CURAND_STATUS_SUCCESS";
-  case CURAND_STATUS_VERSION_MISMATCH:
-    return "CURAND_STATUS_VERSION_MISMATCH";
-  case CURAND_STATUS_NOT_INITIALIZED:
-    return "CURAND_STATUS_NOT_INITIALIZED";
-  case CURAND_STATUS_ALLOCATION_FAILED:
-    return "CURAND_STATUS_ALLOCATION_FAILED";
-  case CURAND_STATUS_TYPE_ERROR:
-    return "CURAND_STATUS_TYPE_ERROR";
-  case CURAND_STATUS_OUT_OF_RANGE:
-    return "CURAND_STATUS_OUT_OF_RANGE";
-  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-  case CURAND_STATUS_LAUNCH_FAILURE:
-    return "CURAND_STATUS_LAUNCH_FAILURE";
-  case CURAND_STATUS_PREEXISTING_FAILURE:
-    return "CURAND_STATUS_PREEXISTING_FAILURE";
-  case CURAND_STATUS_INITIALIZATION_FAILED:
-    return "CURAND_STATUS_INITIALIZATION_FAILED";
-  case CURAND_STATUS_ARCH_MISMATCH:
-    return "CURAND_STATUS_ARCH_MISMATCH";
-  case CURAND_STATUS_INTERNAL_ERROR:
-    return "CURAND_STATUS_INTERNAL_ERROR";
-  }
-*/
- // return "Unknown curand status";
-//}
-
 #endif  // CPU_ONLY
 
 }  // namespace caffe
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 7a866c11..dc47e907 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -37,25 +37,25 @@ namespace caffe {
 cl_mem test_alloc_mem[10];
 extern long long unsigned device_mem_consumption;
 
-Device amdDevice;
 char* buildOption = "-x clc++ ";
 //char* buildOption = "-x clc++, -hsail-reg-slots=8-Wb, -hsail-reg32-pressure-limit=64-Wb, -hsail-reg64-pressure-limit=64";
-std::string oclKernelPath="./src/caffe/ocl/";
+std::string oclKernelPath = "./src/caffe/ocl/";
+Device amdDevice;
 
 Device::~Device(){
     //clAmdBlasTeardown(); 
     ReleaseKernels(); 
     free((void*)platformIDs);
-     free(DeviceIDs);
-     clReleaseProgram(Program);
-     clReleaseCommandQueue(CommandQueue);
-     clReleaseCommandQueue(CommandQueue_helper);
-     clReleaseContext(Context);
-     LOG(INFO) << "device destructor";
+    free(DeviceIDs);
+    clReleaseProgram(Program);
+    clReleaseCommandQueue(CommandQueue);
+    clReleaseCommandQueue(CommandQueue_helper);
+    clReleaseContext(Context);
+    LOG(INFO) << "device destructor";
 }
 
 
-cl_int Device::Init(){
+cl_int Device::Init(int deviceId){
 
     //Get Platform Infomation
     DisplayPlatformInfo();
@@ -75,63 +75,36 @@ cl_int Device::Init(){
     GetDeviceInfo();
     cl_uint uiNumDevices;
     cl_bool unified_memory = false;
-/*    switch(Caffe::mode()) {
-    case Caffe::GPU:
-         //choose_gpu();
-      clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
-      uiNumDevices = numDevices;
-      if(0 == uiNumDevices){
+    clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+    uiNumDevices = numDevices;
+    if(0 == uiNumDevices){
         LOG(FATAL) << "Err: No GPU devices";
-       }
-       else{
+    } else {
         pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
         OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices));
-        for (int i = 0; i < (int)uiNumDevices; i++){
-          clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL);
-          if(unified_memory) //skip iGPU
-            continue;
-          else {//we pick the first GPU we found
-           pDevices[0] = pDevices[i];
+        if (deviceId == -1) { 
+            int i;
+	    for (i = 0; i < (int)uiNumDevices; i++){
+                clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL);
+                if(!unified_memory) { //skip iGPU
+                    //we pick the first dGPU we found
+                    pDevices[0] = pDevices[i];
+                    device_id = i;
+                    LOG(INFO) << "Picked default device type : dGPU "<<device_id;
+                    break;
+                }
+            }
+	    if (i == uiNumDevices) {
+                LOG(FATAL) << "Cannot find any dGPU! ";
             }
-         }
-       }
-         LOG(INFO) << "picked device type: GPU";
-         break;
-    case Caffe::CPU:
-         //choose_cpu();
-         clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);
-         uiNumDevices = numDevices;
-        if(0 == uiNumDevices){
-          LOG(FATAL) << "Err: No CPU devices";
-          }
-         pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
-         OCL_CHECK( clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_CPU, 1, pDevices, NULL) );
-         LOG(INFO) << "picked device type: CPU";
-         break;
-*/  
-//  case Caffe::APU:
-        clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
-        uiNumDevices = numDevices;
-        if(0 == uiNumDevices){
-          LOG(FATAL) << "Err: No GPU devices";
-         }
-         else{
-          pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
-          OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices));
-          for (int i = 0; i < (int)uiNumDevices; i++){
-            clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL);
-             if(unified_memory) //we pick the first GPU we found
-              pDevices[0] = pDevices[i];
-             else {//skip dGPU
-               continue;
-               }
-         }
-       }
-         LOG(INFO) << "picked device type: APU";
-  //       break;
-  //  default:
-  //       LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  //  }
+        } else if (deviceId >=0 && deviceId < uiNumDevices){
+            pDevices[0] = pDevices[deviceId];
+            device_id = deviceId;
+            LOG(INFO) << "Picked device type : GPU "<<device_id;
+        } else {
+            LOG(FATAL) << "  Invalid GPU deviceId! ";
+        }
+   }
 
     //Create Context
     Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
@@ -139,7 +112,6 @@ cl_int Device::Init(){
         fprintf(stderr,"Err: Failed to Create Context\n");
         return 0;
     }
-
     //Create CommandQueue
     CommandQueue = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
     CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
@@ -147,58 +119,15 @@ cl_int Device::Init(){
         fprintf(stderr,"Err: Failed to Create Commandqueue\n");
         return 0;
     }
-   
-  
     //BuildProgram from OpenCL kernel files
     BuildProgram(oclKernelPath);
-
     row = clblasRowMajor;
     col = clblasColumnMajor;
-	/* 
-	//delete after test the large buffer allocation, Yibing	
-	long long global_mem_size_limit = 1024*1024; //4*1024*1024*1024;
-	global_mem_size_limit *= (long long)(0.0*1024.0);
-	//global_mem_size_limit = 16834887680/2;
-	long long available_global_mem_size = 1024*1024;
-        available_global_mem_size *= 20*1024; 
-	
-	long long global_mem_malloc_size_limit = 1024*1024;
-	while(available_global_mem_size > global_mem_size_limit){
-		long long size_;
-		if((available_global_mem_size - global_mem_size_limit) >= global_mem_malloc_size_limit){
-			size_ = global_mem_malloc_size_limit;
-		}else{
-			size_ = available_global_mem_size - global_mem_size_limit;
-		}
-		available_global_mem_size = available_global_mem_size - size_;
-		int *tmpData = (int *)malloc(size_);
-		cl_int err;
-                int i = 0;
-		test_alloc_mem[i] = clCreateBuffer(Context, CL_MEM_READ_WRITE, size_, NULL, &err);
-        	err = clEnqueueWriteBuffer(CommandQueue, test_alloc_mem[i], CL_TRUE, 0, size_, tmpData, 0, NULL, NULL);
-		i++;
-                device_mem_consumption += size_;
-                //printf("self alloc, device_mem_consumption = %lu\n", device_mem_consumption);
-		if(err != CL_SUCCESS) {
-                	printf("Large Buffer Allocation  failed! error_code = %d\n", err);
-                	printf("self alloc, device_mem_consumption = %llu\n", device_mem_consumption);
-                	exit(1);
-        	}
-                
-		cl_ulong free_mem_size, mem_size;
-                cl_int err1 = clGetDeviceInfo(pDevices[0],CL_DEVICE_GLOBAL_FREE_MEMORY_AMD,sizeof(cl_ulong),&free_mem_size,NULL);
-                cl_int err2 = clGetDeviceInfo(pDevices[0],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&mem_size,NULL);
-                //std::cout<<"free memory size after allocation = "<<free_mem_size<<",err_code ="<<err1<<std::endl;
-                //std::cout<<"global memory size = "<<mem_size<<",err_code ="<<err2<<std::endl;
-        	
-		free(tmpData);
-	}*/
-
     return 0;
 }
 
 void Device::BuildProgram(std::string kernel_dir)
-{  
+{ 
   //Access opencl kernel files
     std::string strSource = "";
     DIR *ocl_dir;
@@ -221,7 +150,6 @@ void Device::BuildProgram(std::string kernel_dir)
         ConvertToString(ocl_kernel_full_path.c_str(), tmpSource);
         strSource += tmpSource;
     }
-
     const char *pSource;
     pSource = strSource.c_str();
     size_t uiArrSourceSize[] = {0};
@@ -231,7 +159,6 @@ void Device::BuildProgram(std::string kernel_dir)
     if(NULL == Program){
         fprintf(stderr,"Err: Failed to create program\n");
     }
-
     //Build Program
     cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL);
     LOG(INFO) << "Build Program";
@@ -274,40 +201,10 @@ cl_int Device::ConvertToString(std::string pFileName,std::string &Str){
     return -1;
 }
 
-/*
-cl_program Device::BuildProgram(std::string pFileName)
-{
-      //Read our own kernel file
-    const char *pSource;
-    std::string strSource = "";
-    ConvertToString(pFileName, strSource);
-    pSource = strSource.c_str();
-    size_t uiArrSourceSize[] = {0};
-    uiArrSourceSize[0] = strlen(pSource);
-    cl_program program = NULL;
-    program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL);
-    if(NULL == program){
-        fprintf(stderr,"Err: Failed to create program\n");
-    }
-
-    //Build Program
-    cl_int iStatus = clBuildProgram(program, 1, pDevices, buildOption, NULL, NULL);
-    LOG(INFO) << "Build Program";
-    if(CL_SUCCESS != iStatus){
-        fprintf(stderr,"Err: Failed to build program\n");
-        char szBuildLog[16384];
-        clGetProgramBuildInfo(program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL);
-        std::cout << szBuildLog;
-        clReleaseProgram(program);
-        return NULL;
-    }
-  return program;
-}
-*/
 cl_kernel Device::GetKernel(std::string kernel_name)
 {
     std::map<std::string, cl_kernel>::iterator it = Kernels.find(kernel_name);
-    if(it == Kernels.end())
+    if (it == Kernels.end())
     {
         cl_int _err=0;
         cl_kernel kernel = clCreateKernel(Program,kernel_name.c_str(),&_err);
@@ -320,7 +217,7 @@ cl_kernel Device::GetKernel(std::string kernel_name)
 void Device::ReleaseKernels()
 {
     std::map<std::string, cl_kernel>::iterator it;
-    for(it = Kernels.begin(); it != Kernels.end(); it++)
+    for (it = Kernels.begin(); it != Kernels.end(); it++)
     {
         clReleaseKernel(it->second);
     }
@@ -331,7 +228,7 @@ void Device::DisplayPlatformInfo(){
    size_t size;
 
    err = clGetPlatformIDs (0, NULL, &numPlatforms);
-   if(err != CL_SUCCESS || numPlatforms <=0)
+   if (err != CL_SUCCESS || numPlatforms <=0)
    {
       LOG(ERROR) << "Failed to find any OpenCL platform.";
       return;
@@ -349,11 +246,11 @@ void Device::DisplayPlatformInfo(){
 
   //iterate through the list of platforms displaying platform information
   for (cl_uint i = 0; i < numPlatforms; i++ ){
-  DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME");
-  DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE");
-  DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION");
-  DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR");
-  DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS");
+      DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME");
+      DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE");
+      DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION");
+      DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR");
+      DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS");
   }
    
 }
@@ -388,37 +285,37 @@ void Device::GetDeviceInfo(){
     // we allow program run if no GPU is found. Just return. No error reported.
     if (numDevices < 1)
     {
-      LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0];
-      LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0];
-      return;
+        LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0];
+        LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0];
+        return;
     }
     
     DeviceIDs = (cl_device_id *) malloc (sizeof(cl_device_id) * numDevices);
     err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, DeviceIDs, NULL);
-    if(err != CL_SUCCESS)
+    if (err != CL_SUCCESS)
     {
-      LOG(INFO) << "Failed to find any GPU devices.";
-      return;
+        LOG(INFO) << "Failed to find any GPU devices.";
+        return;
     }
 
     LOG(INFO) << "Number of devices found:" << numDevices;
-    for(cl_uint i = 0; i < numDevices; i++){
-    LOG(INFO) << "\t" << "DeviceID" << ":\t" <<DeviceIDs[i];
-    DisplayDeviceInfo<cl_device_type>(DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
-    DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
-    DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
-    DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
-    DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
-    DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
-    DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
-    DisplayDeviceInfo<size_t>(DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
-    DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
-    DisplayDeviceInfo<size_t *>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "Max work item sizes");
-    DisplayDeviceInfo<cl_command_queue_properties>(DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
-    DisplayDeviceInfo<cl_device_exec_capabilities>(DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
-    DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
-    DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
-    DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
+    for (cl_uint i = 0; i < numDevices; i++) {
+        LOG(INFO) << "\t" << "DeviceID" << ":\t" <<DeviceIDs[i];
+        DisplayDeviceInfo<cl_device_type>(DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
+        DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
+        DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
+        DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
+        DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
+        DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
+        DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
+        DisplayDeviceInfo<size_t>(DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
+        DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
+        DisplayDeviceInfo<size_t *>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "Max work item sizes");
+        DisplayDeviceInfo<cl_command_queue_properties>(DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
+        DisplayDeviceInfo<cl_device_exec_capabilities>(DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
+        DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
+        DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
+        DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
     }
     
     
@@ -435,7 +332,7 @@ void Device::DeviceQuery()
 
     size_t nameLen;
     cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen);
-    if(res != CL_SUCCESS){
+    if (res != CL_SUCCESS) {
         fprintf(stderr, "Err: Failed to Get Platform Info\n", res);
         return;
     }
@@ -466,8 +363,7 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string
    }
 
 
-   switch(name)
-{
+   switch(name){
     case CL_DEVICE_TYPE:
     {
         std::string deviceType;
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 8f7d8f82..4a85dd74 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -68,19 +68,19 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     }
   }
   //CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
-  //CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-  //CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff");
-
+ // CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff");
+//  CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff[0]");
+ // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const  vector<Blob<Dtype>*>& top) {
   if (use_packing_scheme && global_packing_N >1)
-   Forward_gpu_opt(bottom, top);
+      Forward_gpu_opt(bottom, top);
   else
-   Forward_gpu_org(bottom, top);
-// CHECK_BLOB_DATA(top[0],20, "top[0]");
+      Forward_gpu_org(bottom, top);
+ // CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -90,6 +90,12 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       Backward_gpu_opt(top, propagate_down, bottom);
     else
       Backward_gpu_org(top, propagate_down, bottom);
+//  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
+ // CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
+//  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
+ // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
+
+
 }
 
 template <typename Dtype>
@@ -131,9 +137,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
    }
   }
 
-  CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
-  CHECK_BLOB_DATA(top[0],20, "top[0]");
-
 }
 
 template <typename Dtype>
@@ -159,8 +162,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
     }
   }
 
-  // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
- // CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -212,12 +213,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       }
     }
   }
-
-  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
-  CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
-  CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
+
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

From b804a1d9313c7c66c5b9955f561858a717308fea Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Tue, 1 Sep 2015 14:50:21 +0800
Subject: [PATCH 048/124] Fixed conv layers opt2 bug

---
 include/caffe/common.hpp             |   2 +-
 include/caffe/vision_layers.hpp      |   8 ++-
 src/caffe/layers/base_conv_layer.cpp | 101 ++++++++-------------------
 src/caffe/layers/conv_layer.cpp      |  43 ++++--------
 4 files changed, 49 insertions(+), 105 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index ac954a0e..c5bf909d 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -81,7 +81,7 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-#define use_packing_scheme 1
+#define use_packing_scheme 1 
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
 #define global_packing_N 16
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 237e9cbf..2f2d7eef 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -105,6 +105,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
     col2im_gpu(col_buff, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_);
   }
+ protected:
   inline void conv_im2col_gpu_opt(const Dtype* data) {
      im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
            kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2);
@@ -113,11 +114,12 @@ class BaseConvolutionLayer : public Layer<Dtype> {
     col2im_gpu_opt((Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
         kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
 }
+ private:
   inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
-    transform_gpu((Dtype*)temp_buffer, top_data, top_offset_n, N_, M_*opt_num2, opt_num2);
+    transform_gpu((Dtype*)temp_buffer, top_data, top_offset_, N_, M_*opt_num2, opt_num2);
 }
  inline void conv_transpose_gpu(const Dtype* data){
-    opttrans(data, top_offset_n, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
+    opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
 }
 protected:
   inline void gpu_memset(Dtype* data, Dtype value, int count) {
@@ -147,7 +149,7 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   int weight_offset_;
   int col_offset_;
   int output_offset_;
-  int top_offset_, top_offset_n, bottom_offset_;
+  int top_offset_, top_offset_opt, bottom_offset_;
 public:
   static cl_mem subTopMem, transMem;
   static size_t subtop_mem_size, trans_mem_size;
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 1c1379b3..faa7b63c 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -33,19 +33,9 @@ void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size)
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::ocl_setup() {
-/*  im2col_gpu_kernel = clCreateKernel(amdDevice.Program,"im2col_gpu_float_kernel", NULL);
-  col2im_gpu_kernel = clCreateKernel(amdDevice.Program,"col2im_gpu_float_kernel", NULL);
-  oclmem_kernel = clCreateKernel(amdDevice.Program, "oclmemfloat", NULL);
-  im2col_opt_kernel = clCreateKernel(amdDevice.Program, "im2col_optfloat", NULL);
-  col2im_opt_kernel = clCreateKernel(amdDevice.Program, "col2im_optfloat", NULL);
-  opttrans_kernel = clCreateKernel(amdDevice.Program, "opttransfloat", NULL);
-  ocl_Kernel_transpose = clCreateKernel(amdDevice.Program,"transposefloat",NULL);
-  ocl_Kernel_transform = clCreateKernel(amdDevice.Program,"transformfloat",NULL);
-*/
-  M_ = conv_out_channels_ / group_;
-  K_ = kernel_dim_ / group_;
-  N_ =  conv_out_spatial_dim_;
-
+  M_ = num_output_ / group_;
+  K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_;
+  N_ = height_out_ * width_out_;
 #ifdef use_packing_scheme
   size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
   size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
@@ -56,15 +46,6 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
 
 template <typename Dtype>
  BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer(){
- /*
-  OCL_CHECK( clReleaseKernel(im2col_gpu_kernel) );
-  OCL_CHECK( clReleaseKernel(col2im_gpu_kernel) );
-  OCL_CHECK( clReleaseKernel(oclmem_kernel) );
-  OCL_CHECK( clReleaseKernel(ocl_Kernel_transpose) );
-  OCL_CHECK( clReleaseKernel(ocl_Kernel_transform) );
-  OCL_CHECK( clReleaseKernel(im2col_opt_kernel) );
-  OCL_CHECK( clReleaseKernel(col2im_opt_kernel) );
-*/
 }
 
 
@@ -314,9 +295,10 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
   cl_event prof_event;
   if (!is_1x1_) {
     if (!skip_im2col) {
-      conv_im2col_gpu_opt(input);
+      //conv_im2col_gpu_opt(input);
+      im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_,
+                 (Dtype*)transMem, 0, opt_num2);
     }   
-    //col_buff = col_buffer_.gpu_data();
   }
 #ifdef multiQ
     for (int g = 0; g < group_; ++g) {
@@ -324,7 +306,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
        else Queue =  amdDevice.CommandQueue_helper;
        prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset_ * g);
+          (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
        }
      if(group_ == 2){
        clFinish(amdDevice.CommandQueue);
@@ -335,10 +317,11 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
     for (int g = 0; g < group_; ++g) {
        prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset_ * g);
+          (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
        }
 #endif
-   conv_transform_gpu((Dtype*)subTopMem, output);
+   //conv_transform_gpu((Dtype*)subTopMem, output);
+   transform_gpu((Dtype*)subTopMem, output, top_offset_, N_, M_*group_, opt_num2);
 }
 
 
@@ -358,7 +341,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_bias_opt(Dtype* output,
       caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
           N_, 1, (Dtype)1., bias, 0,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-          (Dtype)1., output, top_offset_n + num_output_ * N_ * z);
+          (Dtype)1., output, top_offset_ + num_output_ * N_ * z);
 }
 
 template <typename Dtype>
@@ -371,7 +354,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
   for (int g = 0; g < group_; ++g) {
         caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
           (Dtype)1., weights,  weight_offset_ * g,
-          output, top_offset_+output_offset_ * g,
+          output, top_offset_ + output_offset_ * g,
           (Dtype)0., col_buff, col_offset_ * g);
   }
   if (!is_1x1_) {
@@ -382,7 +365,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
     const Dtype* weights, Dtype* input) {
-  //Dtype* col_buff = col_buffer_.mutable_gpu_data();
   cl_command_queue Queue;
   if (is_1x1_) {
     int count = height_ * width_ * conv_in_channels_ * opt_num2;
@@ -395,9 +377,9 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 #else
        Queue =  amdDevice.CommandQueue;
 #endif
-       caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2, conv_out_channels_ / group_,
+       caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_,
           (Dtype)1., weights,  weight_offset_ * g,
-          (Dtype*)subTopMem, top_offset_ * g,
+          (Dtype*)subTopMem, top_offset_opt * g,
           (Dtype)0., (Dtype*)transMem, col_offset_ * g);
       }
 #ifdef multiQ
@@ -408,8 +390,10 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 #endif
 
   if (!is_1x1_) {
-      conv_col2im_gpu_opt(input);
-  }
+      //conv_col2im_gpu_opt(input);
+      col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
+                  stride_w_, input, bottom_offset_, opt_num2);
+   }
 }
 
 template <typename Dtype>
@@ -433,10 +417,14 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
   const Dtype* col_buff = input;
   cl_command_queue Queue;
   if (!is_1x1_) {
-    conv_im2col_gpu_opt(input);
-    //col_buff = col_buffer_.gpu_data();
+    //conv_im2col_gpu_opt(input);
+   im2col_gpu_opt(input, bottom_offset_, channels_, height_,
+                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
   }
-    conv_transpose_gpu(output);
+    //conv_transpose_gpu(output);
+    int height_top = M_ * group_, width_top = N_;
+    opttrans(output, top_offset_, 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
+
 
   for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
@@ -445,8 +433,8 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
 #else
        Queue =  amdDevice.CommandQueue;
 #endif
-       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_ * opt_num2,
-        (Dtype)1., (Dtype*)subTopMem, top_offset_ * g,
+       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
+        (Dtype)1., (Dtype*)subTopMem, top_offset_opt * g,
         (Dtype*)transMem, col_offset_ * g, (Dtype)1.,
         (Dtype*)weights, weight_offset_ * g);
 #ifdef multiQ
@@ -461,10 +449,8 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
     const Dtype* input) {
- /* caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
-      input, bias_multiplier_.gpu_data(), 1., bias);*/
-      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_*width_out_,
-          (Dtype)1., input, top_offset_, height_out_*width_out_,
+      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_, 
+          (Dtype)1., input, top_offset_, N_,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
           bias, (size_t)0, 1);
 }
@@ -475,12 +461,9 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
 
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
-     //CHECK_BLOB_DATA(bottom[i],10,"bottom");
     Dtype* top_data = top[i]->mutable_gpu_data();
 
   Dtype* col_data = col_buffer_.mutable_gpu_data();
-  /*in the packing schme, M, K stay the same. N multiplies by opt_num becomes much bigger N'. 
-   N' is the M in sgemm call.*/
   int M_org = M_ * group_;
   int col_offset = K_ * N_;
   int top_offset = M_ * N_;
@@ -488,19 +471,13 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
   int opt_num2 = global_packing_N;
   cl_command_queue Queue;
   cl_event prof_event;
-  //LOG(INFO) << "conv_fp optimized scheme";
   for (int n = 0; n < num_; n += opt_num2) {
     opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
-    /*col_offset is the offset for sgemm, including packing and groups
-    for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/
     top_offset = M_ * N_ * opt_num2;
     col_offset = K_ * N_ * opt_num2;
-    //step1: packed im2col, col_size = (K_ * group_ ) * N_
-    //this should be opt_num2 images packing together.
     im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
                        width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
 
-    //step 2: sgemm: Top (subTopMem) = weight * col_data
 #ifdef multiQ
     for (int g = 0; g < group_; ++g) {
        if(g == 0) Queue = amdDevice.CommandQueue;
@@ -521,10 +498,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
           (Dtype)0., (Dtype*)subTopMem, top_offset * g);
        }
 #endif
-    //step 3: tranform
     transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2);
-    //step 4: add bias
-    /*note: this sgemm has to use num_output_ instead of M, because M = M /group, in setup*/
 
    for (int z = 0; z < opt_num2; z++)
       if (bias_term_) {
@@ -551,7 +525,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
       ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
     for (int n = 0; n < num_; ++n) {
-      caffe_gpu_gemv<Dtype>(CblasNoTrans, M_, N_,
+      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
           (Dtype)1., top_diff, top[i]->offset(n), N_,
           reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
           bias_diff, (size_t)0, 1);
@@ -570,25 +544,17 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
   int g = 0;
   cl_command_queue Queue;
   cl_event prof_event;
-  //LOG(INFO) << "conv_bp optimized scheme";
 
   for (int n = 0; n < num_; n += opt_num2) {
     opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
-    /*col_offset is the offset for sgemm, including packing and groups
-    for the last loop, may not be 16. for correctness, col_offset, weight_offset, top_offset will all be different*/
     top_offset = M_ * (N_ * opt_num2);
     col_offset = K_ * (N_ * opt_num2);
-    //step1: packed im2col, col_size = (K_ * group_ ) * N_
-    //this should be opt_num2 images packing together.
     im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
                        width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
 
-    //step 2: transform top[n] into shoulder by shoulder, right now i cheated by just copying the data over. without re-organize
     int height_top = M_ * group_, width_top = N_;
-    //if (opt_num2 >1)
     opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
 
-    //step 3: sgemm: Top (subTopMem) = weight * col_data
     for(g = 0; g < group_; ++g) {
 #ifdef multiQ
        if(g == 0) Queue = amdDevice.CommandQueue;
@@ -602,7 +568,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
         (Dtype*)weight_diff, weight_offset * g);
     }
 
-   //step4:
    if (propagate_down[i]) {
       for (g = 0; g < group_; ++g) {
 #ifdef multiQ
@@ -624,14 +589,8 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
       clFinish(amdDevice.CommandQueue_helper);
     }
 #endif
-
-    //step5: col2im
        col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
                   stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2);
-#ifdef Track_layer
-    LOG(WARNING) << "conv bp done";
-#endif
-
    }
   }
  }
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 4a85dd74..4f59260a 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -1,5 +1,4 @@
 #include <vector>
-
 #include "caffe/filler.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/util/im2col.hpp"
@@ -33,7 +32,7 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     }
   }
 
-//  CHECK_BLOB_DATA(top[0],20, "top[0]");
+ // CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
@@ -67,10 +66,6 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       }
     }
   }
-  //CHECK_CPU_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
- // CHECK_CPU_MEM_DATA(bottom[0]->mutable_cpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-//  CHECK_CPU_MEM_DATA(top[0]->cpu_diff(), top[0]->count(), 20, "top_diff[0]");
- // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 template <typename Dtype>
@@ -79,8 +74,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   if (use_packing_scheme && global_packing_N >1)
       Forward_gpu_opt(bottom, top);
   else
-      Forward_gpu_org(bottom, top);
- // CHECK_BLOB_DATA(top[0],20, "top[0]");
+   Forward_gpu_org(bottom, top);
 }
 
 template <typename Dtype>
@@ -103,11 +97,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom
       const vector<Blob<Dtype>*>& top) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   this->forward_gpu_opt(bottom, weight, top);
-
-#ifdef Track_layer
-  LOG(WARNING) << "conv fp done";
-#endif
-
 }
 
 template <typename Dtype>
@@ -120,14 +109,14 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
 
     Dtype* top_data = top[i]->mutable_gpu_data();
     this->opt_num2 = global_packing_N;
+    this->weight_offset_ = this->M_ * this->K_;
     for (int n = 0; n < this->num_; n += this->opt_num2) {
       this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
        //intermediate variables to pass offset
-      this->top_offset_ = this->M_ * this->N_ * this->opt_num2;
-      this->top_offset_n = top[i]->offset(n);
+      this->top_offset_opt = this->M_ * this->N_ * this->opt_num2;
+      this->top_offset_ = top[i]->offset(n);
       this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
       this->bottom_offset_ = bottom[i]->offset(n);
-      this->weight_offset_ = this->M_ * this->K_;
       this->forward_gpu_gemm_opt(bottom_data, weight,
             top_data);
       if (this->bias_term_) {
@@ -136,7 +125,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
       }
    }
   }
-
 }
 
 template <typename Dtype>
@@ -162,8 +150,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
     }
   }
 
-}
-
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
@@ -181,30 +167,31 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
     // Bias gradient, if necessary.
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      this->gpu_memset(bias_diff, 0., this->blobs_[1]->count());
+      ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
       for (int n = 0; n < this->num_; ++n) {
-       //
         this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
         this->backward_gpu_bias(bias_diff, top_diff);
       }
-    }
+     }
     if (this->param_propagate_down_[0] || propagate_down[i]) {
       const Dtype* bottom_data = bottom[i]->gpu_data();
       Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
       this->weight_offset_ = this->M_ * this->K_;
       this->opt_num2 = global_packing_N;
-      for (int n = 0; n < this->num_; ++n) {
+      for (int n = 0; n < this->num_; n += this->opt_num2) {
         this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
-        this->top_offset_n = top[i]->offset(n);
+        this->top_offset_ = top[i]->offset(n);
         this->bottom_offset_ = bottom[i]->offset(n);
         this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
-        this->top_offset_ = this->M_ * (this->N_ * this->opt_num2);
+        this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
           this->weight_gpu_gemm_opt(bottom_data,
               top_diff, weight_diff);
         }
+        this->bottom_offset_ = bottom[i]->offset(n);
+        this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
+        this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
           this->backward_gpu_gemm_opt(top_diff, weight,
@@ -253,10 +240,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
     }
   }
   
-//  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");  
-//  CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-//  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
- // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
 }
 
 #ifdef CPU_ONLY

From 79e246a971eacf335b1ba08fc3b71af3244c01cb Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Tue, 1 Sep 2015 15:29:44 +0800
Subject: [PATCH 049/124] conv clean up

---
 src/caffe/layers/base_conv_layer.cpp | 23 +++++++++--------------
 src/caffe/layers/conv_layer.cpp      |  5 +----
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index faa7b63c..6071c49b 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -295,9 +295,9 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
   cl_event prof_event;
   if (!is_1x1_) {
     if (!skip_im2col) {
-      //conv_im2col_gpu_opt(input);
-      im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_,
-                 (Dtype*)transMem, 0, opt_num2);
+      conv_im2col_gpu_opt(input);
+     // im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_,
+        //         (Dtype*)transMem, 0, opt_num2);
     }   
   }
 #ifdef multiQ
@@ -390,9 +390,9 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 #endif
 
   if (!is_1x1_) {
-      //conv_col2im_gpu_opt(input);
-      col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
-                  stride_w_, input, bottom_offset_, opt_num2);
+      conv_col2im_gpu_opt(input);
+     // col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
+       //           stride_w_, input, bottom_offset_, opt_num2);
    }
 }
 
@@ -414,12 +414,11 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
     const Dtype* output, Dtype* weights) {
-  const Dtype* col_buff = input;
   cl_command_queue Queue;
   if (!is_1x1_) {
-    //conv_im2col_gpu_opt(input);
-   im2col_gpu_opt(input, bottom_offset_, channels_, height_,
-                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
+    conv_im2col_gpu_opt(input);
+   //im2col_gpu_opt(input, bottom_offset_, channels_, height_,
+     //                  width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
   }
     //conv_transpose_gpu(output);
     int height_top = M_ * group_, width_top = N_;
@@ -462,8 +461,6 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bo
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();
-
-  Dtype* col_data = col_buffer_.mutable_gpu_data();
   int M_org = M_ * group_;
   int col_offset = K_ * N_;
   int top_offset = M_ * N_;
@@ -535,8 +532,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& t
  if (this->param_propagate_down_[0] || propagate_down[i]) {
   const Dtype* bottom_data = bottom[i]->gpu_data();
   Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-  Dtype* col_data = col_buffer_.mutable_gpu_data();
-  Dtype* col_diff = col_buffer_.mutable_gpu_diff();
   int col_offset = K_ * N_;
   int top_offset = M_ * N_;
   int weight_offset = M_ * K_;
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 4f59260a..0c3a1367 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -81,7 +81,7 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
     if (use_packing_scheme && global_packing_N >1)
-      Backward_gpu_opt(top, propagate_down, bottom);
+      Backward_gpu_opt2(top, propagate_down, bottom);
     else
       Backward_gpu_org(top, propagate_down, bottom);
 //  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
@@ -189,9 +189,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
           this->weight_gpu_gemm_opt(bottom_data,
               top_diff, weight_diff);
         }
-        this->bottom_offset_ = bottom[i]->offset(n);
-        this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
-        this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
           this->backward_gpu_gemm_opt(top_diff, weight,

From 1958793cdb0513337a25b24e273bddf5acad33c2 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Tue, 1 Sep 2015 22:52:23 +0800
Subject: [PATCH 050/124] removed all cuDNN files

---
 src/caffe/layers/base_conv_layer.cpp          |   6 -
 src/caffe/layers/cudnn_conv_layer.cpp         | 130 ------
 src/caffe/layers/cudnn_pooling_layer.cpp      |  50 ---
 src/caffe/layers/cudnn_relu_layer.cpp         |  46 ---
 src/caffe/layers/cudnn_sigmoid_layer.cpp      |  46 ---
 src/caffe/layers/cudnn_softmax_layer.cpp      |  50 ---
 src/caffe/layers/cudnn_tanh_layer.cpp         |  46 ---
 src/caffe/layers/cufiles/absval_layer.cu      |  33 --
 src/caffe/layers/cufiles/base_data_layer.cu   |  30 --
 src/caffe/layers/cufiles/bnll_layer.cu        |  60 ---
 src/caffe/layers/cufiles/concat_layer.cu      |  71 ----
 .../layers/cufiles/contrastive_loss_layer.cu  | 111 -----
 src/caffe/layers/cufiles/conv_layer.cu        |  64 ---
 src/caffe/layers/cufiles/cudnn_conv_layer.cu  | 160 --------
 .../layers/cufiles/cudnn_pooling_layer.cu     |  45 --
 src/caffe/layers/cufiles/cudnn_relu_layer.cu  |  57 ---
 .../layers/cufiles/cudnn_sigmoid_layer.cu     |  47 ---
 .../layers/cufiles/cudnn_softmax_layer.cu     |  48 ---
 src/caffe/layers/cufiles/cudnn_tanh_layer.cu  |  48 ---
 src/caffe/layers/cufiles/deconv_layer.cu      |  64 ---
 src/caffe/layers/cufiles/dropout_layer.cu     |  77 ----
 src/caffe/layers/cufiles/eltwise_layer.cu     | 135 ------
 .../layers/cufiles/euclidean_loss_layer.cu    |  44 --
 src/caffe/layers/cufiles/exp_layer.cu         |  44 --
 src/caffe/layers/cufiles/filter_layer.cu      |  70 ----
 src/caffe/layers/cufiles/hdf5_data_layer.cu   |  53 ---
 src/caffe/layers/cufiles/hdf5_output_layer.cu |  43 --
 src/caffe/layers/cufiles/im2col_layer.cu      |  37 --
 .../layers/cufiles/inner_product_layer.cu     |  57 ---
 src/caffe/layers/cufiles/log_layer.cu         |  57 ---
 src/caffe/layers/cufiles/lrn_layer.cu         | 203 ---------
 src/caffe/layers/cufiles/mvn_layer.cu         | 124 ------
 src/caffe/layers/cufiles/pooling_layer.cu     | 387 ------------------
 src/caffe/layers/cufiles/power_layer.cu       |  87 ----
 src/caffe/layers/cufiles/prelu_layer.cu       | 128 ------
 src/caffe/layers/cufiles/reduction_layer.cu   |  93 -----
 src/caffe/layers/cufiles/relu_layer.cu        |  65 ---
 .../sigmoid_cross_entropy_loss_layer.cu       |  37 --
 src/caffe/layers/cufiles/sigmoid_layer.cu     |  62 ---
 src/caffe/layers/cufiles/silence_layer.cu     |  28 --
 src/caffe/layers/cufiles/slice_layer.cu       |  71 ----
 src/caffe/layers/cufiles/softmax_layer.cu     | 149 -------
 .../layers/cufiles/softmax_loss_layer.cu      | 125 ------
 src/caffe/layers/cufiles/split_layer.cu       |  38 --
 src/caffe/layers/cufiles/tanh_layer.cu        |  59 ---
 src/caffe/layers/cufiles/threshold_layer.cu   |  33 --
 46 files changed, 3518 deletions(-)
 delete mode 100644 src/caffe/layers/cudnn_conv_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_pooling_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_relu_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_sigmoid_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_softmax_layer.cpp
 delete mode 100644 src/caffe/layers/cudnn_tanh_layer.cpp
 delete mode 100644 src/caffe/layers/cufiles/absval_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/base_data_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/bnll_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/concat_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/contrastive_loss_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/conv_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_conv_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_pooling_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_relu_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_softmax_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/cudnn_tanh_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/deconv_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/dropout_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/eltwise_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/euclidean_loss_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/exp_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/filter_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/hdf5_data_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/hdf5_output_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/im2col_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/inner_product_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/log_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/lrn_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/mvn_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/pooling_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/power_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/prelu_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/reduction_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/relu_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/sigmoid_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/silence_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/slice_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/softmax_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/softmax_loss_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/split_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/tanh_layer.cu
 delete mode 100644 src/caffe/layers/cufiles/threshold_layer.cu

diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 6071c49b..19458185 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -296,8 +296,6 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
   if (!is_1x1_) {
     if (!skip_im2col) {
       conv_im2col_gpu_opt(input);
-     // im2col_gpu_opt(input, bottom_offset_, channels_, height_, width_, kernel_w_, pad_w_, stride_w_,
-        //         (Dtype*)transMem, 0, opt_num2);
     }   
   }
 #ifdef multiQ
@@ -391,8 +389,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 
   if (!is_1x1_) {
       conv_col2im_gpu_opt(input);
-     // col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
-       //           stride_w_, input, bottom_offset_, opt_num2);
    }
 }
 
@@ -417,8 +413,6 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
   cl_command_queue Queue;
   if (!is_1x1_) {
     conv_im2col_gpu_opt(input);
-   //im2col_gpu_opt(input, bottom_offset_, channels_, height_,
-     //                  width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
   }
     //conv_transpose_gpu(output);
     int height_top = M_ * group_, width_top = N_;
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
deleted file mode 100644
index 104d2b9d..00000000
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-// Set to three for the benefit of the backward pass, which
-// can use separate streams for calculating the gradient w.r.t.
-// bias, filter weights, and bottom data for each group independently
-#define CUDNN_STREAMS_PER_GROUP 3
-
-/**
- * TODO(dox) explain cuDNN interface
- */
-template <typename Dtype>
-void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
-  // Initialize CUDA streams and cuDNN.
-  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
-  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
-  workspaceSizeInBytes = 0;
-  workspace = NULL;
-
-  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
-    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
-    CUDNN_CHECK(cudnnCreate(&handle_[g]));
-    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
-  }
-
-  // Set the indexing parameters.
-  weight_offset_ = (this->num_output_ / this->group_)
-      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
-  bias_offset_ = (this->num_output_ / this->group_);
-
-  // Create filter descriptor.
-  cudnn::createFilterDesc<Dtype>(&filter_desc_,
-      this->num_output_ / this->group_, this->channels_ / this->group_,
-      this->kernel_h_, this->kernel_w_);
-
-  // Create tensor descriptor(s) for data and corresponding convolution(s).
-  for (int i = 0; i < bottom.size(); i++) {
-    cudnnTensorDescriptor_t bottom_desc;
-    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
-    bottom_descs_.push_back(bottom_desc);
-    cudnnTensorDescriptor_t top_desc;
-    cudnn::createTensor4dDesc<Dtype>(&top_desc);
-    top_descs_.push_back(top_desc);
-    cudnnConvolutionDescriptor_t conv_desc;
-    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
-    conv_descs_.push_back(conv_desc);
-  }
-
-  // Tensor descriptor for bias.
-  if (this->bias_term_) {
-    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
-  }
-
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNConvolutionLayer<Dtype>::Reshape(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  ConvolutionLayer<Dtype>::Reshape(bottom, top);
-  bottom_offset_ = (this->channels_ / this->group_)
-      * this->height_ * this->width_;
-  top_offset_ = (this->num_output_ / this->group_)
-      * this->height_out_ * this->width_out_;
-
-  for (int i = 0; i < bottom.size(); i++) {
-    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
-        this->num_,
-        this->channels_ / this->group_,
-        this->height_, this->width_,
-        this->channels_ * this->height_ * this->width_,
-        this->height_ * this->width_,
-        this->width_, 1);
-    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
-        this->num_,
-        this->num_output_ / this->group_,
-        this->height_out_, this->width_out_,
-        this->num_output_ * this->height_out_ * this->width_out_,
-        this->height_out_ * this->width_out_,
-        this->width_out_, 1);
-    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
-        filter_desc_, this->pad_h_, this->pad_w_,
-        this->stride_h_, this->stride_w_);
-  }
-
-  // Tensor descriptor for bias.
-  if (this->bias_term_) {
-    cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
-        1, this->num_output_ / this->group_, 1, 1);
-  }
-}
-
-template <typename Dtype>
-CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  for (int i = 0; i < bottom_descs_.size(); i++) {
-    cudnnDestroyTensorDescriptor(bottom_descs_[i]);
-    cudnnDestroyTensorDescriptor(top_descs_[i]);
-    cudnnDestroyConvolutionDescriptor(conv_descs_[i]);
-  }
-  if (this->bias_term_) {
-    cudnnDestroyTensorDescriptor(bias_desc_);
-  }
-  cudnnDestroyFilterDescriptor(filter_desc_);
-
-  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
-    cudaStreamDestroy(stream_[g]);
-    cudnnDestroy(handle_[g]);
-  }
-
-  delete [] stream_;
-  delete [] handle_;
-}
-
-INSTANTIATE_CLASS(CuDNNConvolutionLayer);
-
-}   // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp
deleted file mode 100644
index c92c4e47..00000000
--- a/src/caffe/layers/cudnn_pooling_layer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  PoolingLayer<Dtype>::LayerSetUp(bottom, top);
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  cudnn::createPoolingDesc<Dtype>(&pooling_desc_,
-      this->layer_param_.pooling_param().pool(), &mode_,
-      this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_,
-      this->stride_h_, this->stride_w_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  PoolingLayer<Dtype>::Reshape(bottom, top);
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, bottom[0]->num(),
-      this->channels_, this->height_, this->width_);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, bottom[0]->num(),
-      this->channels_, this->pooled_height_, this->pooled_width_);
-}
-
-template <typename Dtype>
-CuDNNPoolingLayer<Dtype>::~CuDNNPoolingLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(bottom_desc_);
-  cudnnDestroyTensorDescriptor(top_desc_);
-  cudnnDestroyPoolingDescriptor(pooling_desc_);
-  cudnnDestroy(handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNPoolingLayer);
-
-}   // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp
deleted file mode 100644
index 759d8398..00000000
--- a/src/caffe/layers/cudnn_relu_layer.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  ReLULayer<Dtype>::LayerSetUp(bottom, top);
-  // initialize cuDNN
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  ReLULayer<Dtype>::Reshape(bottom, top);
-  const int N = bottom[0]->num();
-  const int K = bottom[0]->channels();
-  const int H = bottom[0]->height();
-  const int W = bottom[0]->width();
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNReLULayer<Dtype>::~CuDNNReLULayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(this->bottom_desc_);
-  cudnnDestroyTensorDescriptor(this->top_desc_);
-  cudnnDestroy(this->handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNReLULayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp
deleted file mode 100644
index 32637873..00000000
--- a/src/caffe/layers/cudnn_sigmoid_layer.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SigmoidLayer<Dtype>::LayerSetUp(bottom, top);
-  // initialize cuDNN
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SigmoidLayer<Dtype>::Reshape(bottom, top);
-  const int N = bottom[0]->num();
-  const int K = bottom[0]->channels();
-  const int H = bottom[0]->height();
-  const int W = bottom[0]->width();
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNSigmoidLayer<Dtype>::~CuDNNSigmoidLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(this->bottom_desc_);
-  cudnnDestroyTensorDescriptor(this->top_desc_);
-  cudnnDestroy(this->handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNSigmoidLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp
deleted file mode 100644
index 77a3225a..00000000
--- a/src/caffe/layers/cudnn_softmax_layer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "thrust/device_vector.h"
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);
-  // Initialize CUDNN.
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SoftmaxLayer<Dtype>::Reshape(bottom, top);
-  int N = this->outer_num_;
-  int K = bottom[0]->shape(this->softmax_axis_);
-  int H = this->inner_num_;
-  int W = 1;
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNSoftmaxLayer<Dtype>::~CuDNNSoftmaxLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(bottom_desc_);
-  cudnnDestroyTensorDescriptor(top_desc_);
-  cudnnDestroy(handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNSoftmaxLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp
deleted file mode 100644
index 376faad3..00000000
--- a/src/caffe/layers/cudnn_tanh_layer.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  TanHLayer<Dtype>::LayerSetUp(bottom, top);
-  // initialize cuDNN
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNTanHLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  TanHLayer<Dtype>::Reshape(bottom, top);
-  const int N = bottom[0]->num();
-  const int K = bottom[0]->channels();
-  const int H = bottom[0]->height();
-  const int W = bottom[0]->width();
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNTanHLayer<Dtype>::~CuDNNTanHLayer() {
-  // Check that handles have been setup before destroying.
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(this->bottom_desc_);
-  cudnnDestroyTensorDescriptor(this->top_desc_);
-  cudnnDestroy(this->handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNTanHLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/absval_layer.cu b/src/caffe/layers/cufiles/absval_layer.cu
deleted file mode 100644
index bb310e1a..00000000
--- a/src/caffe/layers/cufiles/absval_layer.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void AbsValLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const int count = top[0]->count();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
-}
-
-template <typename Dtype>
-void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const int count = top[0]->count();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_gpu_sign(count, bottom_data, bottom_diff);
-    caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(AbsValLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/base_data_layer.cu b/src/caffe/layers/cufiles/base_data_layer.cu
deleted file mode 100644
index 9335a5bc..00000000
--- a/src/caffe/layers/cufiles/base_data_layer.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-#include <vector>
-
-#include "caffe/data_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, join the thread
-  JoinPrefetchThread();
-  // Reshape to loaded data.
-  top[0]->ReshapeLike(this->prefetch_data_);
-  // Copy the data
-  caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
-      top[0]->mutable_gpu_data());
-  if (this->output_labels_) {
-    // Reshape to loaded labels.
-    top[1]->ReshapeLike(prefetch_label_);
-    // Copy the labels.
-    caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
-        top[1]->mutable_gpu_data());
-  }
-  // Start a new prefetch thread
-  CreatePrefetchThread();
-}
-
-INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/bnll_layer.cu b/src/caffe/layers/cufiles/bnll_layer.cu
deleted file mode 100644
index d963d068..00000000
--- a/src/caffe/layers/cufiles/bnll_layer.cu
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-const float kBNLL_THRESHOLD = 50.;
-
-template <typename Dtype>
-__global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] > 0 ?
-        in[index] + log(1. + exp(-in[index])) :
-        log(1. + exp(in[index]));
-  }
-}
-
-template <typename Dtype>
-void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  BNLLForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-template <typename Dtype>
-__global__ void BNLLBackward(const int n, const Dtype* in_diff,
-    const Dtype* in_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    Dtype expval = exp(min(in_data[index], Dtype(kBNLL_THRESHOLD)));
-    out_diff[index] = in_diff[index] * expval / (expval + 1.);
-  }
-}
-
-template <typename Dtype>
-void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    BNLLBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, bottom_data, bottom_diff);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(BNLLLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/concat_layer.cu b/src/caffe/layers/cufiles/concat_layer.cu
deleted file mode 100644
index 8f2e85d8..00000000
--- a/src/caffe/layers/cufiles/concat_layer.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void Concat(const int nthreads, const Dtype* in_data,
-    const bool forward, const int num_concats, const int concat_size,
-    const int top_concat_axis, const int bottom_concat_axis,
-    const int offset_concat_axis, Dtype* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int total_concat_size = concat_size * bottom_concat_axis;
-    const int concat_num = index / total_concat_size;
-    const int concat_index = index % total_concat_size;
-    const int top_index = concat_index +
-        (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
-    if (forward) {
-      out_data[top_index] = in_data[index];
-    } else {
-      out_data[index] = in_data[top_index];
-    }
-  }
-}
-
-template <typename Dtype>
-void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int offset_concat_axis = 0;
-  const int top_concat_axis = top[0]->shape(concat_axis_);
-  const bool kForward = true;
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-    const int nthreads = bottom_concat_size * num_concats_;
-    Concat<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, bottom_data, kForward, num_concats_, concat_input_size_,
-        top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
-    offset_concat_axis += bottom_concat_axis;
-  }
-}
-
-template <typename Dtype>
-void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  int offset_concat_axis = 0;
-  const int top_concat_axis = top[0]->shape(concat_axis_);
-  const bool kForward = false;
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (!propagate_down[i]) { continue; }
-    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-    const int nthreads = bottom_concat_size * num_concats_;
-    Concat<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, top_diff, kForward, num_concats_, concat_input_size_,
-        top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
-    offset_concat_axis += bottom_concat_axis;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ConcatLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/contrastive_loss_layer.cu b/src/caffe/layers/cufiles/contrastive_loss_layer.cu
deleted file mode 100644
index 93123931..00000000
--- a/src/caffe/layers/cufiles/contrastive_loss_layer.cu
+++ /dev/null
@@ -1,111 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ContrastiveLossLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  caffe_gpu_sub(
-      count,
-      bottom[0]->gpu_data(),  // a
-      bottom[1]->gpu_data(),  // b
-      diff_.mutable_gpu_data());  // a_i-b_i
-  caffe_gpu_powx(
-      count,
-      diff_.mutable_gpu_data(),  // a_i-b_i
-      Dtype(2),
-      diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
-  caffe_gpu_gemv(
-      CblasNoTrans,
-      bottom[0]->num(),
-      bottom[0]->channels(),
-      Dtype(1.0),
-      diff_sq_.gpu_data(),  // (a_i-b_i)^2
-      summer_vec_.gpu_data(),
-      Dtype(0.0),
-      dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
-  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-  bool legacy_version =
-      this->layer_param_.contrastive_loss_param().legacy_version();
-  Dtype loss(0.0);
-  for (int i = 0; i < bottom[0]->num(); ++i) {
-    if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
-      loss += dist_sq_.cpu_data()[i];
-    } else {  // dissimilar pairs
-      if (legacy_version) {
-        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
-      } else {
-        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]),
-                              Dtype(0.0));
-        loss += dist*dist;
-      }
-    }
-  }
-  loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
-  top[0]->mutable_cpu_data()[0] = loss;
-}
-
-template <typename Dtype>
-__global__ void CLLBackward(const int count, const int channels,
-    const Dtype margin, const bool legacy_version, const Dtype alpha,
-    const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
-    Dtype *bottom_diff) {
-  CUDA_KERNEL_LOOP(i, count) {
-    int n = i / channels;  // the num index, to access y and dist_sq
-    if (static_cast<int>(y[n])) {  // similar pairs
-      bottom_diff[i] = alpha * diff[i];
-    } else {  // dissimilar pairs
-      Dtype mdist(0.0);
-      Dtype beta(0.0);
-      if (legacy_version) {
-        mdist = (margin - dist_sq[n]);
-        beta = -alpha;
-      } else {
-        Dtype dist = sqrt(dist_sq[n]);
-        mdist = (margin - dist);
-        beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
-      }
-      if (mdist > 0.0) {
-        bottom_diff[i] = beta;
-      } else {
-        bottom_diff[i] = 0;
-      }
-    }
-  }
-}
-
-template <typename Dtype>
-void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < 2; ++i) {
-    if (propagate_down[i]) {
-      const int count = bottom[0]->count();
-      const int channels = bottom[0]->channels();
-      Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-      const bool legacy_version =
-          this->layer_param_.contrastive_loss_param().legacy_version();
-      const Dtype sign = (i == 0) ? 1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
-          static_cast<Dtype>(bottom[0]->num());
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      CLLBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-          count, channels, margin, legacy_version, alpha,
-          bottom[2]->gpu_data(),  // pair similarity 0 or 1
-          diff_.gpu_data(),  // the cached eltwise difference between a and b
-          dist_sq_.gpu_data(),  // the cached square distance between a and b
-          bottom[i]->mutable_gpu_diff());
-      CUDA_POST_KERNEL_CHECK;
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ContrastiveLossLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/conv_layer.cu b/src/caffe/layers/cufiles/conv_layer.cu
deleted file mode 100644
index b8a98ff7..00000000
--- a/src/caffe/layers/cufiles/conv_layer.cu
+++ /dev/null
@@ -1,64 +0,0 @@
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    for (int n = 0; n < this->num_; ++n) {
-      this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->gpu_data();
-        this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
-      }
-    }
-  }
-}
-
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      const Dtype* bottom_data = bottom[i]->gpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n),
-              top_diff + top[i]->offset(n), weight_diff);
-        }
-        // gradient w.r.t. bottom data, if necessary.
-        if (propagate_down[i]) {
-          this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/cudnn_conv_layer.cu b/src/caffe/layers/cufiles/cudnn_conv_layer.cu
deleted file mode 100644
index b4e802e1..00000000
--- a/src/caffe/layers/cufiles/cudnn_conv_layer.cu
+++ /dev/null
@@ -1,160 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-__global__ void sync_conv_groups() { }
-
-template <typename Dtype>
-void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    const Dtype* weight = this->blobs_[0]->gpu_data();
-
-    size_t workspace_limit_bytes = this->kernel_h_ *
-                                   this->kernel_w_ *
-                                   this->channels_ *
-                                   sizeof(int) + 1;
-
-    // Forward through cuDNN in parallel over groups.
-    for (int g = 0; g < this->group_; g++) {
-      cudnnConvolutionFwdAlgo_t algo;
-
-      // pick the convolution algorithm
-      // TODO(shelhamer) this should be done during reshape
-      // TODO(shelhamer) the choice of automatic or manual algorithm picking
-      // should be exposed in proto
-      CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g],
-        bottom_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        top_descs_[i],
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_limit_bytes,  // memoryLimitInBytes,
-        &algo));
-
-      // get minimum size of the workspace needed for the desired algorithm
-      size_t workspaceSizeInBytes_temp = 0;
-
-      CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g],
-        bottom_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        top_descs_[i],
-        algo,
-        &workspaceSizeInBytes_temp));
-
-      if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
-        workspaceSizeInBytes = workspaceSizeInBytes_temp;
-        // free the existing workspace and allocate a new (larger) one
-        cudaFree(this->workspace);
-        cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes);
-        if (err != cudaSuccess) {
-          // force zero memory path
-          algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-          workspace = NULL;
-          workspaceSizeInBytes = 0;
-        }
-      }
-
-      // Filters.
-      CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
-            cudnn::dataType<Dtype>::one,
-            bottom_descs_[i], bottom_data + bottom_offset_ * g,
-            filter_desc_, weight + weight_offset_ * g,
-            conv_descs_[i],
-            algo, workspace, workspaceSizeInBytes,
-            cudnn::dataType<Dtype>::zero,
-            top_descs_[i], top_data + top_offset_ * g));
-
-      // Bias.
-      if (this->bias_term_) {
-        const Dtype* bias_data = this->blobs_[1]->gpu_data();
-        CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C,
-              cudnn::dataType<Dtype>::one,
-              bias_desc_, bias_data + bias_offset_ * g,
-              cudnn::dataType<Dtype>::one,
-              top_descs_[i], top_data + top_offset_ * g));
-      }
-    }
-
-    // Synchronize the work across groups, each of which went into its own
-    // stream, by launching an empty kernel into the default (null) stream.
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    sync_conv_groups<<<1, 1>>>();
-  }
-}
-
-template <typename Dtype>
-void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = NULL;
-  Dtype* weight_diff = NULL;
-  if (this->param_propagate_down_[0]) {
-    weight = this->blobs_[0]->gpu_data();
-    weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  }
-  Dtype* bias_diff = NULL;
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    bias_diff = this->blobs_[1]->mutable_gpu_diff();
-  }
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    // Backward through cuDNN in parallel over groups and gradients.
-    for (int g = 0; g < this->group_; g++) {
-      // Gradient w.r.t. bias.
-      if (this->bias_term_ && this->param_propagate_down_[1]) {
-        CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g],
-              cudnn::dataType<Dtype>::one,
-              top_descs_[i],  top_diff + top_offset_ * g,
-              cudnn::dataType<Dtype>::one,
-              bias_desc_, bias_diff + bias_offset_ * g));
-      }
-
-      // Gradient w.r.t. weights.
-      if (this->param_propagate_down_[0]) {
-        const Dtype* bottom_data = bottom[i]->gpu_data();
-        CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g],
-              cudnn::dataType<Dtype>::one,
-              bottom_descs_[i], bottom_data + bottom_offset_ * g,
-              top_descs_[i],    top_diff + top_offset_ * g,
-              conv_descs_[i],
-              cudnn::dataType<Dtype>::one,
-              filter_desc_, weight_diff + weight_offset_ * g));
-      }
-
-      // Gradient w.r.t. bottom data.
-      if (propagate_down[i]) {
-        if (weight == NULL) {
-          weight = this->blobs_[0]->gpu_data();
-        }
-        Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-        CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g],
-              cudnn::dataType<Dtype>::one,
-              filter_desc_, weight + weight_offset_ * g,
-              top_descs_[i], top_diff + top_offset_ * g,
-              conv_descs_[i],
-              cudnn::dataType<Dtype>::zero,
-              bottom_descs_[i], bottom_diff + bottom_offset_ * g));
-      }
-    }
-
-    // Synchronize the work across groups, each of which went into its own
-    // stream, by launching an empty kernel into the default (null) stream.
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    sync_conv_groups<<<1, 1>>>();
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_pooling_layer.cu b/src/caffe/layers/cufiles/cudnn_pooling_layer.cu
deleted file mode 100644
index a952b855..00000000
--- a/src/caffe/layers/cufiles/cudnn_pooling_layer.cu
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifdef USE_CUDNN
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_,
-        cudnn::dataType<Dtype>::one,
-        bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNPoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_,
-        cudnn::dataType<Dtype>::one,
-        top_desc_, top_data, top_desc_, top_diff,
-        bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNPoolingLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_relu_layer.cu b/src/caffe/layers/cufiles/cudnn_relu_layer.cu
deleted file mode 100644
index 21d14857..00000000
--- a/src/caffe/layers/cufiles/cudnn_relu_layer.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  // Fallback to standard Caffe for leaky ReLU.
-  if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
-    return ReLULayer<Dtype>::Forward_gpu(bottom, top);
-  }
-
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(this->handle_,
-        CUDNN_ACTIVATION_RELU,
-        cudnn::dataType<Dtype>::one,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-
-  // Fallback to standard Caffe for leaky ReLU.
-  if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
-    return ReLULayer<Dtype>::Backward_gpu(top, propagate_down, bottom);
-  }
-
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
-        CUDNN_ACTIVATION_RELU,
-        cudnn::dataType<Dtype>::one,
-        this->top_desc_, top_data, this->top_desc_, top_diff,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu b/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
deleted file mode 100644
index 7a06cf72..00000000
--- a/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(this->handle_,
-        CUDNN_ACTIVATION_SIGMOID,
-        cudnn::dataType<Dtype>::one,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
-        CUDNN_ACTIVATION_SIGMOID,
-        cudnn::dataType<Dtype>::one,
-        this->top_desc_, top_data, this->top_desc_, top_diff,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_softmax_layer.cu b/src/caffe/layers/cufiles/cudnn_softmax_layer.cu
deleted file mode 100644
index a9e2fcef..00000000
--- a/src/caffe/layers/cufiles/cudnn_softmax_layer.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "thrust/device_vector.h"
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE,
-        CUDNN_SOFTMAX_MODE_CHANNEL,
-        cudnn::dataType<Dtype>::one,
-        bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-    CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE,
-          CUDNN_SOFTMAX_MODE_CHANNEL,
-          cudnn::dataType<Dtype>::one,
-          top_desc_, top_data, top_desc_, top_diff,
-          cudnn::dataType<Dtype>::zero,
-          bottom_desc_, bottom_diff));
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSoftmaxLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/cudnn_tanh_layer.cu b/src/caffe/layers/cufiles/cudnn_tanh_layer.cu
deleted file mode 100644
index d287f6fe..00000000
--- a/src/caffe/layers/cufiles/cudnn_tanh_layer.cu
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(this->handle_,
-        CUDNN_ACTIVATION_TANH,
-        cudnn::dataType<Dtype>::one,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNTanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
-        CUDNN_ACTIVATION_TANH,
-        cudnn::dataType<Dtype>::one,
-        this->top_desc_, top_data, this->top_desc_, top_diff,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cufiles/deconv_layer.cu b/src/caffe/layers/cufiles/deconv_layer.cu
deleted file mode 100644
index 39bc4de8..00000000
--- a/src/caffe/layers/cufiles/deconv_layer.cu
+++ /dev/null
@@ -1,64 +0,0 @@
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    for (int n = 0; n < this->num_; ++n) {
-      this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->gpu_data();
-        this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
-      }
-    }
-  }
-}
-
-template <typename Dtype>
-void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      for (int n = 0; n < this->num_; ++n) {
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(top_diff + top[i]->offset(n),
-              bottom_data + bottom[i]->offset(n), weight_diff);
-        }
-        // gradient w.r.t. bottom data, if necessary.
-        if (propagate_down[i]) {
-          this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(DeconvolutionLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/dropout_layer.cu b/src/caffe/layers/cufiles/dropout_layer.cu
deleted file mode 100644
index f9ea04f4..00000000
--- a/src/caffe/layers/cufiles/dropout_layer.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <algorithm>
-#include <limits>
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/syncedmem.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-
-template <typename Dtype>
-__global__ void DropoutForward(const int n, const Dtype* in,
-    const unsigned int* mask, const unsigned int threshold, const float scale,
-    Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] * (mask[index] > threshold) * scale;
-  }
-}
-
-template <typename Dtype>
-void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  if (this->phase_ == TRAIN) {
-    unsigned int* mask =
-        static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
-    caffe_gpu_rng_uniform(count, mask);
-    // set thresholds
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, mask, uint_thres_, scale_, top_data);
-    CUDA_POST_KERNEL_CHECK;
-  } else {
-    caffe_copy(count, bottom_data, top_data);
-  }
-}
-
-template <typename Dtype>
-__global__ void DropoutBackward(const int n, const Dtype* in_diff,
-    const unsigned int* mask, const unsigned int threshold, const float scale,
-    Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out_diff[index] = in_diff[index] * scale * (mask[index] > threshold);
-  }
-}
-
-template <typename Dtype>
-void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    if (this->phase_ == TRAIN) {
-      const unsigned int* mask =
-          static_cast<const unsigned int*>(rand_vec_.gpu_data());
-      const int count = bottom[0]->count();
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      DropoutBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
-        CAFFE_CUDA_NUM_THREADS>>>(
-          count, top_diff, mask, uint_thres_, scale_, bottom_diff);
-      CUDA_POST_KERNEL_CHECK;
-    } else {
-      caffe_copy(top[0]->count(), top_diff, bottom_diff);
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(DropoutLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/eltwise_layer.cu b/src/caffe/layers/cufiles/eltwise_layer.cu
deleted file mode 100644
index 2247870d..00000000
--- a/src/caffe/layers/cufiles/eltwise_layer.cu
+++ /dev/null
@@ -1,135 +0,0 @@
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a,
-    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
-    int* mask) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    Dtype maxval = -FLT_MAX;
-    int maxidx = -1;
-    if (bottom_data_a[index] > bottom_data_b[index]) {
-      // only update for very first bottom_data blob (blob_idx == 0)
-      if (blob_idx == 0) {
-        maxval = bottom_data_a[index];
-        top_data[index] = maxval;
-        maxidx = blob_idx;
-        mask[index] = maxidx;
-      }
-    } else {
-      maxval = bottom_data_b[index];
-      top_data[index] = maxval;
-      maxidx = blob_idx + 1;
-      mask[index] = maxidx;
-    }
-  }
-}
-
-template <typename Dtype>
-void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int* mask = NULL;
-  const int count = top[0]->count();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  switch (op_) {
-  case EltwiseParameter_EltwiseOp_PROD:
-    caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
-        top_data);
-    for (int i = 2; i < bottom.size(); ++i) {
-      caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data);
-    }
-    break;
-  case EltwiseParameter_EltwiseOp_SUM:
-    caffe_gpu_set(count, Dtype(0.), top_data);
-    // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
-    for (int i = 0; i < bottom.size(); ++i) {
-      caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data);
-    }
-    break;
-  case EltwiseParameter_EltwiseOp_MAX:
-    mask = max_idx_.mutable_gpu_data();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxForward<Dtype> <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
-    for (int i = 2; i < bottom.size(); ++i) {
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      MaxForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-          count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask);
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown elementwise operation.";
-  }
-}
-
-template <typename Dtype>
-__global__ void MaxBackward(const int nthreads, const Dtype* top_diff,
-    const int blob_idx, const int* mask, Dtype* bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    Dtype gradient = 0;
-    if (mask[index] == blob_idx) {
-      gradient += top_diff[index];
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-
-template <typename Dtype>
-void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const int* mask = NULL;
-  const int count = top[0]->count();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (propagate_down[i]) {
-      const Dtype* bottom_data = bottom[i]->gpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      switch (op_) {
-      case EltwiseParameter_EltwiseOp_PROD:
-        if (stable_prod_grad_) {
-          bool initialized = false;
-          for (int j = 0; j < bottom.size(); ++j) {
-            if (i == j) { continue; }
-            if (!initialized) {
-              caffe_copy(count, bottom[j]->gpu_data(), bottom_diff);
-              initialized = true;
-            } else {
-              caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
-                            bottom_diff);
-            }
-          }
-        } else {
-          caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
-        }
-        caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-        break;
-      case EltwiseParameter_EltwiseOp_SUM:
-        if (coeffs_[i] == Dtype(1.)) {
-          caffe_copy(count, top_diff, bottom_diff);
-        } else {
-          caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
-        }
-        break;
-      case EltwiseParameter_EltwiseOp_MAX:
-        mask = max_idx_.gpu_data();
-        MaxBackward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-            <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-            count, top_diff, i, mask, bottom_diff);
-        break;
-      default:
-        LOG(FATAL) << "Unknown elementwise operation.";
-      }
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(EltwiseLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/euclidean_loss_layer.cu b/src/caffe/layers/cufiles/euclidean_loss_layer.cu
deleted file mode 100644
index 5b1de3ad..00000000
--- a/src/caffe/layers/cufiles/euclidean_loss_layer.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int count = bottom[0]->count();
-  caffe_gpu_sub(
-      count,
-      bottom[0]->gpu_data(),
-      bottom[1]->gpu_data(),
-      diff_.mutable_gpu_data());
-  Dtype dot;
-  caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
-  Dtype loss = dot / bottom[0]->num() / Dtype(2);
-  top[0]->mutable_cpu_data()[0] = loss;
-}
-
-template <typename Dtype>
-void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < 2; ++i) {
-    if (propagate_down[i]) {
-      const Dtype sign = (i == 0) ? 1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
-      caffe_gpu_axpby(
-          bottom[i]->count(),              // count
-          alpha,                              // alpha
-          diff_.gpu_data(),                   // a
-          Dtype(0),                           // beta
-          bottom[i]->mutable_gpu_diff());  // b
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(EuclideanLossLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/exp_layer.cu b/src/caffe/layers/cufiles/exp_layer.cu
deleted file mode 100644
index 2d75d8dd..00000000
--- a/src/caffe/layers/cufiles/exp_layer.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  if (inner_scale_ == Dtype(1)) {
-    caffe_gpu_exp(count, bottom_data, top_data);
-  } else {
-    caffe_gpu_scale(count, inner_scale_, bottom_data, top_data);
-    caffe_gpu_exp(count, top_data, top_data);
-  }
-  if (outer_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, outer_scale_, top_data);
-  }
-}
-
-template <typename Dtype>
-void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  const int count = bottom[0]->count();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  caffe_gpu_mul(count, top_data, top_diff, bottom_diff);
-  if (inner_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, inner_scale_, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ExpLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/filter_layer.cu b/src/caffe/layers/cufiles/filter_layer.cu
deleted file mode 100644
index cf929eee..00000000
--- a/src/caffe/layers/cufiles/filter_layer.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  int new_tops_num = indices_to_forward_.size();
-  // forward all filtered items for all bottoms but the Selector (bottom[last])
-  for (int t = 0; t < top.size(); ++t) {
-    const Dtype* bottom_data = bottom[t]->gpu_data();
-    Dtype* top_data = top[t]->mutable_gpu_data();
-    int dim = bottom[t]->count() / bottom[t]->shape(0);
-    for (int n = 0; n < new_tops_num; ++n) {
-      int data_offset_top = n * dim;
-      int data_offset_bottom = indices_to_forward_[n] * dim;
-      caffe_copy(dim, bottom_data + data_offset_bottom,
-          top_data + data_offset_top);
-    }
-  }
-}
-
-template <typename Dtype>
-void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[bottom.size() - 1]) {
-    LOG(FATAL) << this->type()
-               << "Layer cannot backpropagate to filter index inputs";
-  }
-  for (int i = 0; i < top.size(); ++i) {
-    // bottom[last] is the selector and never needs backpropagation
-    // so we can iterate over top vector because top.size() == bottom.size() -1
-    if (propagate_down[i]) {
-      const int dim = top[i]->count() / top[i]->shape(0);
-      int next_to_backward_offset = 0;
-      int batch_offset = 0;
-      int data_offset_bottom = 0;
-      int data_offset_top = 0;
-      for (int n = 0; n < bottom[i]->shape(0); ++n) {
-        if (next_to_backward_offset >= indices_to_forward_.size()) {
-          // we already visited all items that were been forwarded, so
-          // just set to zero remaining ones
-          data_offset_bottom = n * dim;
-          caffe_gpu_set(dim, Dtype(0),
-              bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-        } else {
-          batch_offset = indices_to_forward_[next_to_backward_offset];
-          data_offset_bottom = n * dim;
-          if (n != batch_offset) {  // this data was not been forwarded
-            caffe_gpu_set(dim, Dtype(0),
-                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-          } else {  // this data was been forwarded
-            data_offset_top = next_to_backward_offset * dim;
-            ++next_to_backward_offset;  // point to next forwarded item index
-            caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top,
-                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-          }
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(FilterLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/hdf5_data_layer.cu b/src/caffe/layers/cufiles/hdf5_data_layer.cu
deleted file mode 100644
index 5e3e4ced..00000000
--- a/src/caffe/layers/cufiles/hdf5_data_layer.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-TODO:
-- only load parts of the file, in accordance with a prototxt param "max_mem"
-*/
-
-#include <stdint.h>
-#include <string>
-#include <vector>
-
-#include "hdf5.h"
-#include "hdf5_hl.h"
-
-#include "caffe/data_layers.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-  for (int i = 0; i < batch_size; ++i, ++current_row_) {
-    if (current_row_ == hdf_blobs_[0]->shape(0)) {
-      if (num_files_ > 1) {
-        current_file_ += 1;
-        if (current_file_ == num_files_) {
-          current_file_ = 0;
-          if (this->layer_param_.hdf5_data_param().shuffle()) {
-            std::random_shuffle(file_permutation_.begin(),
-                                file_permutation_.end());
-          }
-          DLOG(INFO) << "Looping around to first file.";
-        }
-        LoadHDF5FileData(
-            hdf_filenames_[file_permutation_[current_file_]].c_str());
-      }
-      current_row_ = 0;
-      if (this->layer_param_.hdf5_data_param().shuffle())
-        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-    }
-    for (int j = 0; j < this->layer_param_.top_size(); ++j) {
-      int data_dim = top[j]->count() / top[j]->shape(0);
-      caffe_copy(data_dim,
-          &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-            * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]);
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/hdf5_output_layer.cu b/src/caffe/layers/cufiles/hdf5_output_layer.cu
deleted file mode 100644
index ae497c34..00000000
--- a/src/caffe/layers/cufiles/hdf5_output_layer.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-#include <vector>
-
-#include "hdf5.h"
-#include "hdf5_hl.h"
-
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_GE(bottom.size(), 2);
-  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
-  data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-                     bottom[0]->height(), bottom[0]->width());
-  label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
-                     bottom[1]->height(), bottom[1]->width());
-  const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
-  const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
-
-  for (int i = 0; i < bottom[0]->num(); ++i) {
-    caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim],
-        &data_blob_.mutable_cpu_data()[i * data_datum_dim]);
-    caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim],
-        &label_blob_.mutable_cpu_data()[i * label_datum_dim]);
-  }
-  SaveBlobs();
-}
-
-template <typename Dtype>
-void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  return;
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/im2col_layer.cu b/src/caffe/layers/cufiles/im2col_layer.cu
deleted file mode 100644
index 9c338b14..00000000
--- a/src/caffe/layers/cufiles/im2col_layer.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/im2col.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  for (int n = 0; n < bottom[0]->num(); ++n) {
-    im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, top_data + top[0]->offset(n));
-  }
-}
-
-template <typename Dtype>
-void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  for (int n = 0; n < top[0]->num(); ++n) {
-    col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(Im2colLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/inner_product_layer.cu b/src/caffe/layers/cufiles/inner_product_layer.cu
deleted file mode 100644
index d93560a0..00000000
--- a/src/caffe/layers/cufiles/inner_product_layer.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-#include <vector>
-
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1.,
-      bottom_data, 0, weight, 0, (Dtype)0., top_data, 0);
-  if (bias_term_) {
-    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1.,
-        bias_multiplier_.gpu_data(),0,
-        this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0);
-  }
-}
-
-template <typename Dtype>
-void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (this->param_propagate_down_[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    // Gradient with respect to weight
-    caffe_gpu_gemm_ex<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0);
-  }
-  if (bias_term_ && this->param_propagate_down_[1]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    // Gradient with respect to bias
-    caffe_gpu_gemvv<Dtype>(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff,
-        (size_t)0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_->gpu_data()),
-         (size_t)0, (Dtype)0., 1,
-        this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1);
-  }
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    // Gradient with respect to bottom data
-    caffe_gpu_gemm_ex<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
-        top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0.,
-        bottom[0]->mutable_gpu_diff(), 0);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(InnerProductLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/log_layer.cu b/src/caffe/layers/cufiles/log_layer.cu
deleted file mode 100644
index 847c86cd..00000000
--- a/src/caffe/layers/cufiles/log_layer.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/neuron_layers.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
-    caffe_gpu_log(count, bottom_data, top_data);
-  } else {
-    caffe_copy(count, bottom_data, top_data);
-    if (input_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, input_scale_, top_data);
-    }
-    if (input_shift_ != Dtype(0)) {
-      caffe_gpu_add_scalar(count, input_shift_, top_data);
-    }
-    caffe_gpu_log(count, top_data, top_data);
-  }
-  if (base_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, base_scale_, top_data);
-  }
-}
-
-template <typename Dtype>
-void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-    const int count = bottom[0]->count();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_copy(count, bottom_data, bottom_diff);
-    if (input_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, input_scale_, bottom_diff);
-    }
-    if (input_shift_ != Dtype(0)) {
-      caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
-    }
-    caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
-    if (backward_num_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
-    }
-    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(LogLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/lrn_layer.cu b/src/caffe/layers/cufiles/lrn_layer.cu
deleted file mode 100644
index 001b3c34..00000000
--- a/src/caffe/layers/cufiles/lrn_layer.cu
+++ /dev/null
@@ -1,203 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void LRNFillScale(const int nthreads, const Dtype* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype alpha_over_size,
-    const Dtype k, Dtype* const scale) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    const Dtype* const in_off = in + offset;
-    Dtype* const scale_off = scale + offset;
-    int head = 0;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    Dtype accum_scale = 0;
-    // fill the scale at [n, :, h, w]
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_scale += in_off[head * step] * in_off[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_scale += in_off[head * step] * in_off[head * step];
-      if (head - size >= 0) {
-        accum_scale -= in_off[(head - size) * step]
-                       * in_off[(head - size) * step];
-      }
-      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_scale -= in_off[(head - size) * step]
-                       * in_off[(head - size) * step];
-      }
-      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-  }
-}
-
-
-template <typename Dtype>
-void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelForward_gpu(bottom, top);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelForward(bottom, top);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
-}
-
-// TODO: check if it would be faster to just put it into the previous kernel.
-template <typename Dtype>
-__global__ void LRNComputeOutput(const int nthreads, const Dtype* const in,
-    const Dtype* const scale, const Dtype negative_beta, Dtype* const out) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    out[index] = in[index] * pow(scale[index], negative_beta);
-  }
-}
-
-template <typename Dtype>
-void LRNLayer<Dtype>::CrossChannelForward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, compute scale
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  // We will launch one kernel for each pixel location, and have the kernel
-  // go through all the channels.
-  int n_threads = num_ * height_ * width_;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNFillScale<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
-      n_threads, bottom_data, num_, channels_, height_, width_, size_,
-      alpha_ / size_, k_, scale_data);
-  CUDA_POST_KERNEL_CHECK;
-  n_threads = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNComputeOutput<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
-      n_threads, bottom_data, scale_data, -beta_, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-template void LRNLayer<float>::CrossChannelForward_gpu(
-    const vector<Blob<float>*>& bottom, const vector<Blob<float>*>& top);
-template void LRNLayer<double>::CrossChannelForward_gpu(
-    const vector<Blob<double>*>& bottom, const vector<Blob<double>*>& top);
-
-
-template <typename Dtype>
-void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelBackward_gpu(top, propagate_down, bottom);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelBackward(top, propagate_down, bottom);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
-}
-
-template <typename Dtype>
-__global__ void LRNComputeDiff(const int nthreads,
-    const Dtype* const bottom_data, const Dtype* const top_data,
-    const Dtype* const scale, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype negative_beta,
-    const Dtype cache_ratio, Dtype* const bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    const Dtype* const bottom_off = bottom_data + offset;
-    const Dtype* const top_off = top_data + offset;
-    const Dtype* const scale_off = scale + offset;
-    const Dtype* const top_diff_off = top_diff + offset;
-    Dtype* const bottom_diff_off = bottom_diff + offset;
-    int head = 0;
-    const int pre_pad = size - (size + 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    Dtype accum_ratio = 0;
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
-          scale_off[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
-          scale_off[head * step];
-      if (head - size >= 0) {
-        accum_ratio -= top_diff_off[(head - size) * step] *
-            top_off[(head - size) * step] / scale_off[(head - size) * step];
-      }
-      bottom_diff_off[(head - post_pad) * step] =
-          top_diff_off[(head - post_pad) * step]
-            * pow(scale_off[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_ratio -= top_diff_off[(head - size) * step] *
-            top_off[(head - size) * step] / scale_off[(head - size) * step];
-      }
-      bottom_diff_off[(head - post_pad) * step] =
-          top_diff_off[(head - post_pad) * step]
-            * pow(scale_off[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-  }
-}
-
-template <typename Dtype>
-void LRNLayer<Dtype>::CrossChannelBackward_gpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  int n_threads = num_ * height_ * width_;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNComputeDiff<<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS>>>(
-      n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
-      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
-      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
-      bottom[0]->mutable_gpu_diff());
-}
-template void LRNLayer<float>::CrossChannelBackward_gpu(
-    const vector<Blob<float>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<float>*>& bottom);
-template void LRNLayer<double>::CrossChannelBackward_gpu(
-    const vector<Blob<double>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<double>*>& bottom);
-
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/mvn_layer.cu b/src/caffe/layers/cufiles/mvn_layer.cu
deleted file mode 100644
index 3888a0c7..00000000
--- a/src/caffe/layers/cufiles/mvn_layer.cu
+++ /dev/null
@@ -1,124 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/common_layers.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int num;
-  if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
-  else
-    num = bottom[0]->num() * bottom[0]->channels();
-
-  int dim = bottom[0]->count() / num;
-
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    // put the squares of bottom into temp_
-    caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
-        temp_.mutable_gpu_data());
-
-    // computes variance using var(X) = E(X^2) - (EX)^2
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-        sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
-        sum_multiplier_.gpu_data(), 0.,
-        variance_.mutable_gpu_data());  // E(X^2)
-    caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
-        temp_.mutable_gpu_data());  // (EX)^2
-    caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
-        variance_.mutable_gpu_data());  // variance
-
-    // do mean and variance normalization
-    // subtract mean
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-
-    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
-
-    // normalize variance
-    caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
-          variance_.mutable_gpu_data());
-
-    caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
-
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-          variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-          temp_.mutable_gpu_data());
-
-    caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
-  } else {
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-
-    // subtract mean
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-
-    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
-  }
-}
-
-template <typename Dtype>
-void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-  int num;
-  if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
-  else
-    num = bottom[0]->num() * bottom[0]->channels();
-
-  int dim = bottom[0]->count() / num;
-
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., bottom_diff,
-          sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-          mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-          bottom_diff);
-    caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
-
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., top_diff,
-            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 1.,
-            bottom_diff);
-
-    caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
-        bottom_diff);
-
-    // put the squares of bottom into temp_
-    caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2),
-        temp_.mutable_gpu_data());
-
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-        variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-        temp_.mutable_gpu_data());
-
-    caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
-  } else {
-    caffe_copy(temp_.count(), top_diff, bottom_diff);
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(MVNLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/pooling_layer.cu b/src/caffe/layers/cufiles/pooling_layer.cu
deleted file mode 100644
index ca4b13f7..00000000
--- a/src/caffe/layers/cufiles/pooling_layer.cu
+++ /dev/null
@@ -1,387 +0,0 @@
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void MaxPoolForward(const int nthreads,
-    const Dtype* const bottom_data, const int num, const int channels,
-    const int height, const int width, const int pooled_height,
-    const int pooled_width, const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-    Dtype* const top_data, int* mask, Dtype* top_mask) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    const int hend = min(hstart + kernel_h, height);
-    const int wend = min(wstart + kernel_w, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    Dtype maxval = -FLT_MAX;
-    int maxidx = -1;
-    const Dtype* const bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        if (bottom_slice[h * width + w] > maxval) {
-          maxidx = h * width + w;
-          maxval = bottom_slice[maxidx];
-        }
-      }
-    }
-    top_data[index] = maxval;
-    if (mask) {
-      mask[index] = maxidx;
-    } else {
-      top_mask[index] = maxidx;
-    }
-  }
-}
-
-template <typename Dtype>
-__global__ void AvePoolForward(const int nthreads,
-    const Dtype* const bottom_data, const int num, const int channels,
-    const int height, const int width, const int pooled_height,
-    const int pooled_width, const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-    Dtype* const top_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    int hend = min(hstart + kernel_h, height + pad_h);
-    int wend = min(wstart + kernel_w, width + pad_w);
-    const int pool_size = (hend - hstart) * (wend - wstart);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    hend = min(hend, height);
-    wend = min(wend, width);
-    Dtype aveval = 0;
-    const Dtype* const bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        aveval += bottom_slice[h * width + w];
-      }
-    }
-    top_data[index] = aveval / pool_size;
-  }
-}
-
-template <typename Dtype>
-__global__ void StoPoolForwardTrain(const int nthreads,
-    const Dtype* const bottom_data,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, Dtype* const rand_idx, Dtype* const top_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
-    Dtype cumsum = 0.;
-    const Dtype* const bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
-    // First pass: get sum
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-      }
-    }
-    const float thres = rand_idx[index] * cumsum;
-    // Second pass: get value, and set index.
-    cumsum = 0;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-        if (cumsum >= thres) {
-          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
-          top_data[index] = bottom_slice[h * width + w];
-          return;
-        }
-      }
-    }
-  }
-}
-
-
-template <typename Dtype>
-__global__ void StoPoolForwardTest(const int nthreads,
-    const Dtype* const bottom_data,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, Dtype* const top_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
-    // We set cumsum to be 0 to avoid divide-by-zero problems
-    Dtype cumsum = FLT_MIN;
-    Dtype cumvalues = 0.;
-    const Dtype* const bottom_slice =
-        bottom_data + (n * channels + c) * height * width;
-    // First pass: get sum
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_slice[h * width + w];
-        cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
-      }
-    }
-    top_data[index] = cumvalues / cumsum;
-  }
-}
-
-
-template <typename Dtype>
-void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int count = top[0]->count();
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  int* mask = NULL;
-  Dtype* top_mask = NULL;
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    if (use_top_mask) {
-      top_mask = top[1]->mutable_gpu_data();
-    } else {
-      mask = max_idx_.mutable_gpu_data();
-    }
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
-        mask, top_mask);
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    if (this->phase_ == TRAIN) {
-      // We need to create the random index as well.
-      caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
-                            rand_idx_.mutable_gpu_data());
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      StoPoolForwardTrain<Dtype><<<CAFFE_GET_BLOCKS(count),
-                                   CAFFE_CUDA_NUM_THREADS>>>(
-          count, bottom_data, bottom[0]->num(), channels_,
-          height_, width_, pooled_height_, pooled_width_, kernel_h_,
-          kernel_w_, stride_h_, stride_w_,
-          rand_idx_.mutable_gpu_data(), top_data);
-    } else {
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      StoPoolForwardTest<Dtype><<<CAFFE_GET_BLOCKS(count),
-                                  CAFFE_CUDA_NUM_THREADS>>>(
-          count, bottom_data, bottom[0]->num(), channels_,
-          height_, width_, pooled_height_, pooled_width_, kernel_h_,
-          kernel_w_, stride_h_, stride_w_, top_data);
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
-  CUDA_POST_KERNEL_CHECK;
-}
-
-
-template <typename Dtype>
-__global__ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
-    const int* const mask, const Dtype* const top_mask, const int num,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w, Dtype* const bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local index
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int c = (index / width / height) % channels;
-    const int n = index / width / height / channels;
-    const int phstart =
-         (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
-    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
-    const int pwstart =
-         (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
-    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
-    Dtype gradient = 0;
-    const int offset = (n * channels + c) * pooled_height * pooled_width;
-    const Dtype* const top_diff_slice = top_diff + offset;
-    if (mask) {
-      const int* const mask_slice = mask + offset;
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (mask_slice[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff_slice[ph * pooled_width + pw];
-          }
-        }
-      }
-    } else {
-      const Dtype* const top_mask_slice = top_mask + offset;
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (top_mask_slice[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff_slice[ph * pooled_width + pw];
-          }
-        }
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-
-template <typename Dtype>
-__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, const int pad_h, const int pad_w,
-    Dtype* const bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local index
-    // find out the local offset
-    const int w = index % width + pad_w;
-    const int h = (index / width) % height + pad_h;
-    const int c = (index / width / height) % channels;
-    const int n = index / width / height / channels;
-    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-    const int phend = min(h / stride_h + 1, pooled_height);
-    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-    const int pwend = min(w / stride_w + 1, pooled_width);
-    Dtype gradient = 0;
-    const Dtype* const top_diff_slice =
-        top_diff + (n * channels + c) * pooled_height * pooled_width;
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        // figure out the pooling size
-        int hstart = ph * stride_h - pad_h;
-        int wstart = pw * stride_w - pad_w;
-        int hend = min(hstart + kernel_h, height + pad_h);
-        int wend = min(wstart + kernel_w, width + pad_w);
-        int pool_size = (hend - hstart) * (wend - wstart);
-        gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-
-
-template <typename Dtype>
-__global__ void StoPoolBackward(const int nthreads,
-    const Dtype* const rand_idx, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, Dtype* const bottom_diff) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    // find out the local index
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int c = (index / width / height) % channels;
-    const int n = index / width / height / channels;
-    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-    const int phend = min(h / stride_h + 1, pooled_height);
-    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-    const int pwend = min(w / stride_w + 1, pooled_width);
-    Dtype gradient = 0;
-    const Dtype* const rand_idx_slice =
-        rand_idx + (n * channels + c) * pooled_height * pooled_width;
-    const Dtype* const top_diff_slice =
-        top_diff + (n * channels + c) * pooled_height * pooled_width;
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        gradient += top_diff_slice[ph * pooled_width + pw] *
-            (index == static_cast<int>(rand_idx_slice[ph * pooled_width + pw]));
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
-}
-
-
-template <typename Dtype>
-void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  const int count = bottom[0]->count();
-  caffe_gpu_set(count, Dtype(0.), bottom_diff);
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  const int* mask = NULL;
-  const Dtype* top_mask = NULL;
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    if (use_top_mask) {
-      top_mask = top[1]->gpu_data();
-    } else {
-      mask = max_idx_.gpu_data();
-    }
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, mask, top_mask, top[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_,
-        kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
-        bottom_diff);
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, top[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    StoPoolBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, rand_idx_.gpu_data(), top_diff,
-        top[0]->num(), channels_, height_, width_, pooled_height_,
-        pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
-        bottom_diff);
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
-  CUDA_POST_KERNEL_CHECK;
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(PoolingLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/power_layer.cu b/src/caffe/layers/cufiles/power_layer.cu
deleted file mode 100644
index 90d94405..00000000
--- a/src/caffe/layers/cufiles/power_layer.cu
+++ /dev/null
@@ -1,87 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // Special case where we can ignore the input: scale or power is 0.
-  if (diff_scale_ == Dtype(0)) {
-    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
-    caffe_gpu_set(count, value, top_data);
-    return;
-  }
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  caffe_copy(count, bottom_data, top_data);
-  if (scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, scale_, top_data);
-  }
-  if (shift_ != Dtype(0)) {
-    caffe_gpu_add_scalar(count, shift_, top_data);
-  }
-  if (power_ != Dtype(1)) {
-    caffe_gpu_powx(count, top_data, power_, top_data);
-  }
-}
-
-template <typename Dtype>
-void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
-      caffe_gpu_set(count, diff_scale_, bottom_diff);
-    } else {
-      const Dtype* bottom_data = bottom[0]->gpu_data();
-      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
-      //               = diff_scale * y / (shift + scale * x)
-      if (power_ == Dtype(2)) {
-        // Special case for y = (shift + scale * x)^2
-        //     -> dy/dx = 2 * scale * (shift + scale * x)
-        //              = diff_scale * shift + diff_scale * scale * x
-        caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
-            Dtype(0), bottom_diff);
-        if (shift_ != Dtype(0)) {
-          caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
-        }
-      } else if (shift_ == Dtype(0)) {
-        // Special case for y = (scale * x)^power
-        //     -> dy/dx = scale * power * (scale * x)^(power - 1)
-        //              = scale * power * (scale * x)^power * (scale * x)^(-1)
-        //              = power * y / x
-        const Dtype* top_data = top[0]->gpu_data();
-        caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
-        caffe_gpu_scal(count, power_, bottom_diff);
-      } else {
-        caffe_copy(count, bottom_data, bottom_diff);
-        if (scale_ != Dtype(1)) {
-          caffe_gpu_scal(count, scale_, bottom_diff);
-        }
-        if (shift_ != Dtype(0)) {
-          caffe_gpu_add_scalar(count, shift_, bottom_diff);
-        }
-        const Dtype* top_data = top[0]->gpu_data();
-        caffe_gpu_div<Dtype>(count, top_data, bottom_diff, bottom_diff);
-        if (diff_scale_ != Dtype(1)) {
-          caffe_gpu_scal(count, diff_scale_, bottom_diff);
-        }
-      }
-    }
-    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(PowerLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/prelu_layer.cu b/src/caffe/layers/cufiles/prelu_layer.cu
deleted file mode 100644
index e1f20048..00000000
--- a/src/caffe/layers/cufiles/prelu_layer.cu
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-// CUDA kernele for forward
-template <typename Dtype>
-__global__ void PReLUForward(const int n, const int channels, const int dim,
-    const Dtype* in, Dtype* out, const Dtype* slope_data,
-    const int div_factor) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int c = (index / dim) % channels / div_factor;
-    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
-  }
-}
-
-// CUDA kernel for bottom backward
-template <typename Dtype>
-__global__ void PReLUBackward(const int n, const int channels, const int dim,
-    const Dtype* in_diff, const Dtype* in_data, Dtype* out_diff,
-    const Dtype* slope_data, const int div_factor) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int c = (index / dim) % channels / div_factor;
-    out_diff[index] = in_diff[index] * ((in_data[index] > 0)
-        + (in_data[index] <= 0) * slope_data[c]);
-  }
-}
-
-// CUDA kernel for element-wise parameter backward
-template <typename Dtype>
-__global__ void PReLUParamBackward(const int n, const Dtype* in_diff,
-    const Dtype* in_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
-  }
-}
-
-template <typename Dtype>
-void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-  const Dtype* slope_data = this->blobs_[0]->gpu_data();
-  const int div_factor = channel_shared_ ? channels : 1;
-
-  // For in-place computation
-  if (top[0] == bottom[0]) {
-    caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
-  }
-
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  PReLUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, channels, dim, bottom_data, top_data, slope_data, div_factor);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-template <typename Dtype>
-void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-
-  // For in-place computation
-  if (top[0] == bottom[0]) {
-    bottom_data = bottom_memory_.gpu_data();
-  }
-
-  // Propagate to param
-  // Since to write bottom diff will affect top diff if top and bottom blobs
-  // are identical (in-place computaion), we first compute param backward to
-  // keep top_diff unchanged.
-  if (this->param_propagate_down_[0]) {
-    Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
-    int cdim = channels * dim;
-    Dtype dsum = 0.;
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      // compute element-wise diff
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      PReLUParamBackward<Dtype><<<CAFFE_GET_BLOCKS(cdim),
-          CAFFE_CUDA_NUM_THREADS>>>(
-          cdim, top_diff + top[0]->offset(n),
-          bottom_data + bottom[0]->offset(n),
-          backward_buff_.mutable_gpu_diff());
-      CUDA_POST_KERNEL_CHECK;
-      if (channel_shared_) {
-        Dtype d;
-        caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
-            multiplier_.gpu_data(), &d);
-        dsum += d;
-      } else {
-        caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
-            backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
-            slope_diff);
-      }
-    }
-    if (channel_shared_) {
-      caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
-    }
-  }
-  // Propagate to bottom
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const Dtype* slope_data = this->blobs_[0]->gpu_data();
-    int div_factor = channel_shared_ ? channels : 1;
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    PReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
-        CAFFE_CUDA_NUM_THREADS>>>(
-        count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data,
-        div_factor);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(PReLULayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/reduction_layer.cu b/src/caffe/layers/cufiles/reduction_layer.cu
deleted file mode 100644
index 2dbd3bc9..00000000
--- a/src/caffe/layers/cufiles/reduction_layer.cu
+++ /dev/null
@@ -1,93 +0,0 @@
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void ReductionLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const Dtype* mult_data = NULL;
-  if (sum_multiplier_.count() > 0) {
-    mult_data = sum_multiplier_.gpu_data();
-  }
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  for (int i = 0; i < num_; ++i) {
-    switch (op_) {
-    case ReductionParameter_ReductionOp_SUM:
-    case ReductionParameter_ReductionOp_MEAN:
-      caffe_gpu_dot(dim_, mult_data, bottom_data, top_data);
-      break;
-    case ReductionParameter_ReductionOp_ASUM:
-      caffe_gpu_asum(dim_, bottom_data, top_data);
-      break;
-    case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data);
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduction op: "
-          << ReductionParameter_ReductionOp_Name(op_);
-    }
-    bottom_data += dim_;
-    ++top_data;
-  }
-  if (coeff_ != Dtype(1)) {
-    // Reset the top_data pointer.
-    top_data = top[0]->mutable_gpu_data();
-    caffe_gpu_scal(num_, coeff_, top_data);
-  }
-}
-
-template <typename Dtype>
-void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  // Get bottom_data, if needed.
-  const Dtype* bottom_data = NULL;
-  switch (op_) {
-  // Operations that don't need bottom_data
-  case ReductionParameter_ReductionOp_SUM:
-  case ReductionParameter_ReductionOp_MEAN:
-    break;
-  // Operations that need bottom_data
-  case ReductionParameter_ReductionOp_ASUM:
-  case ReductionParameter_ReductionOp_SUMSQ:
-    bottom_data = bottom[0]->gpu_data();
-    break;
-  default:
-    LOG(FATAL) << "Unknown reduction op: "
-        << ReductionParameter_ReductionOp_Name(op_);
-  }
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  for (int i = 0; i < num_; ++i) {
-    const Dtype bottom_coeff = (*top_diff) * coeff_;
-    switch (op_) {
-    case ReductionParameter_ReductionOp_SUM:
-    case ReductionParameter_ReductionOp_MEAN:
-      caffe_gpu_set(dim_, bottom_coeff, bottom_diff);
-      break;
-    case ReductionParameter_ReductionOp_ASUM:
-      caffe_gpu_sign(dim_, bottom_data, bottom_diff);
-      caffe_gpu_scal(dim_, bottom_coeff, bottom_diff);
-      break;
-    case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduction op: "
-          << ReductionParameter_ReductionOp_Name(op_);
-    }
-    bottom_data += dim_;
-    bottom_diff += dim_;
-    ++top_diff;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(ReductionLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/relu_layer.cu b/src/caffe/layers/cufiles/relu_layer.cu
deleted file mode 100644
index b8924c85..00000000
--- a/src/caffe/layers/cufiles/relu_layer.cu
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void ReLUForward(const int n, const Dtype* in, Dtype* out,
-    Dtype negative_slope) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
-  }
-}
-
-template <typename Dtype>
-void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  ReLUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data, negative_slope);
-  CUDA_POST_KERNEL_CHECK;
-  // << " count: " << count << " bottom_data: "
-  //     << (unsigned long)bottom_data
-  //     << " top_data: " << (unsigned long)top_data
-  //     << " blocks: " << CAFFE_GET_BLOCKS(count)
-  //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
-}
-
-template <typename Dtype>
-__global__ void ReLUBackward(const int n, const Dtype* in_diff,
-    const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out_diff[index] = in_diff[index] * ((in_data[index] > 0)
-        + (in_data[index] <= 0) * negative_slope);
-  }
-}
-
-template <typename Dtype>
-void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    ReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, bottom_data, bottom_diff, negative_slope);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
deleted file mode 100644
index 547fa80c..00000000
--- a/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    // First, compute the diff
-    const int count = bottom[0]->count();
-    const int num = bottom[0]->num();
-    const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
-    const Dtype* target = bottom[1]->gpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_copy(count, sigmoid_output_data, bottom_diff);
-    caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
-    // Scale down gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    caffe_gpu_scal(count, loss_weight / num, bottom_diff);
-  }
-}
-
-INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/sigmoid_layer.cu b/src/caffe/layers/cufiles/sigmoid_layer.cu
deleted file mode 100644
index e1af0657..00000000
--- a/src/caffe/layers/cufiles/sigmoid_layer.cu
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <algorithm>
-#include <cmath>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = 1. / (1. + exp(-in[index]));
-  }
-}
-
-template <typename Dtype>
-void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  SigmoidForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-  // << " count: " << count << " bottom_data: "
-  //     << (unsigned long)bottom_data
-  //     << " top_data: " << (unsigned long)top_data
-  //     << " blocks: " << CAFFE_GET_BLOCKS(count)
-  //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
-}
-
-template <typename Dtype>
-__global__ void SigmoidBackward(const int n, const Dtype* in_diff,
-    const Dtype* out_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    const Dtype sigmoid_x = out_data[index];
-    out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
-  }
-}
-
-template <typename Dtype>
-void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    SigmoidBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, top_data, bottom_diff);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SigmoidLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/silence_layer.cu b/src/caffe/layers/cufiles/silence_layer.cu
deleted file mode 100644
index 8d044ee7..00000000
--- a/src/caffe/layers/cufiles/silence_layer.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <vector>
-
-#include "caffe/common_layers.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  // Do nothing.
-}
-
-template <typename Dtype>
-void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (propagate_down[i]) {
-      caffe_gpu_set(bottom[i]->count(), Dtype(0),
-                    bottom[i]->mutable_gpu_data());
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SilenceLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/slice_layer.cu b/src/caffe/layers/cufiles/slice_layer.cu
deleted file mode 100644
index 796841d3..00000000
--- a/src/caffe/layers/cufiles/slice_layer.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void Slice(const int nthreads, const Dtype* in_data,
-    const bool forward, const int num_slices, const int slice_size,
-    const int bottom_slice_axis, const int top_slice_axis,
-    const int offset_slice_axis, Dtype* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int total_slice_size = slice_size * top_slice_axis;
-    const int slice_num = index / total_slice_size;
-    const int slice_index = index % total_slice_size;
-    const int bottom_index = slice_index +
-        (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
-    if (forward) {
-      out_data[index] = in_data[bottom_index];
-    } else {
-      out_data[bottom_index] = in_data[index];
-    }
-  }
-}
-
-template <typename Dtype>
-void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  int offset_slice_axis = 0;
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-  const bool kForward = true;
-  for (int i = 0; i < top.size(); ++i) {
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    const int top_slice_axis = top[i]->shape(slice_axis_);
-    const int top_slice_size = top_slice_axis * slice_size_;
-    const int nthreads = top_slice_size * num_slices_;
-    Slice<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, bottom_data, kForward, num_slices_, slice_size_,
-        bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data);
-    offset_slice_axis += top_slice_axis;
-  }
-}
-
-template <typename Dtype>
-void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  int offset_slice_axis = 0;
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-  const bool kForward = false;
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    const int top_slice_axis = top[i]->shape(slice_axis_);
-    const int top_slice_size = top_slice_axis * slice_size_;
-    const int nthreads = top_slice_size * num_slices_;
-    Slice<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, top_diff, kForward, num_slices_, slice_size_,
-        bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff);
-    offset_slice_axis += top_slice_axis;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SliceLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/softmax_layer.cu b/src/caffe/layers/cufiles/softmax_layer.cu
deleted file mode 100644
index 1f9c3a41..00000000
--- a/src/caffe/layers/cufiles/softmax_layer.cu
+++ /dev/null
@@ -1,149 +0,0 @@
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "thrust/device_vector.h"
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, const Dtype* data, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    Dtype maxval = -FLT_MAX;
-    for (int c = 0; c < channels; ++c) {
-      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
-    }
-    out[index] = maxval;
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_channel_subtract(const int count,
-    const int num, const int channels,
-    const int spatial_dim, const Dtype* channel_max, Dtype* data) {
-  CUDA_KERNEL_LOOP(index, count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] -= channel_max[n * spatial_dim + s];
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_exp(const int count, const Dtype* data, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, count) {
-    out[index] = exp(data[index]);
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, const Dtype* data, Dtype* channel_sum) {
-  CUDA_KERNEL_LOOP(index, num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    Dtype sum = 0;
-    for (int c = 0; c < channels; ++c) {
-      sum += data[(n * channels + c) * spatial_dim + s];
-    }
-    channel_sum[index] = sum;
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, const Dtype* channel_sum, Dtype* data) {
-  CUDA_KERNEL_LOOP(index, count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] /= channel_sum[n * spatial_dim + s];
-  }
-}
-
-template <typename Dtype>
-__global__ void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
-    Dtype* channel_dot) {
-  CUDA_KERNEL_LOOP(index, num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    Dtype dot = 0;
-    for (int c = 0; c < channels; ++c) {
-      dot += (data_1[(n * channels + c) * spatial_dim + s]
-          * data_2[(n * channels + c) * spatial_dim + s]);
-    }
-    channel_dot[index] = dot;
-  }
-}
-
-template <typename Dtype>
-void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  int count = bottom[0]->count();
-  int channels = top[0]->shape(softmax_axis_);
-  caffe_copy(count, bottom_data, top_data);
-  // We need to subtract the max to avoid numerical issues, compute the exp,
-  // and then normalize.
-  // compute max
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_max<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
-      CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data,
-      scale_data);
-  // subtract
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
-      CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
-      scale_data, top_data);
-  // exponentiate
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_exp<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, top_data, top_data);
-  // sum after exp
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_sum<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
-      CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data,
-      scale_data);
-  // divide
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(count),
-      CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
-      scale_data, top_data);
-}
-
-template <typename Dtype>
-void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  int count = top[0]->count();
-  int channels = top[0]->shape(softmax_axis_);
-  caffe_copy(count, top_diff, bottom_diff);
-  // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_dot<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
-      CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_,
-      top_diff, top_data, scale_data);
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
-      CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
-      scale_data, bottom_diff);
-  // elementwise multiplication
-  caffe_gpu_mul<Dtype>(top[0]->count(), bottom_diff, top_data, bottom_diff);
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/softmax_loss_layer.cu b/src/caffe/layers/cufiles/softmax_loss_layer.cu
deleted file mode 100644
index 7e0f3da4..00000000
--- a/src/caffe/layers/cufiles/softmax_loss_layer.cu
+++ /dev/null
@@ -1,125 +0,0 @@
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void SoftmaxLossForwardGPU(const int nthreads,
-          const Dtype* prob_data, const Dtype* label, Dtype* loss,
-          const int num, const int dim, const int spatial_dim,
-          const bool has_ignore_label_, const int ignore_label_,
-          Dtype* counts) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int n = index / spatial_dim;
-    const int s = index % spatial_dim;
-    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
-    if (has_ignore_label_ && label_value == ignore_label_) {
-      loss[index] = 0;
-      counts[index] = 0;
-    } else {
-      loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
-                      Dtype(FLT_MIN)));
-      counts[index] = 1;
-    }
-  }
-}
-
-template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
-  const Dtype* prob_data = prob_.gpu_data();
-  const Dtype* label = bottom[1]->gpu_data();
-  const int dim = prob_.count() / outer_num_;
-  const int nthreads = outer_num_ * inner_num_;
-  // Since this memory is not used for anything until it is overwritten
-  // on the backward pass, we use it here to avoid having to allocate new GPU
-  // memory to accumulate intermediate results in the kernel.
-  Dtype* loss_data = bottom[0]->mutable_gpu_diff();
-  // Similarly, this memory is never used elsewhere, and thus we can use it
-  // to avoid having to allocate additional GPU memory.
-  Dtype* counts = prob_.mutable_gpu_diff();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  SoftmaxLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
-      CAFFE_CUDA_NUM_THREADS>>>(nthreads, prob_data, label, loss_data,
-      outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-  Dtype loss;
-  caffe_gpu_asum(nthreads, loss_data, &loss);
-  if (normalize_) {
-    Dtype count;
-    caffe_gpu_asum(nthreads, counts, &count);
-    loss /= count;
-  } else {
-    loss /= outer_num_;
-  }
-  top[0]->mutable_cpu_data()[0] = loss;
-  if (top.size() == 2) {
-    top[1]->ShareData(prob_);
-  }
-}
-
-template <typename Dtype>
-__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
-          const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
-          const int spatial_dim, const bool has_ignore_label_,
-          const int ignore_label_, Dtype* counts) {
-  const int channels = dim / spatial_dim;
-
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int n = index / spatial_dim;
-    const int s = index % spatial_dim;
-    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
-
-    if (has_ignore_label_ && label_value == ignore_label_) {
-      for (int c = 0; c < channels; ++c) {
-        bottom_diff[n * dim + c * spatial_dim + s] = 0;
-      }
-      counts[index] = 0;
-    } else {
-      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
-      counts[index] = 1;
-    }
-  }
-}
-
-template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const Dtype* prob_data = prob_.gpu_data();
-    const Dtype* top_data = top[0]->gpu_data();
-    caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
-    const Dtype* label = bottom[1]->gpu_data();
-    const int dim = prob_.count() / outer_num_;
-    const int nthreads = outer_num_ * inner_num_;
-    // Since this memory is never used for anything else,
-    // we use to to avoid allocating new GPU memory.
-    Dtype* counts = prob_.mutable_gpu_diff();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
-        CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
-        outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    if (normalize_) {
-      Dtype count;
-      caffe_gpu_asum(nthreads, counts, &count);
-      caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
-    } else {
-      caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
-    }
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/split_layer.cu b/src/caffe/layers/cufiles/split_layer.cu
deleted file mode 100644
index a4f5df26..00000000
--- a/src/caffe/layers/cufiles/split_layer.cu
+++ /dev/null
@@ -1,38 +0,0 @@
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  for (int i = 0; i < top.size(); ++i) {
-    top[i]->ShareData(*bottom[0]);
-  }
-}
-
-template <typename Dtype>
-void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  if (top.size() == 1) {
-    caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
-    return;
-  }
-  caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
-                bottom[0]->mutable_gpu_diff());
-  // Add remaining top blob diffs.
-  for (int i = 2; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
-  }
-}
-
-
-INSTANTIATE_LAYER_GPU_FUNCS(SplitLayer);
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/tanh_layer.cu b/src/caffe/layers/cufiles/tanh_layer.cu
deleted file mode 100644
index ccd6e63e..00000000
--- a/src/caffe/layers/cufiles/tanh_layer.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-// TanH neuron activation function layer.
-// Adapted from ReLU layer code written by Yangqing Jia
-
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void TanHForward(const int n, const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = tanh(in[index]);
-  }
-}
-
-template <typename Dtype>
-void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  TanHForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-template <typename Dtype>
-__global__ void TanHBackward(const int n, const Dtype* in_diff,
-    const Dtype* out_data, Dtype* out_diff) {
-  CUDA_KERNEL_LOOP(index, n) {
-    Dtype tanhx = out_data[index];
-    out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
-  }
-}
-
-template <typename Dtype>
-void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    TanHBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, top_diff, top_data, bottom_diff);
-    CUDA_POST_KERNEL_CHECK;
-  }
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(TanHLayer);
-
-
-}  // namespace caffe
diff --git a/src/caffe/layers/cufiles/threshold_layer.cu b/src/caffe/layers/cufiles/threshold_layer.cu
deleted file mode 100644
index bfa7f159..00000000
--- a/src/caffe/layers/cufiles/threshold_layer.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-__global__ void ThresholdForward(const int n, const Dtype threshold,
-    const Dtype* in, Dtype* out) {
-  CUDA_KERNEL_LOOP(index, n) {
-    out[index] = in[index] > threshold ? 1 : 0;
-  }
-}
-
-template <typename Dtype>
-void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  ThresholdForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-      count, threshold_, bottom_data, top_data);
-  CUDA_POST_KERNEL_CHECK;
-}
-
-
-INSTANTIATE_LAYER_GPU_FORWARD(ThresholdLayer);
-
-
-}  // namespace caffe

From 5c66e9b7eb4da81160ee25b94dcd3b6c89a5d1f8 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Wed, 2 Sep 2015 06:12:20 +0800
Subject: [PATCH 051/124] Removed forward_opt and backward_opt functions in
 conv layer

---
 include/caffe/vision_layers.hpp      |   8 --
 src/caffe/layers/base_conv_layer.cpp | 137 ---------------------------
 src/caffe/layers/conv_layer.cpp      |  13 ---
 3 files changed, 158 deletions(-)

diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 2f2d7eef..3ee5a779 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -140,10 +140,6 @@ class BaseConvolutionLayer : public Layer<Dtype> {
 
 //opencl related data structures
 protected:
-  void forward_gpu_opt(const vector<Blob<Dtype>*>& bottom, const Dtype* weight, 
-      const vector<Blob<Dtype>*>& top,  bool skip_im2col = false) ;
-  void backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   int opt_num2;
   int M_, N_, K_;
   int weight_offset_;
@@ -223,12 +219,8 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
       const vector<Blob<Dtype>*>& top);
   virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
   virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 19458185..fc541ef9 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -448,143 +448,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
           bias, (size_t)0, 1);
 }
 
-
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::forward_gpu_opt(const vector<Blob<Dtype>*>& bottom, const Dtype* weight, const vector<Blob<Dtype>*>& top, bool skip_im2col){
-
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-  int M_org = M_ * group_;
-  int col_offset = K_ * N_;
-  int top_offset = M_ * N_;
-  int weight_offset = M_ * K_;
-  int opt_num2 = global_packing_N;
-  cl_command_queue Queue;
-  cl_event prof_event;
-  for (int n = 0; n < num_; n += opt_num2) {
-    opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
-    top_offset = M_ * N_ * opt_num2;
-    col_offset = K_ * N_ * opt_num2;
-    im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
-                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
-
-#ifdef multiQ
-    for (int g = 0; g < group_; ++g) {
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
-       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-          (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset * g);
-       }
-     if(group_ == 2){
-       clFinish(amdDevice.CommandQueue);
-       clFinish(amdDevice.CommandQueue_helper);
-     }
-#else
-    Queue = amdDevice.CommandQueue;
-    for (int g = 0; g < group_; ++g) {
-       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-          (Dtype)1., weight, weight_offset * g, (Dtype*)transMem, col_offset * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset * g);
-       }
-#endif
-    transform_gpu((Dtype*)subTopMem, top_data, top[i]->offset(n), N_, M_org, opt_num2);
-
-   for (int z = 0; z < opt_num2; z++)
-      if (bias_term_) {
-      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
-          N_, 1, (Dtype)1., this->blobs_[1]->gpu_data(), 0,
-          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-          (Dtype)1., top_data, top[i]->offset(n) + num_output_ * N_ * z);
-    }
-  }
-}
-}
-
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
-    for (int n = 0; n < num_; ++n) {
-      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_,
-          (Dtype)1., top_diff, top[i]->offset(n), N_,
-          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
-          bias_diff, (size_t)0, 1);
-     }
-   }
-
- if (this->param_propagate_down_[0] || propagate_down[i]) {
-  const Dtype* bottom_data = bottom[i]->gpu_data();
-  Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-  int col_offset = K_ * N_;
-  int top_offset = M_ * N_;
-  int weight_offset = M_ * K_;
-  int opt_num2 = global_packing_N;
-  int g = 0;
-  cl_command_queue Queue;
-  cl_event prof_event;
-
-  for (int n = 0; n < num_; n += opt_num2) {
-    opt_num2 = opt_num2 > (num_ - n)? (num_ - n) : opt_num2;
-    top_offset = M_ * (N_ * opt_num2);
-    col_offset = K_ * (N_ * opt_num2);
-    im2col_gpu_opt(bottom_data, bottom[i]->offset(n), channels_, height_,
-                       width_, kernel_w_, pad_w_, stride_w_, (Dtype*)transMem, 0, opt_num2);
-
-    int height_top = M_ * group_, width_top = N_;
-    opttrans(top_diff, top[i]->offset(n), 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
-
-    for(g = 0; g < group_; ++g) {
-#ifdef multiQ
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
-#else
-       Queue =  amdDevice.CommandQueue;
-#endif
-       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
-        (Dtype)1., (Dtype*)subTopMem, top_offset * g,
-        (Dtype*)transMem, col_offset * g, (Dtype)1.,
-        (Dtype*)weight_diff, weight_offset * g);
-    }
-
-   if (propagate_down[i]) {
-      for (g = 0; g < group_; ++g) {
-#ifdef multiQ
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
-#else
-       Queue =  amdDevice.CommandQueue;
-#endif
-       prof_event =  caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, K_, N_*opt_num2, M_,
-          (Dtype)1., weight,  weight_offset * g,
-          (Dtype*)subTopMem, top_offset * g,
-          (Dtype)0., (Dtype*)transMem, col_offset * g);
-      }
-    }
-
-#ifdef multiQ
-   if(group_ ==2){
-      clFinish(amdDevice.CommandQueue);
-      clFinish(amdDevice.CommandQueue_helper);
-    }
-#endif
-       col2im_gpu_opt((Dtype*)transMem, 0, channels_, height_, width_, kernel_w_, pad_w_,
-                  stride_w_, bottom_diff, bottom[i]->offset(n), opt_num2);
-   }
-  }
- }
-}
-
 #endif  // !CPU_ONLY
 
 INSTANTIATE_CLASS(BaseConvolutionLayer);
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 0c3a1367..c5bdb02c 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -92,13 +92,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 
 }
 
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Forward_gpu_opt(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  this->forward_gpu_opt(bottom, weight, top);
-}
-
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -150,12 +143,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
     }
   }
 
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Backward_gpu_opt(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-      this->backward_gpu_opt(top, propagate_down, bottom);
-}
-
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

From 7474975289f6146c1da7dcd18f184f6cad9638dd Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Fri, 4 Sep 2015 03:03:00 +0800
Subject: [PATCH 052/124] fixed merge conflicts

---
 src/caffe/layers/conv_layer.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index c5bdb02c..c829dbd7 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -66,13 +66,14 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       }
     }
   }
+
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const  vector<Blob<Dtype>*>& top) {
   if (use_packing_scheme && global_packing_N >1)
-      Forward_gpu_opt(bottom, top);
+   Forward_gpu_opt2(bottom, top);
   else
    Forward_gpu_org(bottom, top);
 }
@@ -84,12 +85,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       Backward_gpu_opt2(top, propagate_down, bottom);
     else
       Backward_gpu_org(top, propagate_down, bottom);
-//  CHECK_GLOBAL_MEM_DATA(weight_diff, this->blobs_[0]->count(), 20, "weight_diff");
- // CHECK_GLOBAL_MEM_DATA(bottom[0]->mutable_gpu_diff(), bottom[0]->count(), 20, "bottom_diff");
-//  CHECK_GLOBAL_MEM_DATA(top[0]->gpu_diff(), top[0]->count(), 20, "top_diff");
- // CHECK_BLOB_DATA(bottom[0], 20, "bottom[0]");
-
-
 }
 
 template <typename Dtype>
@@ -118,6 +113,10 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& botto
       }
    }
   }
+
+  //CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+  //CHECK_BLOB_DATA(top[0],20, "top[0]");
+
 }
 
 template <typename Dtype>
@@ -143,6 +142,10 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom
     }
   }
 
+  // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+  //CHECK_BLOB_DATA(top[0],20, "top[0]");
+}
+
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
@@ -184,8 +187,8 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
       }
     }
   }
-}
 
+}
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

From 8469f863792f53bfb0e479be85a87b6ae1d19b5f Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Fri, 4 Sep 2015 13:56:34 +0800
Subject: [PATCH 053/124] clean up warining info

---
 src/caffe/device.cpp                 |  19 +----
 src/caffe/layers/base_conv_layer.cpp |  10 +--
 src/caffe/solver.cpp                 |   7 ++
 src/caffe/util/im2col.cpp            |  22 ++----
 src/caffe/util/math_functions.cpp    |   2 +
 src/caffe/util/ocl_util.cpp          |  13 ++-
 src/caffe/util/ocl_wrapper.cpp       | 113 ++++++++++-----------------
 7 files changed, 68 insertions(+), 118 deletions(-)

diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index dc47e907..86e63e45 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -33,17 +33,12 @@
 #include <dirent.h>
 
 namespace caffe {
-//delete it after test, Yibing
-cl_mem test_alloc_mem[10];
-extern long long unsigned device_mem_consumption;
-
 char* buildOption = "-x clc++ ";
 //char* buildOption = "-x clc++, -hsail-reg-slots=8-Wb, -hsail-reg32-pressure-limit=64-Wb, -hsail-reg64-pressure-limit=64";
 std::string oclKernelPath = "./src/caffe/ocl/";
 Device amdDevice;
 
 Device::~Device(){
-    //clAmdBlasTeardown(); 
     ReleaseKernels(); 
     free((void*)platformIDs);
     free(DeviceIDs);
@@ -57,7 +52,6 @@ Device::~Device(){
 
 cl_int Device::Init(int deviceId){
 
-    //Get Platform Infomation
     DisplayPlatformInfo();
   
     clGetPlatformIDs(0, NULL, &numPlatforms);
@@ -67,7 +61,7 @@ cl_int Device::Init(int deviceId){
     size_t nameLen;
     cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen);
     if(res != CL_SUCCESS){
-        fprintf(stderr, "Err: Failed to Get Platform Info\n", res);
+        fprintf(stderr, "Err: Failed to Get Platform Info\n");
         return 0;
     }
     platformName[nameLen] = 0;
@@ -106,20 +100,17 @@ cl_int Device::Init(int deviceId){
         }
    }
 
-    //Create Context
     Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
     if(NULL == Context){
         fprintf(stderr,"Err: Failed to Create Context\n");
         return 0;
     }
-    //Create CommandQueue
     CommandQueue = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
     CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
     if(NULL == CommandQueue || NULL == CommandQueue_helper){
         fprintf(stderr,"Err: Failed to Create Commandqueue\n");
         return 0;
     }
-    //BuildProgram from OpenCL kernel files
     BuildProgram(oclKernelPath);
     row = clblasRowMajor;
     col = clblasColumnMajor;
@@ -128,7 +119,6 @@ cl_int Device::Init(int deviceId){
 
 void Device::BuildProgram(std::string kernel_dir)
 { 
-  //Access opencl kernel files
     std::string strSource = "";
     DIR *ocl_dir;
     struct dirent *dirp;
@@ -159,7 +149,6 @@ void Device::BuildProgram(std::string kernel_dir)
     if(NULL == Program){
         fprintf(stderr,"Err: Failed to create program\n");
     }
-    //Build Program
     cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL);
     LOG(INFO) << "Build Program";
     if(CL_SUCCESS != iStatus){
@@ -169,8 +158,6 @@ void Device::BuildProgram(std::string kernel_dir)
         std::cout << szBuildLog;
         clReleaseProgram(Program);
     }
-
-  // return Program;
 }
 
 //Use to read OpenCL source code
@@ -225,7 +212,6 @@ void Device::ReleaseKernels()
 
 void Device::DisplayPlatformInfo(){
    cl_int err;
-   size_t size;
 
    err = clGetPlatformIDs (0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <=0)
@@ -323,7 +309,6 @@ void Device::GetDeviceInfo(){
 
 void Device::DeviceQuery()
 {
-    //Get Platform Infomation
     DisplayPlatformInfo();
 
     clGetPlatformIDs(0, NULL, &numPlatforms);
@@ -333,7 +318,7 @@ void Device::DeviceQuery()
     size_t nameLen;
     cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen);
     if (res != CL_SUCCESS) {
-        fprintf(stderr, "Err: Failed to Get Platform Info\n", res);
+        fprintf(stderr, "Err: Failed to Get Platform Info\n");
         return;
     }
     platformName[nameLen] = 0;
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index fc541ef9..26787393 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -292,7 +292,6 @@ template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
     const Dtype* weight, Dtype* output, bool skip_im2col) {
   cl_command_queue Queue;
-  cl_event prof_event;
   if (!is_1x1_) {
     if (!skip_im2col) {
       conv_im2col_gpu_opt(input);
@@ -302,7 +301,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
     for (int g = 0; g < group_; ++g) {
        if(g == 0) Queue = amdDevice.CommandQueue;
        else Queue =  amdDevice.CommandQueue_helper;
-       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
           (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
        }
@@ -313,12 +312,11 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
 #else
     Queue = amdDevice.CommandQueue;
     for (int g = 0; g < group_; ++g) {
-       prof_event = caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
           (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
           (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
        }
 #endif
-   //conv_transform_gpu((Dtype*)subTopMem, output);
    transform_gpu((Dtype*)subTopMem, output, top_offset_, N_, M_*group_, opt_num2);
 }
 
@@ -414,9 +412,7 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
   if (!is_1x1_) {
     conv_im2col_gpu_opt(input);
   }
-    //conv_transpose_gpu(output);
-    int height_top = M_ * group_, width_top = N_;
-    opttrans(output, top_offset_, 1, height_top, width_top, (Dtype*)subTopMem, 0, opt_num2);
+    opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
 
 
   for (int g = 0; g < group_; ++g) {
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index f4b57a41..dde98baf 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -209,6 +209,13 @@ void Solver<Dtype>::Step(int iters) {
             blob->mutable_gpu_diff());
 #else
         NO_GPU;
+#endif
+      case Caffe::APU:
+#ifndef CPU_ONLY
+        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_gpu_diff());
+#else
+        NO_GPU;
 #endif
         break;
       }
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 29c6c1f9..089023b7 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -63,7 +63,6 @@ void im2col_cpu(const Dtype* data_im, const int channels,
   }
 }
 
-// Explicit instantiation
 template void im2col_cpu<float>(const float* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
@@ -99,7 +98,6 @@ void col2im_cpu(const Dtype* data_col, const int channels,
   }
 }
 
-// Explicit instantiation
 template void col2im_cpu<float>(const float* data_col, const int channels,
     const int height, const int width, const int patch_h, const int patch_w,
     const int pad_h, const int pad_w, const int stride_h,
@@ -137,7 +135,7 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int chann
     ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
@@ -149,7 +147,6 @@ template void col2im_gpu_opt<double>(const double* data_col, const int col_offse
     const int height, const int width, const int ksize, const int pad,
     const int stride, double* data_im, const int img_offset, int optnum);
 
-//cannot use now, need to modify kernel.
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, 
     const int height, const int width, const int kernel_h, const int kernel_w,
@@ -182,10 +179,9 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
     ret|=clSetKernelArg(Kernel,13,sizeof(cl_mem),(void*)&data_col);
     ret|=clSetKernelArg(Kernel,14,sizeof(cl_int),(void*)&col_offset);
 
-    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
-    clFinish(amdDevice.CommandQueue);
 
 }
 
@@ -198,7 +194,6 @@ template void im2col_gpu<double>(const double* data_im, const int img_offset, co
     				const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     				double* data_col, const int col_offset);
 
-//cannot use now, need to modify kernel
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset,
     const int height, const int width, const int channels,
@@ -232,7 +227,7 @@ void col2im_gpu(const Dtype* data_col, const int col_offset,
     ret|=clSetKernelArg(Kernel,14,sizeof(cl_mem),(void*)&data_im);
     ret|=clSetKernelArg(Kernel,15,sizeof(cl_int),(void*)&img_offset);
 
-    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
@@ -270,7 +265,7 @@ void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, co
     ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_col);
     ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&col_offset);
 
-    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
     clFinish(amdDevice.CommandQueue);
@@ -312,8 +307,8 @@ void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channe
     ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {num_kernels};
-    size_t uiLocal_Work_Size[] = {256 - 256 % width_col};
+    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
+    size_t uiLocal_Work_Size[] = {(size_t)(256 - 256 % width_col)};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
 
@@ -334,9 +329,6 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
     int height_col = (height + 2 * pad - ksize) / stride + 1;
     int width_col = (width + 2 * pad - ksize) / stride + 1;
     int num_kernels = channels * height * width;
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  // NOLINT_NEXT_LINE(whitespace/operatiors)
 
     cl_int ret;
     ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
@@ -354,7 +346,7 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
     ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index d48ec01a..63b449da 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -720,11 +720,13 @@ void popcll_kernel(const int n, const double* a,
 template <>
 uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
                                   const float* y) {
+	return 0;
 }
 
 template <>
 uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
                                    const double* y) {
+	return 0;
 }
 
 void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index e4fd42c6..7f9631e2 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -44,39 +44,36 @@ void ocl_memset(Dtype* buffer, const Dtype value, const int count){
     err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count);
     OCL_CHECK(err);
  
-    size_t Global_Work_Size[1] = {count};
+    size_t Global_Work_Size[1] = {(size_t)count};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 
-// Explicit instantiation
 template void ocl_memset<int>(int* buffer, const int value, const int count);
 template void ocl_memset<float>(float* buffer, const float value, const int count);
 template void ocl_memset<double>(double* buffer, const double value, const int count);
 
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){
-    cl_int err=0;
+    cl_int err;
     err =clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer);
     err|=clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&value);
     err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count);
     OCL_CHECK(err);
 
-    size_t Global_Work_Size[] = {count};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 
 void eventCallback(cl_event event, cl_int event_status, void* user_data){
-    printf("The calling\n");
-    int err = 0;
     cl_ulong ev_start_time = (cl_ulong)0;
     cl_ulong ev_end_time = (cl_ulong)0;
     double run_time;
-    err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL);
-    err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL);
+    OCL_CHECK( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL) );
+    OCL_CHECK( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL) );
     run_time = (double)(ev_end_time - ev_start_time);
     printf("The kernel's running time is %f s\n", run_time * 1.0e-9);
 }
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 9eab08ec..6294cce3 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -35,21 +35,6 @@
 namespace caffe {
 typedef unsigned int uint32_t;
 struct array4x32 {  uint32_t v[4]; };
-/*
-template <typename dtype> inline std::string get_dtype_suffix()
-{
-    dtype x;
-    const char type = typeid(x).name()[0];
-    std::string suffix;
-    switch(type){
-        case 'i': suffix = "_int"; break;
-        case 'd': suffix = "_double"; break;
-        case 'f': 
-        default: suffix = "_float";
-    }
-    return suffix;
-}
-*/
 template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold)
 {
@@ -87,19 +72,14 @@ void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, c
 
     cl_int ret;
     ret= clSetKernelArg(Kernel,0,sizeof(cl_mem),(void*)&src);
-    OCL_CHECK(ret);
     ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&dst);
-    OCL_CHECK(ret);
     ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&top_offset);
-    OCL_CHECK(ret);
     ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&N_);
-    OCL_CHECK(ret);
     ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&M_);
-    OCL_CHECK(ret);
     ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&packing_num);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size2[]={M_ * packing_num};
+    size_t uiGlobal_Work_Size2[]={(size_t)(M_ * packing_num)};
     size_t uiLocal_Work_Size2[]={256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL) );
 }
@@ -114,12 +94,11 @@ void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bo
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data) );
     OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&scale_data) );
  
-    size_t Global_Work_Size[1] = {num};
+    size_t Global_Work_Size[1] = {(size_t)num};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-// Explicit instantiation
 template void get_max_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* bottom_data, float* scale_data);
 template void get_max_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* bottom_data, double* scale_data);
 
@@ -130,12 +109,11 @@ void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out){
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
 
-    size_t Global_Work_Size[1] = {num};
+    size_t Global_Work_Size[1] = {(size_t)num};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-// Explicit instantiation
 template void exp_gpu<float>(cl_kernel Kernel, const int num, const float* data, float* out);
 template void exp_gpu<double>(cl_kernel Kernel, const int num, const double* data, double* out);
 
@@ -146,12 +124,11 @@ void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&scale) );
     OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
 
-    size_t Global_Work_Size[1] = {num*dim};
+    size_t Global_Work_Size[1] = {(size_t) (num * dim)};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-// Explicit instantiation
 template void softmax_div_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* scale, float* data);
 template void softmax_div_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* scale, double* data);
 
@@ -175,7 +152,6 @@ Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* p
     return loss;
 }
 
-// Explicit instantiation
 template float softmax_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss);
 template double softmax_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss);
 
@@ -192,7 +168,7 @@ void kernel_channel_max(const int num, const int channels,
     OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
     OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&out) );
 
-    size_t Global_Work_Size[1] = {num*spatial_dim};
+    size_t Global_Work_Size[1] = {(size_t) (num*spatial_dim)};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
@@ -217,7 +193,7 @@ void kernel_channel_subtract( const int count,
     OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_max) );
     OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) );
 
-    size_t Global_Work_Size[1] = {count};
+    size_t Global_Work_Size[1] = {(size_t)count};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
@@ -239,7 +215,7 @@ void kernel_exp(const int count, const Dtype* data, Dtype* out)
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
 
-    size_t Global_Work_Size[1] = {count};
+    size_t Global_Work_Size[1] = {(size_t)count};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
@@ -260,7 +236,7 @@ void kernel_channel_sum(const int num, const int channels,
     OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
     OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) );
 
-    size_t Global_Work_Size[1] = {num*channels};
+    size_t Global_Work_Size[1] = {(size_t)(num*channels)};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
@@ -282,7 +258,7 @@ void kernel_channel_div(const int count, const int num, const int channels,
     OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) );
     OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) );
 
-    size_t Global_Work_Size[1] = {count};
+    size_t Global_Work_Size[1] = {(size_t)count};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
@@ -307,7 +283,7 @@ void kernel_channel_dot(const int num, const int channels,
     OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&data_2) );
     OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&channel_dot) );
       
-    size_t Global_Work_Size[1] = {num*spatial_dim};
+    size_t Global_Work_Size[1] = {(size_t)(num*spatial_dim)};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
@@ -339,7 +315,7 @@ void SoftmaxLossForwardGPU(const int nthreads,
     OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int),  (void*)&ignore_label_));
     OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem),  (void*)&counts));
     
-   size_t Global_Work_Size[1] = {nthreads};
+   size_t Global_Work_Size[1] = {(size_t)nthreads};
    size_t Local_Work_Size[1] = {256};
    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -369,7 +345,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
     OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int),  (void*)&ignore_label_));
     OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem),  (void*)&counts));
 
-   size_t Global_Work_Size[1] = {nthreads};
+   size_t Global_Work_Size[1] = {(size_t)nthreads};
    size_t Local_Work_Size[1] = {256};
    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -385,12 +361,11 @@ void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data){
     OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha) );
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) );
 
-    size_t Global_Work_Size[1] = {num};
+    size_t Global_Work_Size[1] = {(size_t)num};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-// Explicit instantiation
 template void scal_gpu<float>(cl_kernel Kernel, const int num, const float alpha, float* data);
 template void scal_gpu<double>(cl_kernel Kernel, const int num, const double alpha, double* data);
 
@@ -401,12 +376,11 @@ void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, const Dtype
     OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) );
     OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&label) );
 
-    size_t Global_Work_Size[1] = {num};
+    size_t Global_Work_Size[1] = {(size_t)num};
     size_t Local_Work_Size[1] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
 }
 
-// Explicit instantiation
 template void diff_gpu<float>(cl_kernel Kernel, const int num, const int dim, float* data, const float* label);
 template void diff_gpu<double>(cl_kernel Kernel, const int num, const int dim, double* data, const double* label);
 
@@ -426,7 +400,7 @@ void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data
     ret |= clSetKernelArg(Kernel,10, sizeof(cl_mem), (void*)&top_data);
     OCL_CHECK(ret);
 
-    size_t Global_Work_Size[] = {count * 1};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -459,7 +433,7 @@ void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum,
     ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*)&top_mask);
     OCL_CHECK(ret);
 
-    size_t Global_Work_Size[] = {count * 1};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -489,7 +463,7 @@ void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int cl
     ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*)&top_data);
     OCL_CHECK(ret);
 
-    size_t Global_Work_Size[] = {count * 1};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -517,7 +491,7 @@ void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int cln
     ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&top_data);
     OCL_CHECK(ret);
 
-    size_t Global_Work_Size[] = {count * 1};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
@@ -547,7 +521,7 @@ void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
     ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {count * 1};
+    size_t uiGlobal_Work_Size[] = {(size_t)count};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
@@ -571,7 +545,7 @@ void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data
     ret |= clSetKernelArg(Kernel, 11,sizeof(cl_mem), (void*)&top_data);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {count * 1};
+    size_t uiGlobal_Work_Size[] = {(size_t)count};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
@@ -597,7 +571,7 @@ void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data
     ret |= clSetKernelArg(Kernel,12, sizeof(cl_mem), (void*)&bottom_diff);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {count};
+    size_t uiGlobal_Work_Size[] = {(size_t)count};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
@@ -629,7 +603,7 @@ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int*
     ret |= clSetKernelArg(Kernel,16, sizeof(cl_mem), (void*)&bottom_diff);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {nthreads};
+    size_t uiGlobal_Work_Size[] = {(size_t)nthreads};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
@@ -661,7 +635,7 @@ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int
     ret |= clSetKernelArg(Kernel,14, sizeof(cl_mem), (void*)&bottom_diff);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {nthreads};
+    size_t uiGlobal_Work_Size[] = {(size_t)nthreads};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
@@ -689,7 +663,7 @@ void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtyp
     ret |= clSetKernelArg(Kernel,13, sizeof(cl_mem), (void*)&bottom_diff);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {nthreads};
+    size_t uiGlobal_Work_Size[] = {(size_t)nthreads};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
@@ -713,7 +687,7 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, c
     ret |= clSetKernelArg(Kernel,11, sizeof(cl_mem), (void*)&bottom_diff);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[]={count};
+    size_t uiGlobal_Work_Size[]={(size_t)count};
     size_t uiLocal_Work_Size[]={256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL));
 }
@@ -734,7 +708,7 @@ void PReLUForward(const int count, const int channels, const int dim, const Dtyp
     ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&top_data);
     ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&slope_data);
     ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&div_factor);
-    size_t Global_Work_Size[] = {count * 1};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -754,7 +728,7 @@ void PReLUBackward(const int count, const int channels, const int dim, const Dty
     ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff);
     ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*)&slope_data);
     ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&div_factor);
-    size_t Global_Work_Size[] = {count * 1};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -770,7 +744,7 @@ void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bot
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data);
     ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
-    size_t Global_Work_Size[] = {count * 1};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -788,7 +762,7 @@ void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dty
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
     ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*)&negative_slope);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {count * 1};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -809,7 +783,7 @@ void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_da
     ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {count};
+    size_t uiGlobal_Work_Size[] = {(size_t)count};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
@@ -823,9 +797,6 @@ void opttrans(const Dtype* data_im, const int im_offset, const int channels,
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
     int num_kernels = channels * height * width * optnum;
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  // NOLINT_NEXT_LINE(whitespace/operatiors)
 
     cl_int ret;
     ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
@@ -839,7 +810,7 @@ void opttrans(const Dtype* data_im, const int im_offset, const int channels,
     ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum);
     OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {num_kernels};
+    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
     size_t uiLocal_Work_Size[] = {256};
     OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
 }
@@ -866,7 +837,7 @@ void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in
   ret|=clSetKernelArg(LFSkernel,8,sizeof(cl_float),(void*)&k);
   ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale);
   OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size[]={nthreads};
+  size_t uiGlobal_Work_Size[]={(size_t)nthreads};
   size_t uiLocal_Work_Size[]={256};
   OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) );
 }
@@ -889,7 +860,7 @@ void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in,
   ret|=clSetKernelArg(LCOkernel,3,sizeof(cl_float),(void*)&negative_beta);
   ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out);
   OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size2[]={nthreads};
+  size_t uiGlobal_Work_Size2[]={(size_t)nthreads};
   size_t uiLocal_Work_Size2[]={256};
   OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) );
 }
@@ -920,7 +891,7 @@ void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
   ret|=clSetKernelArg(LCDkernel,11,sizeof(cl_float),(void*)&cache_ratio);
   ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff);
   OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size[]={nthreads};
+  size_t uiGlobal_Work_Size[]={(size_t)nthreads};
   size_t uiLocal_Work_Size[]={256};
   OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) );
 }
@@ -945,7 +916,7 @@ void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype*
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&in2);
     ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {n};
+    size_t Global_Work_Size[] = {(size_t)n};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -960,7 +931,7 @@ void caffe_gpu_sign(cl_kernel Kernel,const int N,  const Dtype* X, Dtype * Y ){
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {N};
+    size_t Global_Work_Size[] = {(size_t)N};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -978,7 +949,7 @@ void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
     ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {n};
+    size_t Global_Work_Size[] = {(size_t)n};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -995,7 +966,7 @@ void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_float), (void*)&alpha);
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {n};
+    size_t Global_Work_Size[] = {(size_t)n};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -1014,7 +985,7 @@ void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
     ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {n};
+    size_t Global_Work_Size[] = {(size_t)n};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -1032,7 +1003,7 @@ void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){
     ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha);
     ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
     OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {n};
+    size_t Global_Work_Size[] = {(size_t)n};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -1054,7 +1025,7 @@ void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMe
     ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data);
     OCL_CHECK(ret);
 
-    size_t Global_Work_Size[] = {count};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
@@ -1077,7 +1048,7 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
     ret |= clSetKernelArg(kernel,5,sizeof(cl_mem),  (void*)&bottom_diff);
     OCL_CHECK(ret);
 
-    size_t Global_Work_Size[] = {count};
+    size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }

From 9cf71bbc7505cb794dadde0d5bd6e47399c421ec Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Fri, 4 Sep 2015 15:58:04 +0800
Subject: [PATCH 054/124] Remove the annotation code

---
 include/caffe/util/device_alternate.hpp | 59 -----------------------
 include/caffe/util/math_functions.hpp   |  3 --
 include/caffe/util/ocl_wrapper.hpp      |  2 -
 src/caffe/common.cpp                    | 40 +---------------
 src/caffe/device.cpp                    |  1 -
 src/caffe/layers/base_data_layer.cpp    | 15 ------
 src/caffe/layers/dropout_layer.cpp      | 13 -----
 src/caffe/layers/pooling_layer.cpp      | 13 -----
 src/caffe/layers/relu_layer.cpp         | 13 -----
 src/caffe/layers/window_data_layer.cpp  | 33 -------------
 src/caffe/net.cpp                       |  1 -
 src/caffe/solver.cpp                    | 16 -------
 src/caffe/util/benchmark.cpp            | 14 ------
 src/caffe/util/im2col.cu                | 10 ----
 src/caffe/util/math_functions.cpp       |  2 -
 src/caffe/util/ocl_wrapper.cpp          | 63 -------------------------
 16 files changed, 1 insertion(+), 297 deletions(-)

diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp
index 9184f4f9..bf5d7705 100644
--- a/include/caffe/util/device_alternate.hpp
+++ b/include/caffe/util/device_alternate.hpp
@@ -31,70 +31,11 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
 
 #else  // Normal GPU + CPU Caffe.
 
-//#include <cublas_v2.h>
-//#include <cuda.h>
-//#include <cuda_runtime.h>
-//#include <curand.h>
-//#include <driver_types.h>  // cuda driver types
 #ifdef USE_CUDNN  // cuDNN acceleration library.
 #include "caffe/util/cudnn.hpp"
 #endif
 
-//
-// CUDA macros
-//
-
-// CUDA: various checks for different function calls.
-/*
-#define CUDA_CHECK(condition) \
-  do { \
-    cudaError_t error = condition; \
-    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
-  } while (0)
-
-#define CUBLAS_CHECK(condition) \
-  do { \
-    cublasStatus_t status = condition; \
-    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
-      << caffe::cublasGetErrorString(status); \
-  } while (0)
-
-#define CURAND_CHECK(condition) \
-  do { \
-    curandStatus_t status = condition; \
-    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
-      << caffe::curandGetErrorString(status); \
-  } while (0)
-
-// CUDA: grid stride looping
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-       i < (n); \
-       i += blockDim.x * gridDim.x)
-
-// CUDA: check for error after kernel execution and exit loudly if there is one.
-#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())
-*/
 namespace caffe {
-
-// CUDA: library error reporting.
-//const char* cublasGetErrorString(cublasStatus_t error);
-//const char* curandGetErrorString(curandStatus_t error);
-
-// CUDA: thread number configuration.
-// Use 1024 threads per block, which requires cuda sm_2x or above,
-// or fall back to attempt compatibility (best of luck to you).
-#if __CUDA_ARCH__ >= 200
-    const int CAFFE_CUDA_NUM_THREADS = 1024;
-#else
-    const int CAFFE_CUDA_NUM_THREADS = 512;
-#endif
-
-// CUDA: number of blocks for threads.
-inline int CAFFE_GET_BLOCKS(const int N) {
-  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
-}
-
 }  // namespace caffe
 
 #endif  // CPU_ONLY
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 381dd8fd..46949ff3 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -160,9 +160,6 @@ void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 template <typename Dtype>
 void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-//template <typename Dtype>
-//void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y, Dtype* scratch_buf);
-//CUDA version, need to be deleted
 template <typename Dtype>
 void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 223e3278..db19e1b2 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -30,8 +30,6 @@
 namespace caffe {
 
 typedef unsigned int uint32_t;
-//template <typename Dtype>
-//void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
 
 template <typename dtype> inline std::string get_dtype_suffix()
 {
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 83afe272..3e4e0dc0 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -12,7 +12,6 @@ shared_ptr<Caffe> Caffe::singleton_;
 // random seeding
 int64_t cluster_seedgen(void) {
  //To fix: for now we use fixed seed to get same result each time
-  /*
   int64_t s, seed, pid;
   FILE* f = fopen("/dev/urandom", "rb");
   if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) {
@@ -28,8 +27,7 @@ int64_t cluster_seedgen(void) {
   pid = getpid();
   s = time(NULL);
   seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729);
-  return seed;
-  */
+  //return seed;
   LOG(WARNING) << "return fixed seed 37";
   return 37;
 }
@@ -93,21 +91,6 @@ void* Caffe::RNG::generator() {
 
 Caffe::Caffe()
 {
-/*    : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(),
-    mode_(Caffe::CPU) {
-  // Try to create a cublas handler, and report an error if failed (but we will
-  // keep the program running as one might just want to run CPU code).
-  if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
-    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
-  }
-  // Try to create a curand handler.
-  if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
-      != CURAND_STATUS_SUCCESS ||
-      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
-      != CURAND_STATUS_SUCCESS) {
-    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
-  }
-*/
 #ifndef CPU_ONLY
    cl_int err =  clblasSetup();
    if(err != CL_SUCCESS){
@@ -117,33 +100,12 @@ Caffe::Caffe()
 }
 
 Caffe::~Caffe() {
- /* if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
-  if (curand_generator_) {
-    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
-  }
-*/
 #ifndef CPU_ONLY
    clblasTeardown();
 #endif
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
-  // Curand seed
- /* static bool g_curand_availability_logged = false;
-  if (Get().curand_generator_) {
-    CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(),
-        seed));
-    CURAND_CHECK(curandSetGeneratorOffset(curand_generator(), 0));
-  } else {
-    if (!g_curand_availability_logged) {
-        LOG(ERROR) <<
-            "Curand not available. Skipping setting the curand seed.";
-        g_curand_availability_logged = true;
-    }
-  }
-  // RNG seed
-  Get().random_generator_.reset(new RNG(seed));
-*/
 }
 
 void Caffe::SetDevice(const int device_id) {
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 86e63e45..df2de2e0 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -34,7 +34,6 @@
 
 namespace caffe {
 char* buildOption = "-x clc++ ";
-//char* buildOption = "-x clc++, -hsail-reg-slots=8-Wb, -hsail-reg32-pressure-limit=64-Wb, -hsail-reg64-pressure-limit=64";
 std::string oclKernelPath = "./src/caffe/ocl/";
 Device amdDevice;
 
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 71f5c132..5ba0f2e5 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -87,8 +87,6 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bo
   
   JoinPrefetchThread();
   DLOG(INFO) << "Thread joined";
-  // Copy the data from prefetch thread to data_layer
-   //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_data_->gpu_data(), (cl_mem) (*top)[0]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_data_->count(), 0, NULL, NULL) );
   
    top[0]->ReshapeLike(this->prefetch_data_);
    OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) );
@@ -97,29 +95,16 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bo
        // Reshape to loaded labels.
    top[1]->ReshapeLike(prefetch_label_);
    OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) );
-   //OCL_CHECK( clEnqueueCopyBuffer (amdDevice.CommandQueue, (cl_mem) prefetch_label_->gpu_data(), (cl_mem) (*top)[1]->mutable_gpu_data(), 0, 0, sizeof(Dtype)*prefetch_label_->count(), 0, NULL, NULL) );
    }
   
-//  clFinish(amdDevice.CommandQueue);
-
 #ifdef Track_data_transfer
 #endif
   
-//  CHECK_BLOB_DATA(top[0], 20, "top[0]");  
-
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
   CreatePrefetchThread();
-  //return Dtype(0.);
 }
 
-/*template <typename Dtype>
-void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-}*/
-
-
-
 #ifdef CPU_ONLY
 STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward);
 #endif
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index dfd6560d..9f630e8d 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -95,15 +95,9 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
     DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #else
-//    caffe_gpu_rng_uniform(count, mask);
      caffe_gpu_bernoulli((int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_);
      DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #endif
-    // set thresholds
-    // NOLINT_NEXT_LINE(whitespace/operators)
-//    DropoutForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-  //      count, bottom_data, mask, uint_thres_, scale_, top_data);
-   // CUDA_POST_KERNEL_CHECK;
   } else {
     caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data);
   }
@@ -118,14 +112,7 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* top_diff = top[0]->gpu_diff();
     Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
     if (this->phase_ == TRAIN) {
-      //const unsigned int* mask =
-        //  static_cast<const unsigned int*>(rand_vec_.gpu_data());
       const int count = bottom[0]->count();
-      // NOLINT_NEXT_LINE(whitespace/operators)
-     // DropoutBackward<Dtype><<<CAFFE_GET_BLOCKS(count),
-       // CAFFE_CUDA_NUM_THREADS>>>(
-         // count, top_diff, mask, uint_thres_, scale_, bottom_diff);
-    //  CUDA_POST_KERNEL_CHECK;
        DropoutBackward(count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff);
     } else {
       caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 83b18c89..ff86400b 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -333,25 +333,12 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
         height_, width_, pooled_height_, pooled_width_, kernel_h_,
         kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
         mask, top_mask);
-   /* 
-   // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
-        mask, top_mask);*/
     break;
  case PoolingParameter_PoolMethod_AVE:
     // NOLINT_NEXT_LINE(whitespace/operators)
     AvePoolForward(count, bottom_data, bottom[0]->num(), channels_,
         height_, width_, pooled_height_, pooled_width_, kernel_h_,
         kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
- /*
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-        count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);*/
     break;
   case PoolingParameter_PoolMethod_STOCHASTIC:
     if (this->phase_ == TRAIN) {
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index c38814f1..784d2c91 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -43,15 +43,6 @@ void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = top[0]->mutable_gpu_data();
   const int count = bottom[0]->count();
   Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-  // NOLINT_NEXT_LINE(whitespace/operators)
- // ReLUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-   //   count, bottom_data, top_data, negative_slope);
-  //CUDA_POST_KERNEL_CHECK;
-  // << " count: " << count << " bottom_data: "
-  //     << (unsigned long)bottom_data
-  //     << " top_data: " << (unsigned long)top_data
-  //     << " blocks: " << CAFFE_GET_BLOCKS(count)
-  //     << " threads: " << CAFFE_CUDA_NUM_THREADS;
  ReLUForward(count,bottom_data,top_data,negative_slope);
 }
 
@@ -66,10 +57,6 @@ void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
     const int count = bottom[0]->count();
     Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-//    ReLUBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
-  //      count, top_diff, bottom_data, bottom_diff, negative_slope);
-   // CUDA_POST_KERNEL_CHECK;
    ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope);
   }
 }
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index c127d56b..cc7dc79d 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -418,39 +418,6 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
       // get window label
       top_label[item_id] = window[WindowDataLayer<Dtype>::LABEL];
 
-      #if 0
-      // useful debugging code for dumping transformed windows to disk
-      string file_id;
-      std::stringstream ss;
-      ss << PrefetchRand();
-      ss >> file_id;
-      std::ofstream inf((string("dump/") + file_id +
-          string("_info.txt")).c_str(), std::ofstream::out);
-      inf << image.first << std::endl
-          << window[WindowDataLayer<Dtype>::X1]+1 << std::endl
-          << window[WindowDataLayer<Dtype>::Y1]+1 << std::endl
-          << window[WindowDataLayer<Dtype>::X2]+1 << std::endl
-          << window[WindowDataLayer<Dtype>::Y2]+1 << std::endl
-          << do_mirror << std::endl
-          << top_label[item_id] << std::endl
-          << is_fg << std::endl;
-      inf.close();
-      std::ofstream top_data_file((string("dump/") + file_id +
-          string("_data.txt")).c_str(),
-          std::ofstream::out | std::ofstream::binary);
-      for (int c = 0; c < channels; ++c) {
-        for (int h = 0; h < crop_size; ++h) {
-          for (int w = 0; w < crop_size; ++w) {
-            top_data_file.write(reinterpret_cast<char*>(
-                &top_data[((item_id * channels + c) * crop_size + h)
-                          * crop_size + w]),
-                sizeof(Dtype));
-          }
-        }
-      }
-      top_data_file.close();
-      #endif
-
       item_id++;
     }
   }
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index f5d0e703..e070d774 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -35,7 +35,6 @@ Net<Dtype>::Net(const string& param_file, Phase phase) {
 template <typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
   // Set phase from the state.
-  //amdDevice.Init();
   phase_ = in_param.state().phase();
   // Filter layers based on their include/exclude rules and
   // the current NetState.
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index dde98baf..6e1a40a7 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -27,15 +27,6 @@ void Solver<Dtype>::ocl_setup(){
    powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
 }
 
-//template <typename Dtype>
-/*Solver<Dtype>::~Solver(){
-    OCL_CHECK( clReleaseKernel(scalar_kernel) );
-    OCL_CHECK( clReleaseKernel(add_kernel) );
-    OCL_CHECK( clReleaseKernel(div_kernel) );
-    OCL_CHECK( clReleaseKernel(powx_kernel) );
-}
-*/
-
 template <typename Dtype>
 Solver<Dtype>::Solver(const string& param_file)
     : net_() {
@@ -51,14 +42,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
 
-//#ifndef CPU_ONLY
-  //AMD device related initialization
-  //amdDevice.Init();
   ocl_setup();
-//  cl_int err =  clblasSetup();
-//#else
-//  NO_GPU;
-//#endif
 
   if (param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp
index 6942f8a3..0383fd27 100644
--- a/src/caffe/util/benchmark.cpp
+++ b/src/caffe/util/benchmark.cpp
@@ -13,14 +13,6 @@ Timer::Timer()
 }
 
 Timer::~Timer() {
-  if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-   // CUDA_CHECK(cudaEventDestroy(start_gpu_));
-   // CUDA_CHECK(cudaEventDestroy(stop_gpu_));
-#else
-    NO_GPU;
-#endif
-  }
 }
 
 void Timer::Start() {
@@ -72,12 +64,6 @@ float Timer::Seconds() {
 void Timer::Init() {
   if (!initted()) {
     if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-     // CUDA_CHECK(cudaEventCreate(&start_gpu_));
-     // CUDA_CHECK(cudaEventCreate(&stop_gpu_));
-#else
-      NO_GPU;
-#endif
     }
     initted_ = true;
   }
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index c90f93eb..d52acb54 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -88,16 +88,6 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
     int w_col_end = min(w / stride_w + 1, width_col);
     int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
     int h_col_end = min(h / stride_h + 1, height_col);
-    /*
-    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-        // the col location: [c * width * height + h_out, w_out]
-        int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize
-            + (w - w_col * stride_w);
-        val += data_col[(c_col * height_col + h_col) * width_col + w_col];
-      }
-    }
-    */
     // equivalent implementation
     int offset =
         (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 63b449da..57fc9fd4 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -540,8 +540,6 @@ double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
 template <>
 void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
     float* out) {
-  //need to pass in scratchBuff
-  //AMDBLAS_CHECK(clAmdBlasSdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template <>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 6294cce3..ccaf60df 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -1058,69 +1058,6 @@ template void DropoutBackward<double>(const int count, const double* top_diff, c
 template <typename Dtype>
 void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz)
 {
-/*        std::string kernel_name = "Conv" + get_dtype_suffix<Dtype>();
-        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
-
-
-        int weights_stride = kernel_w * kernel_h;//correct?
-        int bot_stride = width;
-        int bot_channel_stride = width * height; 
-        int bot_batch_stride = width * height * channel_in;
-
-        int top_stride = width_out;
-        int top_channel_stride = width_out * height_out;
-        int top_batch_stride = width_out * height_out * channel_out;
-
-        //int height_out = (int)top->getDim(ANN_TENSOR_HEIGHT);
-        //int width_out = (int)top->getDim(ANN_TENSOR_WIDTH);
-        int vis_height = height_out * stride - 2 * pad;
-        int vis_width = width_out * stride - 2 * pad;
-
-        int ocl_group_sz0_ = 8;
-        int ocl_group_sz1_ = 8;
-        int ocl_group_lg2sz1_ = (int)ceil(log((double)ocl_group_sz1_)/log(2.));
-        int ocl_group_lg2sz0_ = (int)ceil(log((double)ocl_group_sz0_)/log(2.));
-        
-        int outputs = channel_out;
-        int n_out_pix_horiz_ = (width_out < 2 * ocl_group_sz0_) ? 1 : (width_out < 4 * ocl_group_sz0_) ? 2 : 4;
-        int n_out_pix_vert_ = (height_out < 2 * ocl_group_sz1_) ? 1 : 2; // (height_out <= 192) ? 2 : 4;
-        int n_outs_ = ((outputs & 1) == 1) ? 1 : (kernel_w == 3) && ((outputs / 4) * 4 == outputs) ? 4 : 2; // (n_out_pix_horiz_ >= 4) ? 1 : 2;
-
-        int n_outputs = channel_out;
-        n_outputs /= n_outs_;
-        int i_n_group_horiz = (width_out + ocl_group_sz0_ * n_out_pix_horiz_ - 1) / (ocl_group_sz0_ * n_out_pix_horiz_);
-        int i_n_group_vert = (height_out + ocl_group_sz1_ * n_out_pix_vert_ - 1) / (ocl_group_sz1_ * n_out_pix_vert_);
-
-        cl_int ret;
-        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),   (void*)&bottom_data);
-        ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem),   (void*)&weights);
-        ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem),   (void*)&bias);
-        ret |= clSetKernelArg(ker_rand, 0, sizeof(cl_mem),   (void*)&top_data);
-        ret |= clSetKernelArg(ker_rand, 1, sizeof(cl_int),   (void*)&kernel_w);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&channel_out);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&channel_in);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&pad);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&stride);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&n_out_pix_horiz_);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&n_out_pix_vert_);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&bot_batch_stride);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&bot_channel_stride);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&bot_stride);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&top_batch_stride);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&top_channel_stride);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&top_stride);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&vis_width);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&vis_height);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&weights_stride);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&width_out);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&height_out);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_int),   (void*)&n_outs_);
-        OCL_CHECK(ret);
-
-         size_t l_wk[3] = { ocl_group_sz0_, ocl_group_sz1_, 1};
-	 size_t g_wk[3] = { i_n_group_horiz * l_wk[0], i_n_group_vert * l_wk[1], batch_sz * n_outputs };
-
-        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );*/
 }
 template void ocl_conv<float>(float* bottom_data, float* top_data, float* weights, float* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz);
 template void ocl_conv<double>(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz);

From dce5407594dbf06b7b99b1dfa5a00eed9ea99352 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Fri, 4 Sep 2015 23:00:30 +0800
Subject: [PATCH 055/124] Partially get through unit test

---
 include/caffe/common.hpp                    |  2 +-
 include/caffe/test/test_caffe_main.hpp      |  2 +-
 include/caffe/util/ocl_wrapper.hpp          |  7 ++--
 src/caffe/layers/dropout_layer.cpp          |  4 +--
 src/caffe/layers/exp_layer.cpp              | 21 +++++++++++
 src/caffe/layers/split_layer.cpp            |  2 +-
 src/caffe/ocl/pooling_layer.cl              |  2 +-
 src/caffe/ocl/util.cl                       | 11 +++++-
 src/caffe/solver.cpp                        |  2 +-
 src/caffe/test/test_caffe_main.cpp          | 13 +++----
 src/caffe/test/test_common.cpp              |  7 ++--
 src/caffe/test/test_filter_layer.cpp        |  4 +--
 src/caffe/test/test_inner_product_layer.cpp | 14 ++++----
 src/caffe/test/test_platform.cpp            |  6 ++--
 src/caffe/test/test_util_blas.cpp           |  8 +++--
 src/caffe/util/math_functions.cpp           | 30 +++++++++++-----
 src/caffe/util/ocl_wrapper.cpp              | 39 ++++++++++++++++-----
 17 files changed, 123 insertions(+), 51 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index c5bf909d..8c738ca3 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -81,7 +81,7 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-#define use_packing_scheme 1 
+#define use_packing_scheme 0 
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
 #define global_packing_N 16
diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp
index fc156091..b4f8f284 100644
--- a/include/caffe/test/test_caffe_main.hpp
+++ b/include/caffe/test/test_caffe_main.hpp
@@ -17,7 +17,7 @@ using std::endl;
 #ifdef CMAKE_BUILD
   #include "caffe_config.h"
 #else
-  #define CUDA_TEST_DEVICE -1
+  #define OPENCL_TEST_DEVICE -1
   #define CMAKE_SOURCE_DIR "src/"
   #define EXAMPLES_SOURCE_DIR "examples/"
   #define CMAKE_EXT ""
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 223e3278..6a019895 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -134,7 +134,10 @@ template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
 
 template <typename Dtype>
-void caffe_gpu_sign(cl_kernel Kernel,const int N, const Dtype* X, Dtype * Y );
+void caffe_gpu_abs_ocl(const int N,  const Dtype* X, Dtype * Y );
+
+template <typename Dtype>
+void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y );
 
 template <typename Dtype>
 void kernel_channel_max(const int num, const int channels,
@@ -174,7 +177,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
           const int ignore_label_, Dtype* counts);
 
 template <typename Dtype>
-void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
+void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
 
 template <typename Dtype>
 void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data);
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index dfd6560d..3a060388 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -17,7 +17,7 @@ void DropoutLayer<Dtype>::ocl_setup(int bottom_count){
 
 template <typename Dtype>
 DropoutLayer<Dtype>::~DropoutLayer(){
-   OCL_CHECK( clReleaseMemObject(MaskMem) );
+//   OCL_CHECK( clReleaseMemObject(MaskMem) );
 }
 
 
@@ -105,7 +105,7 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   //      count, bottom_data, mask, uint_thres_, scale_, top_data);
    // CUDA_POST_KERNEL_CHECK;
   } else {
-    caffe_gpu_copy(count*sizeof(Dtype), bottom_data, top_data);
+    caffe_gpu_copy(count, bottom_data, top_data);
   }
 }
 
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index 547fca6a..5e7819c0 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -62,11 +62,32 @@ void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const int count = bottom[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  if (inner_scale_ == Dtype(1)) {
+    caffe_gpu_exp(count, bottom_data, top_data);
+  } else {
+    caffe_gpu_scale(count, inner_scale_, bottom_data, top_data);
+    caffe_gpu_exp(count, top_data, top_data);
+  }
+  if (outer_scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, outer_scale_, top_data);
+  }
 }
 
 template <typename Dtype>
 void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  if (!propagate_down[0]) { return; }
+  const int count = bottom[0]->count();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  caffe_gpu_mul(count, top_data, top_diff, bottom_diff);
+  if (inner_scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, inner_scale_, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index af8a9123..4b60db10 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -66,7 +66,7 @@ void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
     return;
   }
-  caffe_gpu_add(gpu_add_kernel, count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
+  caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
                 bottom[0]->mutable_gpu_diff());
   // Add remaining top blob diffs.
   for (int i = 2; i < top.size(); ++i) {
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
index b6a5a0a1..10d3b9f5 100644
--- a/src/caffe/ocl/pooling_layer.cl
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -245,7 +245,7 @@ template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AveP
 template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
 
 template <class Dtype>
-void StoPoolBackward(const int nthreads,
+__kernel void StoPoolBackward(const int nthreads,
     __global Dtype* rand_idx, __global Dtype* top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index 9710a343..cda05652 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -49,13 +49,22 @@ template <class T>
 __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
      int gdx = get_global_id(0);
      if(gdx < N){
-          Y[gdx] =((0.0<X[gdx])-(X[gdx]<0.0));
+          Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0));
      }
 }
 
 template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
 template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y);
 
+template <class T>
+__kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) {
+    int index = get_global_id(0);
+    if(index < n) {
+        y[index] = fabs(a[index]);
+    }
+}
+template __attribute__((mangled_name(caffe_gpu_abs_float))) __kernel void caffe_gpu_abs(const int n, __global float* a, __global float* Y);
+template __attribute__((mangled_name(caffe_gpu_abs_double))) __kernel void caffe_gpu_abs(const int n, __global double* a, __global double* Y);
 
 template <class T>
 __kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index dde98baf..643e696c 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -780,7 +780,7 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
         this->update_[param_id]->mutable_gpu_data());
 
     // update history
-    caffe_gpu_add(add_kernel, net_params[param_id]->count(),
+    caffe_gpu_add(net_params[param_id]->count(),
         this->update_[param_id]->gpu_data(),
         this->history_[param_id]->gpu_data(),
         this->history_[param_id]->mutable_gpu_data());
diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp
index 5f41d325..278d520c 100644
--- a/src/caffe/test/test_caffe_main.cpp
+++ b/src/caffe/test/test_caffe_main.cpp
@@ -7,12 +7,12 @@
 
 namespace caffe {
 #ifndef CPU_ONLY
-  cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+  //cudaDeviceProp CAFFE_TEST_CUDA_PROP;
 #endif
 }
 
 #ifndef CPU_ONLY
-using caffe::CAFFE_TEST_CUDA_PROP;
+//using caffe::CAFFE_TEST_CUDA_PROP;
 
 #endif
 
@@ -23,15 +23,16 @@ int main(int argc, char** argv) {
   // Before starting testing, let's first print out a few cuda defice info.
   int device = 0;
 //  cudaGetDeviceCount(&device);
-  cout << "Cuda number of devices: " << device << endl;
+ // cout << "Cuda number of devices: " << device << endl;
   if (argc > 1) {
     // Use the given device
     device = atoi(argv[1]);
-    cudaSetDevice(device);
+   // cudaSetDevice(device);
+    caffe::amdDevice.Init(device);
     cout << "Setting to use device " << device << endl;
-  } else if (CUDA_TEST_DEVICE >= 0) {
+  } else if (OPENCL_TEST_DEVICE >= 0) {
     // Use the device assigned in build configuration; but with a lower priority
-    device = CUDA_TEST_DEVICE;
+    device = OPENCL_TEST_DEVICE;
   }
 //  cudaGetDevice(&device);
   cout << "Current device id: " << device << endl;
diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp
index b3a61b0f..6c80de1d 100644
--- a/src/caffe/test/test_common.cpp
+++ b/src/caffe/test/test_common.cpp
@@ -14,12 +14,13 @@ class CommonTest : public ::testing::Test {};
 
 #ifndef CPU_ONLY  // GPU Caffe singleton test.
 
+/*
 TEST_F(CommonTest, TestCublasHandlerGPU) {
   int cuda_device_id;
   CUDA_CHECK(cudaGetDevice(&cuda_device_id));
   EXPECT_TRUE(Caffe::cublas_handle());
 }
-
+*/
 #endif
 
 TEST_F(CommonTest, TestBrewMode) {
@@ -45,7 +46,7 @@ TEST_F(CommonTest, TestRandSeedCPU) {
 }
 
 #ifndef CPU_ONLY  // GPU Caffe singleton test.
-
+/*
 TEST_F(CommonTest, TestRandSeedGPU) {
   SyncedMemory data_a(10 * sizeof(unsigned int));
   SyncedMemory data_b(10 * sizeof(unsigned int));
@@ -60,7 +61,7 @@ TEST_F(CommonTest, TestRandSeedGPU) {
         ((const unsigned int*)(data_b.cpu_data()))[i]);
   }
 }
-
+*/
 #endif
 
 }  // namespace caffe
diff --git a/src/caffe/test/test_filter_layer.cpp b/src/caffe/test/test_filter_layer.cpp
index c641b6ef..801881e9 100644
--- a/src/caffe/test/test_filter_layer.cpp
+++ b/src/caffe/test/test_filter_layer.cpp
@@ -13,7 +13,7 @@
 #include "caffe/test/test_gradient_check_util.hpp"
 
 namespace caffe {
-
+/*
 template <typename TypeParam>
 class FilterLayerTest : public MultiDeviceTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
@@ -124,5 +124,5 @@ TYPED_TEST(FilterLayerTest, TestGradient) {
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_, 0);
 }
-
+*/
 }  // namespace caffe
diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp
index c03df173..f0c36b13 100644
--- a/src/caffe/test/test_inner_product_layer.cpp
+++ b/src/caffe/test/test_inner_product_layer.cpp
@@ -13,9 +13,9 @@
 
 namespace caffe {
 
-#ifndef CPU_ONLY
-extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
-#endif
+//#ifndef CPU_ONLY
+//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+//#endif
 
 template <typename TypeParam>
 class InnerProductLayerTest : public MultiDeviceTest<TypeParam> {
@@ -59,10 +59,10 @@ TYPED_TEST(InnerProductLayerTest, TestForward) {
   typedef typename TypeParam::Dtype Dtype;
   bool IS_VALID_CUDA = false;
 #ifndef CPU_ONLY
-  IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
+ // IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
 #endif
   if (Caffe::mode() == Caffe::CPU ||
-      sizeof(Dtype) == 4 || IS_VALID_CUDA) {
+      sizeof(Dtype) == 4 ) {
     LayerParameter layer_param;
     InnerProductParameter* inner_product_param =
         layer_param.mutable_inner_product_param();
@@ -89,10 +89,10 @@ TYPED_TEST(InnerProductLayerTest, TestGradient) {
   typedef typename TypeParam::Dtype Dtype;
   bool IS_VALID_CUDA = false;
 #ifndef CPU_ONLY
-  IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
+ // IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
 #endif
   if (Caffe::mode() == Caffe::CPU ||
-      sizeof(Dtype) == 4 || IS_VALID_CUDA) {
+      sizeof(Dtype) == 4 ) {
     LayerParameter layer_param;
     InnerProductParameter* inner_product_param =
         layer_param.mutable_inner_product_param();
diff --git a/src/caffe/test/test_platform.cpp b/src/caffe/test/test_platform.cpp
index f3513e08..7a30c2db 100644
--- a/src/caffe/test/test_platform.cpp
+++ b/src/caffe/test/test_platform.cpp
@@ -10,10 +10,10 @@
 
 namespace caffe {
 
-extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
 
 class PlatformTest : public ::testing::Test {};
-
+/*
 TEST_F(PlatformTest, TestInitialization) {
   printf("Major revision number:         %d\n",  CAFFE_TEST_CUDA_PROP.major);
   printf("Minor revision number:         %d\n",  CAFFE_TEST_CUDA_PROP.minor);
@@ -51,7 +51,7 @@ TEST_F(PlatformTest, TestInitialization) {
          (CAFFE_TEST_CUDA_PROP.unifiedAddressing ? "Yes" : "No"));
   EXPECT_TRUE(true);
 }
-
+*/
 }  // namespace caffe
 
 #endif  // CPU_ONLY
diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp
index 8770f309..9cc9558c 100644
--- a/src/caffe/test/test_util_blas.cpp
+++ b/src/caffe/test/test_util_blas.cpp
@@ -12,7 +12,7 @@
 
 namespace caffe {
 
-extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
 
 template <typename TypeParam>
 class GemmTest : public ::testing::Test {};
@@ -30,7 +30,8 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) {
   caffe_copy(6, data, A.mutable_cpu_data());
   caffe_copy(12, data, B.mutable_cpu_data());
 
-  if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) {
+ // if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) {
+   if (sizeof(TypeParam) == 4) {
     // [1, 2, 3; 4 5 6] * [1, 2, 3, 4; 5, 6, 7, 8; 9, 10, 11, 12];
     caffe_cpu_gemm<TypeParam>(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.,
         A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data());
@@ -100,7 +101,8 @@ TYPED_TEST(GemmTest, TestGemvCPUGPU) {
   caffe_copy(6, data, A.mutable_cpu_data());
   caffe_copy(3, data, x.mutable_cpu_data());
 
-  if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) {
+ // if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) {
+  if (sizeof(TypeParam) == 4) {
     caffe_cpu_gemv<TypeParam>(CblasNoTrans, 2, 3, 1., A.cpu_data(),
         x.cpu_data(), 0., y.mutable_cpu_data());
     for (int i = 0; i < 2; ++i) {
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 63b449da..fef8aa34 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -227,11 +227,13 @@ void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y)
 template<>
 void caffe_gpu_abs<float>(const int n, const float* x, float* y)
 {
+    caffe_gpu_abs_ocl(n, x, y);
 }
 
 template<>
 void caffe_gpu_abs<double>(const int n, const double* x, double* y)
 {
+    caffe_gpu_abs_ocl(n, x, y);
 }
 
 template <>
@@ -288,14 +290,16 @@ void caffe_gpu_memcpy(const size_t N, const void *X, void *Y)
 
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
-  if(X != Y)
+  if(X != Y){
       CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+  }
 }
 
 template <>
 void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
-  if(X != Y)
+  if(X != Y){
       CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+  }
 }
 
 template <>
@@ -622,11 +626,15 @@ void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
 template <>
 void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
                             float* y) {
+  caffe_gpu_copy(n, x, y);
+  caffe_gpu_scal(n, alpha, y);
 }
 
 template <>
 void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
                              double* y) {
+  caffe_gpu_copy(n, x, y);
+  caffe_gpu_scal(n, alpha, y);
 }
 
 template <typename Dtype>
@@ -664,18 +672,24 @@ void mul_kernel(const int n, const Dtype* a,
     const Dtype* b, Dtype* y) {
 }
 
+template <>
+void caffe_gpu_exp<float>(const int N, const float* a, float* y) {
+    kernel_exp(N, a, y);
+}
+
+template <>
+void caffe_gpu_exp<double>(const int N, const double* a, double* y) {
+    kernel_exp(N, a, y);
+}
+
 template<>
 void caffe_gpu_sign<float>(const int N, const float *X, float *Y){
-   cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL);
-   caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y);
-   clReleaseKernel(caffe_gpu_sign_kernel);  
+   caffe_gpu_sign_ocl(N, X, Y);
 }
 
 template<>
 void caffe_gpu_sign<double>(const int N, const double *X, double *Y){
-   cl_kernel caffe_gpu_sign_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_sign", NULL);
-   caffe_gpu_sign(caffe_gpu_sign_kernel, N, X, Y);
-   clReleaseKernel(caffe_gpu_sign_kernel);
+   caffe_gpu_sign_ocl(N, X, Y);
 }
 
 template <>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 6294cce3..757a485b 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -50,9 +50,9 @@ void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dty
         cl_int ret;
         ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
         ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_float),   (void*)&inf);
-        ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_float),   (void*)&sup);
-        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_float),   (void*)&threshold);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype),   (void*)&inf);
+        ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype),   (void*)&sup);
+        ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype),   (void*)&threshold);
         ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&nrounds);
         ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint),    (void*)&size);
         OCL_CHECK(ret);
@@ -909,7 +909,9 @@ template void LRNComputeDiff<double>(cl_kernel kernel, const int nthreads,
     const double cache_ratio, double* const bottom_diff);
 
 template <typename Dtype>
-void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
+void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
+    std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1);
@@ -921,11 +923,30 @@ void caffe_gpu_add(cl_kernel Kernel, const int n, const Dtype* in1, const Dtype*
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void caffe_gpu_add<float> (cl_kernel Kernel, const int n, const float* in1, const float* in2, float* y);
-template void caffe_gpu_add<double> (cl_kernel Kernel, const int n, const double* in1, const double* in2, double* y);
+template void caffe_gpu_add<float> (const int n, const float* in1, const float* in2, float* y);
+template void caffe_gpu_add<double> (const int n, const double* in1, const double* in2, double* y);
 
 template <typename Dtype>
-void caffe_gpu_sign(cl_kernel Kernel,const int N,  const Dtype* X, Dtype * Y ){
+void caffe_gpu_sign_ocl(const int N,  const Dtype* X, Dtype * Y ){
+    std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)N};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_sign_ocl<float>(const int N,  const float* X, float* Y );
+template void caffe_gpu_sign_ocl<double>(const int N,  const double* X, double* Y );
+
+template <typename Dtype>
+void caffe_gpu_abs_ocl(const int N,  const Dtype* X, Dtype * Y ){
+    std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
@@ -936,8 +957,8 @@ void caffe_gpu_sign(cl_kernel Kernel,const int N,  const Dtype* X, Dtype * Y ){
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void caffe_gpu_sign<float>(cl_kernel Kernel,const int N,  const float* X, float* Y );
-template void caffe_gpu_sign<double>(cl_kernel Kernel,const int N,  const double* X, double* Y );
+template void caffe_gpu_abs_ocl<float>(const int N,  const float* X, float* Y );
+template void caffe_gpu_abs_ocl<double>(const int N,  const double* X, double* Y );
 
 template <typename Dtype>
 void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){

From c9b345ffae6ee09aed7f56b15920cd6646b0c4f2 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sat, 5 Sep 2015 00:28:41 +0800
Subject: [PATCH 056/124] fixed the random seed

---
 src/caffe/common.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 3e4e0dc0..e12c48c9 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -12,6 +12,7 @@ shared_ptr<Caffe> Caffe::singleton_;
 // random seeding
 int64_t cluster_seedgen(void) {
  //To fix: for now we use fixed seed to get same result each time
+/*
   int64_t s, seed, pid;
   FILE* f = fopen("/dev/urandom", "rb");
   if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) {
@@ -29,7 +30,8 @@ int64_t cluster_seedgen(void) {
   seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729);
   //return seed;
   LOG(WARNING) << "return fixed seed 37";
-  return 37;
+*/ 
+ return 37;
 }
 
 
@@ -91,18 +93,14 @@ void* Caffe::RNG::generator() {
 
 Caffe::Caffe()
 {
-#ifndef CPU_ONLY
    cl_int err =  clblasSetup();
    if(err != CL_SUCCESS){
        LOG(ERROR) << "clBLAS setup failed "<<err;
    }
-#endif
 }
 
 Caffe::~Caffe() {
-#ifndef CPU_ONLY
    clblasTeardown();
-#endif
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {

From fd4441c425a77bd9988d7ce4f9a680bd0b4f3708 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 6 Sep 2015 14:00:34 +0800
Subject: [PATCH 057/124] Clean up the last two warnings

---
 include/caffe/device.hpp                      |    2 +-
 ...SEARCH.yugao.log.INFO.20150906-133002.7951 | 1250 +++++++++++++++++
 ...SEARCH.yugao.log.INFO.20150906-133358.8300 | 1208 ++++++++++++++++
 ...SEARCH.yugao.log.INFO.20150906-133437.8316 | 1208 ++++++++++++++++
 ...EARCH.yugao.log.INFO.20150906-135805.16515 | 1160 +++++++++++++++
 ...EARCH.yugao.log.INFO.20150906-135855.16537 | 1208 ++++++++++++++++
 log/caffe.INFO                                |    1 +
 src/caffe/device.cpp                          |    4 +-
 src/caffe/solver.cpp                          |    2 +-
 src/caffe/syncedmem.cpp                       |   35 +-
 10 files changed, 6044 insertions(+), 34 deletions(-)
 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951
 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300
 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316
 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515
 create mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537
 create mode 120000 log/caffe.INFO

diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 697e2391..3806eeb6 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -72,7 +72,7 @@ class Device{
     cl_kernel GetKernel(std::string kernel_name);    
     void ReleaseKernels();
 };
-extern char* buildOption;
+extern std::string buildOption;
 extern Device amdDevice;
 
 }  // namespace caffe
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951
new file mode 100644
index 00000000..c75e1aaa
--- /dev/null
+++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951
@@ -0,0 +1,1250 @@
+Log file created at: 2015/09/06 13:30:02
+Running on machine: AMD-RESEARCH
+Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
+I0906 13:30:02.150327  7951 caffe.cpp:114] Use GPU with device ID 0
+I0906 13:30:02.187862  7951 device.cpp:230] Number of platforms found:1
+I0906 13:30:02.187903  7951 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
+I0906 13:30:02.187918  7951 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
+I0906 13:30:02.187973  7951 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
+I0906 13:30:02.187980  7951 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
+I0906 13:30:02.187991  7951 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
+I0906 13:30:02.188000  7951 device.cpp:286] Number of devices found:1
+I0906 13:30:02.188005  7951 device.cpp:288] 	DeviceID:	0x2171230
+I0906 13:30:02.188025  7951 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
+I0906 13:30:02.188033  7951 device.cpp:393] 	Is it integrated GPU?:	0
+I0906 13:30:02.188038  7951 device.cpp:393] 	Max clock frequency MHz:	930
+I0906 13:30:02.188043  7951 device.cpp:393] 	Host-Device unified mem:	0
+I0906 13:30:02.188048  7951 device.cpp:393] 	ECC support:	0
+I0906 13:30:02.188052  7951 device.cpp:393] 	Endian little:	1
+I0906 13:30:02.188056  7951 device.cpp:393] 	Max compute units:	44
+I0906 13:30:02.188061  7951 device.cpp:393] 	Max work group size:	256
+I0906 13:30:02.188066  7951 device.cpp:393] 	Max work item dimensions:	3
+I0906 13:30:02.188072  7951 device.cpp:393] 	Max work item sizes:	0x100
+I0906 13:30:02.188078  7951 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
+I0906 13:30:02.188083  7951 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
+I0906 13:30:02.188088  7951 device.cpp:393] 	Max mem alloc size:	4244635648
+I0906 13:30:02.188092  7951 device.cpp:393] 	Global mem size:	16878927872
+I0906 13:30:02.188097  7951 device.cpp:393] 	Local mem size:	32768
+I0906 13:30:02.188107  7951 device.cpp:96] Picked device type : GPU 0
+I0906 13:30:04.630481  7951 device.cpp:152] Build Program
+I0906 13:30:04.630708  7951 caffe.cpp:122] Starting Optimization
+I0906 13:30:04.630797  7951 solver.cpp:40] Initializing solver from parameters: 
+test_iter: 1
+test_interval: 1000
+base_lr: 0.01
+display: 1
+max_iter: 450000
+lr_policy: "step"
+gamma: 0.1
+momentum: 0.9
+weight_decay: 0.0005
+stepsize: 100000
+snapshot: 10000
+snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
+solver_mode: GPU
+net: "models/bvlc_alexnet/train_val.prototxt"
+I0906 13:30:04.630909  7951 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val.prototxt
+I0906 13:30:04.632081  7951 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
+I0906 13:30:04.632134  7951 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
+I0906 13:30:04.632319  7951 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TRAIN
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
+    batch_size: 256
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "drop6"
+  type: "Dropout"
+  bottom: "fc6"
+  top: "fc6"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "drop7"
+  type: "Dropout"
+  bottom: "fc7"
+  top: "fc7"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:30:04.632813  7951 net.cpp:68] Memory required for data: 0
+I0906 13:30:04.632977  7951 layer_factory.hpp:74] Creating layer data
+I0906 13:30:04.633033  7951 net.cpp:91] Creating Layer data
+I0906 13:30:04.633055  7951 net.cpp:369] data -> data
+I0906 13:30:04.633160  7951 net.cpp:369] data -> label
+I0906 13:30:04.633183  7951 net.cpp:121] Setting up data
+I0906 13:30:04.633196  7951 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:30:04.642779  7951 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
+I0906 13:30:04.643064  7951 data_layer.cpp:53] output data size: 256,3,227,227
+I0906 13:30:04.723888  7951 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:30:04.724091  7951 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:30:04.724150  7951 net.cpp:128] Top shape: 256 3 227 227 (39574272)
+I0906 13:30:04.724161  7951 net.cpp:128] Top shape: 256 (256)
+I0906 13:30:04.724165  7951 net.cpp:134] Memory required for data: 158298112
+I0906 13:30:04.724201  7951 layer_factory.hpp:74] Creating layer conv1
+I0906 13:30:04.724283  7951 net.cpp:91] Creating Layer conv1
+I0906 13:30:04.724328  7951 net.cpp:411] conv1 <- data
+I0906 13:30:04.724383  7951 net.cpp:369] conv1 -> conv1
+I0906 13:30:04.724417  7951 net.cpp:121] Setting up conv1
+I0906 13:30:04.729287  7951 net.cpp:128] Top shape: 256 96 55 55 (74342400)
+I0906 13:30:04.729295  7951 net.cpp:134] Memory required for data: 455667712
+I0906 13:30:04.729333  7951 layer_factory.hpp:74] Creating layer relu1
+I0906 13:30:04.729357  7951 net.cpp:91] Creating Layer relu1
+I0906 13:30:04.729362  7951 net.cpp:411] relu1 <- conv1
+I0906 13:30:04.729377  7951 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:30:04.729385  7951 net.cpp:121] Setting up relu1
+I0906 13:30:04.729408  7951 net.cpp:128] Top shape: 256 96 55 55 (74342400)
+I0906 13:30:04.729411  7951 net.cpp:134] Memory required for data: 753037312
+I0906 13:30:04.729416  7951 layer_factory.hpp:74] Creating layer norm1
+I0906 13:30:04.729444  7951 net.cpp:91] Creating Layer norm1
+I0906 13:30:04.729450  7951 net.cpp:411] norm1 <- conv1
+I0906 13:30:04.729463  7951 net.cpp:369] norm1 -> norm1
+I0906 13:30:04.729476  7951 net.cpp:121] Setting up norm1
+I0906 13:30:04.729499  7951 net.cpp:128] Top shape: 256 96 55 55 (74342400)
+I0906 13:30:04.729504  7951 net.cpp:134] Memory required for data: 1050406912
+I0906 13:30:04.729509  7951 layer_factory.hpp:74] Creating layer pool1
+I0906 13:30:04.729532  7951 net.cpp:91] Creating Layer pool1
+I0906 13:30:04.729537  7951 net.cpp:411] pool1 <- norm1
+I0906 13:30:04.729550  7951 net.cpp:369] pool1 -> pool1
+I0906 13:30:04.729564  7951 net.cpp:121] Setting up pool1
+I0906 13:30:04.729591  7951 net.cpp:128] Top shape: 256 96 27 27 (17915904)
+I0906 13:30:04.729596  7951 net.cpp:134] Memory required for data: 1122070528
+I0906 13:30:04.729600  7951 layer_factory.hpp:74] Creating layer conv2
+I0906 13:30:04.729614  7951 net.cpp:91] Creating Layer conv2
+I0906 13:30:04.729619  7951 net.cpp:411] conv2 <- pool1
+I0906 13:30:04.729635  7951 net.cpp:369] conv2 -> conv2
+I0906 13:30:04.729647  7951 net.cpp:121] Setting up conv2
+I0906 13:30:04.769634  7951 net.cpp:128] Top shape: 256 256 27 27 (47775744)
+I0906 13:30:04.769649  7951 net.cpp:134] Memory required for data: 1313173504
+I0906 13:30:04.769673  7951 layer_factory.hpp:74] Creating layer relu2
+I0906 13:30:04.769695  7951 net.cpp:91] Creating Layer relu2
+I0906 13:30:04.769704  7951 net.cpp:411] relu2 <- conv2
+I0906 13:30:04.769722  7951 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:30:04.769736  7951 net.cpp:121] Setting up relu2
+I0906 13:30:04.769744  7951 net.cpp:128] Top shape: 256 256 27 27 (47775744)
+I0906 13:30:04.769748  7951 net.cpp:134] Memory required for data: 1504276480
+I0906 13:30:04.769752  7951 layer_factory.hpp:74] Creating layer norm2
+I0906 13:30:04.769769  7951 net.cpp:91] Creating Layer norm2
+I0906 13:30:04.769775  7951 net.cpp:411] norm2 <- conv2
+I0906 13:30:04.769788  7951 net.cpp:369] norm2 -> norm2
+I0906 13:30:04.769800  7951 net.cpp:121] Setting up norm2
+I0906 13:30:04.769820  7951 net.cpp:128] Top shape: 256 256 27 27 (47775744)
+I0906 13:30:04.769825  7951 net.cpp:134] Memory required for data: 1695379456
+I0906 13:30:04.769829  7951 layer_factory.hpp:74] Creating layer pool2
+I0906 13:30:04.769850  7951 net.cpp:91] Creating Layer pool2
+I0906 13:30:04.769856  7951 net.cpp:411] pool2 <- norm2
+I0906 13:30:04.769870  7951 net.cpp:369] pool2 -> pool2
+I0906 13:30:04.769927  7951 net.cpp:121] Setting up pool2
+I0906 13:30:04.769944  7951 net.cpp:128] Top shape: 256 256 13 13 (11075584)
+I0906 13:30:04.769949  7951 net.cpp:134] Memory required for data: 1739681792
+I0906 13:30:04.769953  7951 layer_factory.hpp:74] Creating layer conv3
+I0906 13:30:04.769975  7951 net.cpp:91] Creating Layer conv3
+I0906 13:30:04.769981  7951 net.cpp:411] conv3 <- pool2
+I0906 13:30:04.769996  7951 net.cpp:369] conv3 -> conv3
+I0906 13:30:04.770010  7951 net.cpp:121] Setting up conv3
+I0906 13:30:04.886401  7951 net.cpp:128] Top shape: 256 384 13 13 (16613376)
+I0906 13:30:04.886425  7951 net.cpp:134] Memory required for data: 1806135296
+I0906 13:30:04.886471  7951 layer_factory.hpp:74] Creating layer relu3
+I0906 13:30:04.886507  7951 net.cpp:91] Creating Layer relu3
+I0906 13:30:04.886521  7951 net.cpp:411] relu3 <- conv3
+I0906 13:30:04.886548  7951 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:30:04.886565  7951 net.cpp:121] Setting up relu3
+I0906 13:30:04.886575  7951 net.cpp:128] Top shape: 256 384 13 13 (16613376)
+I0906 13:30:04.886579  7951 net.cpp:134] Memory required for data: 1872588800
+I0906 13:30:04.886584  7951 layer_factory.hpp:74] Creating layer conv4
+I0906 13:30:04.886611  7951 net.cpp:91] Creating Layer conv4
+I0906 13:30:04.886617  7951 net.cpp:411] conv4 <- conv3
+I0906 13:30:04.886633  7951 net.cpp:369] conv4 -> conv4
+I0906 13:30:04.886648  7951 net.cpp:121] Setting up conv4
+I0906 13:30:04.973788  7951 net.cpp:128] Top shape: 256 384 13 13 (16613376)
+I0906 13:30:04.973810  7951 net.cpp:134] Memory required for data: 1939042304
+I0906 13:30:04.973840  7951 layer_factory.hpp:74] Creating layer relu4
+I0906 13:30:04.973875  7951 net.cpp:91] Creating Layer relu4
+I0906 13:30:04.973891  7951 net.cpp:411] relu4 <- conv4
+I0906 13:30:04.973918  7951 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:30:04.973935  7951 net.cpp:121] Setting up relu4
+I0906 13:30:04.973945  7951 net.cpp:128] Top shape: 256 384 13 13 (16613376)
+I0906 13:30:04.973949  7951 net.cpp:134] Memory required for data: 2005495808
+I0906 13:30:04.973954  7951 layer_factory.hpp:74] Creating layer conv5
+I0906 13:30:04.973980  7951 net.cpp:91] Creating Layer conv5
+I0906 13:30:04.973986  7951 net.cpp:411] conv5 <- conv4
+I0906 13:30:04.974004  7951 net.cpp:369] conv5 -> conv5
+I0906 13:30:04.974019  7951 net.cpp:121] Setting up conv5
+I0906 13:30:05.032649  7951 net.cpp:128] Top shape: 256 256 13 13 (11075584)
+I0906 13:30:05.032670  7951 net.cpp:134] Memory required for data: 2049798144
+I0906 13:30:05.032712  7951 layer_factory.hpp:74] Creating layer relu5
+I0906 13:30:05.032747  7951 net.cpp:91] Creating Layer relu5
+I0906 13:30:05.032763  7951 net.cpp:411] relu5 <- conv5
+I0906 13:30:05.032788  7951 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:30:05.032805  7951 net.cpp:121] Setting up relu5
+I0906 13:30:05.032814  7951 net.cpp:128] Top shape: 256 256 13 13 (11075584)
+I0906 13:30:05.032819  7951 net.cpp:134] Memory required for data: 2094100480
+I0906 13:30:05.032824  7951 layer_factory.hpp:74] Creating layer pool5
+I0906 13:30:05.032843  7951 net.cpp:91] Creating Layer pool5
+I0906 13:30:05.032850  7951 net.cpp:411] pool5 <- conv5
+I0906 13:30:05.032863  7951 net.cpp:369] pool5 -> pool5
+I0906 13:30:05.032877  7951 net.cpp:121] Setting up pool5
+I0906 13:30:05.032897  7951 net.cpp:128] Top shape: 256 256 6 6 (2359296)
+I0906 13:30:05.032902  7951 net.cpp:134] Memory required for data: 2103537664
+I0906 13:30:05.032907  7951 layer_factory.hpp:74] Creating layer fc6
+I0906 13:30:05.032945  7951 net.cpp:91] Creating Layer fc6
+I0906 13:30:05.032951  7951 net.cpp:411] fc6 <- pool5
+I0906 13:30:05.032966  7951 net.cpp:369] fc6 -> fc6
+I0906 13:30:05.032980  7951 net.cpp:121] Setting up fc6
+I0906 13:30:05.203193  7955 data_layer.cpp:120] Prefetch batch: 478 ms.
+I0906 13:30:05.203241  7955 data_layer.cpp:121]      Read time: 65.301 ms.
+I0906 13:30:05.203250  7955 data_layer.cpp:122] Transform time: 409.394 ms.
+I0906 13:30:09.817406  7951 net.cpp:128] Top shape: 256 4096 (1048576)
+I0906 13:30:09.817432  7951 net.cpp:134] Memory required for data: 2107731968
+I0906 13:30:09.817504  7951 layer_factory.hpp:74] Creating layer relu6
+I0906 13:30:09.817538  7951 net.cpp:91] Creating Layer relu6
+I0906 13:30:09.817553  7951 net.cpp:411] relu6 <- fc6
+I0906 13:30:09.817579  7951 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:30:09.817595  7951 net.cpp:121] Setting up relu6
+I0906 13:30:09.817605  7951 net.cpp:128] Top shape: 256 4096 (1048576)
+I0906 13:30:09.817608  7951 net.cpp:134] Memory required for data: 2111926272
+I0906 13:30:09.817613  7951 layer_factory.hpp:74] Creating layer drop6
+I0906 13:30:09.817643  7951 net.cpp:91] Creating Layer drop6
+I0906 13:30:09.817649  7951 net.cpp:411] drop6 <- fc6
+I0906 13:30:09.817662  7951 net.cpp:358] drop6 -> fc6 (in-place)
+I0906 13:30:09.817672  7951 net.cpp:121] Setting up drop6
+I0906 13:30:09.817692  7951 net.cpp:128] Top shape: 256 4096 (1048576)
+I0906 13:30:09.817695  7951 net.cpp:134] Memory required for data: 2116120576
+I0906 13:30:09.817700  7951 layer_factory.hpp:74] Creating layer fc7
+I0906 13:30:09.817721  7951 net.cpp:91] Creating Layer fc7
+I0906 13:30:09.817728  7951 net.cpp:411] fc7 <- fc6
+I0906 13:30:09.817744  7951 net.cpp:369] fc7 -> fc7
+I0906 13:30:09.817759  7951 net.cpp:121] Setting up fc7
+I0906 13:30:11.938176  7951 net.cpp:128] Top shape: 256 4096 (1048576)
+I0906 13:30:11.938201  7951 net.cpp:134] Memory required for data: 2120314880
+I0906 13:30:11.938230  7951 layer_factory.hpp:74] Creating layer relu7
+I0906 13:30:11.938263  7951 net.cpp:91] Creating Layer relu7
+I0906 13:30:11.938278  7951 net.cpp:411] relu7 <- fc7
+I0906 13:30:11.938305  7951 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:30:11.938321  7951 net.cpp:121] Setting up relu7
+I0906 13:30:11.938330  7951 net.cpp:128] Top shape: 256 4096 (1048576)
+I0906 13:30:11.938334  7951 net.cpp:134] Memory required for data: 2124509184
+I0906 13:30:11.938339  7951 layer_factory.hpp:74] Creating layer drop7
+I0906 13:30:11.938355  7951 net.cpp:91] Creating Layer drop7
+I0906 13:30:11.938360  7951 net.cpp:411] drop7 <- fc7
+I0906 13:30:11.938372  7951 net.cpp:358] drop7 -> fc7 (in-place)
+I0906 13:30:11.938382  7951 net.cpp:121] Setting up drop7
+I0906 13:30:11.938397  7951 net.cpp:128] Top shape: 256 4096 (1048576)
+I0906 13:30:11.938401  7951 net.cpp:134] Memory required for data: 2128703488
+I0906 13:30:11.938406  7951 layer_factory.hpp:74] Creating layer fc8
+I0906 13:30:11.938427  7951 net.cpp:91] Creating Layer fc8
+I0906 13:30:11.938433  7951 net.cpp:411] fc8 <- fc7
+I0906 13:30:11.938449  7951 net.cpp:369] fc8 -> fc8
+I0906 13:30:11.938464  7951 net.cpp:121] Setting up fc8
+I0906 13:30:12.468230  7951 net.cpp:128] Top shape: 256 1000 (256000)
+I0906 13:30:12.468251  7951 net.cpp:134] Memory required for data: 2129727488
+I0906 13:30:12.468279  7951 layer_factory.hpp:74] Creating layer loss
+I0906 13:30:12.468333  7951 net.cpp:91] Creating Layer loss
+I0906 13:30:12.468348  7951 net.cpp:411] loss <- fc8
+I0906 13:30:12.468370  7951 net.cpp:411] loss <- label
+I0906 13:30:12.468389  7951 net.cpp:369] loss -> loss
+I0906 13:30:12.468408  7951 net.cpp:121] Setting up loss
+I0906 13:30:12.468426  7951 layer_factory.hpp:74] Creating layer loss
+I0906 13:30:12.469732  7951 net.cpp:128] Top shape: (1)
+I0906 13:30:12.469740  7951 net.cpp:130]     with loss weight 1
+I0906 13:30:12.469756  7951 net.cpp:134] Memory required for data: 2129727492
+I0906 13:30:12.469769  7951 net.cpp:193] loss needs backward computation.
+I0906 13:30:12.469779  7951 net.cpp:193] fc8 needs backward computation.
+I0906 13:30:12.469784  7951 net.cpp:193] drop7 needs backward computation.
+I0906 13:30:12.469791  7951 net.cpp:193] relu7 needs backward computation.
+I0906 13:30:12.469796  7951 net.cpp:193] fc7 needs backward computation.
+I0906 13:30:12.469808  7951 net.cpp:193] drop6 needs backward computation.
+I0906 13:30:12.469815  7951 net.cpp:193] relu6 needs backward computation.
+I0906 13:30:12.469820  7951 net.cpp:193] fc6 needs backward computation.
+I0906 13:30:12.469825  7951 net.cpp:193] pool5 needs backward computation.
+I0906 13:30:12.469830  7951 net.cpp:193] relu5 needs backward computation.
+I0906 13:30:12.469835  7951 net.cpp:193] conv5 needs backward computation.
+I0906 13:30:12.469882  7951 net.cpp:193] relu4 needs backward computation.
+I0906 13:30:12.469887  7951 net.cpp:193] conv4 needs backward computation.
+I0906 13:30:12.469893  7951 net.cpp:193] relu3 needs backward computation.
+I0906 13:30:12.469899  7951 net.cpp:193] conv3 needs backward computation.
+I0906 13:30:12.469907  7951 net.cpp:193] pool2 needs backward computation.
+I0906 13:30:12.469913  7951 net.cpp:193] norm2 needs backward computation.
+I0906 13:30:12.469918  7951 net.cpp:193] relu2 needs backward computation.
+I0906 13:30:12.469924  7951 net.cpp:193] conv2 needs backward computation.
+I0906 13:30:12.469930  7951 net.cpp:193] pool1 needs backward computation.
+I0906 13:30:12.469936  7951 net.cpp:193] norm1 needs backward computation.
+I0906 13:30:12.469943  7951 net.cpp:193] relu1 needs backward computation.
+I0906 13:30:12.469949  7951 net.cpp:193] conv1 needs backward computation.
+I0906 13:30:12.469955  7951 net.cpp:195] data does not need backward computation.
+I0906 13:30:12.469962  7951 net.cpp:236] This network produces output loss
+I0906 13:30:12.470002  7951 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:30:12.470018  7951 net.cpp:248] Network initialization done.
+I0906 13:30:12.470022  7951 net.cpp:249] Memory required for data: 2129727492
+I0906 13:30:12.470949  7951 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val.prototxt
+I0906 13:30:12.471081  7951 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
+I0906 13:30:12.471318  7951 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TEST
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    mirror: false
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "drop6"
+  type: "Dropout"
+  bottom: "fc6"
+  top: "fc6"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "drop7"
+  type: "Dropout"
+  bottom: "fc7"
+  top: "fc7"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "fc8"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:30:12.471688  7951 net.cpp:68] Memory required for data: 0
+I0906 13:30:12.471739  7951 layer_factory.hpp:74] Creating layer data
+I0906 13:30:12.471761  7951 net.cpp:91] Creating Layer data
+I0906 13:30:12.471772  7951 net.cpp:369] data -> data
+I0906 13:30:12.471796  7951 net.cpp:369] data -> label
+I0906 13:30:12.471810  7951 net.cpp:121] Setting up data
+I0906 13:30:12.471817  7951 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:30:12.482815  7951 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
+I0906 13:30:12.483065  7951 data_layer.cpp:53] output data size: 50,3,227,227
+I0906 13:30:12.546061  7951 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:30:12.546188  7951 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:30:12.546222  7951 net.cpp:128] Top shape: 50 3 227 227 (7729350)
+I0906 13:30:12.546231  7951 net.cpp:128] Top shape: 50 (50)
+I0906 13:30:12.546236  7951 net.cpp:134] Memory required for data: 30917600
+I0906 13:30:12.546268  7951 layer_factory.hpp:74] Creating layer label_data_1_split
+I0906 13:30:12.546334  7951 net.cpp:91] Creating Layer label_data_1_split
+I0906 13:30:12.546380  7951 net.cpp:411] label_data_1_split <- label
+I0906 13:30:12.546419  7951 net.cpp:369] label_data_1_split -> label_data_1_split_0
+I0906 13:30:12.546460  7951 net.cpp:369] label_data_1_split -> label_data_1_split_1
+I0906 13:30:12.546520  7951 net.cpp:121] Setting up label_data_1_split
+I0906 13:30:12.546551  7951 net.cpp:128] Top shape: 50 (50)
+I0906 13:30:12.546558  7951 net.cpp:128] Top shape: 50 (50)
+I0906 13:30:12.546561  7951 net.cpp:134] Memory required for data: 30918000
+I0906 13:30:12.546567  7951 layer_factory.hpp:74] Creating layer conv1
+I0906 13:30:12.546602  7951 net.cpp:91] Creating Layer conv1
+I0906 13:30:12.546608  7951 net.cpp:411] conv1 <- data
+I0906 13:30:12.546624  7951 net.cpp:369] conv1 -> conv1
+I0906 13:30:12.546638  7951 net.cpp:121] Setting up conv1
+I0906 13:30:12.551349  7951 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:30:12.551354  7951 net.cpp:134] Memory required for data: 88998000
+I0906 13:30:12.551374  7951 layer_factory.hpp:74] Creating layer relu1
+I0906 13:30:12.551388  7951 net.cpp:91] Creating Layer relu1
+I0906 13:30:12.551393  7951 net.cpp:411] relu1 <- conv1
+I0906 13:30:12.551405  7951 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:30:12.551415  7951 net.cpp:121] Setting up relu1
+I0906 13:30:12.551422  7951 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:30:12.551426  7951 net.cpp:134] Memory required for data: 147078000
+I0906 13:30:12.551431  7951 layer_factory.hpp:74] Creating layer norm1
+I0906 13:30:12.551451  7951 net.cpp:91] Creating Layer norm1
+I0906 13:30:12.551457  7951 net.cpp:411] norm1 <- conv1
+I0906 13:30:12.551470  7951 net.cpp:369] norm1 -> norm1
+I0906 13:30:12.551481  7951 net.cpp:121] Setting up norm1
+I0906 13:30:12.551499  7951 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:30:12.551504  7951 net.cpp:134] Memory required for data: 205158000
+I0906 13:30:12.551508  7951 layer_factory.hpp:74] Creating layer pool1
+I0906 13:30:12.551524  7951 net.cpp:91] Creating Layer pool1
+I0906 13:30:12.551530  7951 net.cpp:411] pool1 <- norm1
+I0906 13:30:12.551543  7951 net.cpp:369] pool1 -> pool1
+I0906 13:30:12.551553  7951 net.cpp:121] Setting up pool1
+I0906 13:30:12.551571  7951 net.cpp:128] Top shape: 50 96 27 27 (3499200)
+I0906 13:30:12.551576  7951 net.cpp:134] Memory required for data: 219154800
+I0906 13:30:12.551580  7951 layer_factory.hpp:74] Creating layer conv2
+I0906 13:30:12.551594  7951 net.cpp:91] Creating Layer conv2
+I0906 13:30:12.551600  7951 net.cpp:411] conv2 <- pool1
+I0906 13:30:12.551615  7951 net.cpp:369] conv2 -> conv2
+I0906 13:30:12.551627  7951 net.cpp:121] Setting up conv2
+I0906 13:30:12.591382  7951 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:30:12.591404  7951 net.cpp:134] Memory required for data: 256479600
+I0906 13:30:12.591442  7951 layer_factory.hpp:74] Creating layer relu2
+I0906 13:30:12.591473  7951 net.cpp:91] Creating Layer relu2
+I0906 13:30:12.591486  7951 net.cpp:411] relu2 <- conv2
+I0906 13:30:12.591511  7951 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:30:12.591526  7951 net.cpp:121] Setting up relu2
+I0906 13:30:12.591536  7951 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:30:12.591539  7951 net.cpp:134] Memory required for data: 293804400
+I0906 13:30:12.591544  7951 layer_factory.hpp:74] Creating layer norm2
+I0906 13:30:12.591572  7951 net.cpp:91] Creating Layer norm2
+I0906 13:30:12.591578  7951 net.cpp:411] norm2 <- conv2
+I0906 13:30:12.591591  7951 net.cpp:369] norm2 -> norm2
+I0906 13:30:12.591609  7951 net.cpp:121] Setting up norm2
+I0906 13:30:12.591629  7951 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:30:12.591634  7951 net.cpp:134] Memory required for data: 331129200
+I0906 13:30:12.591639  7951 layer_factory.hpp:74] Creating layer pool2
+I0906 13:30:12.591657  7951 net.cpp:91] Creating Layer pool2
+I0906 13:30:12.591663  7951 net.cpp:411] pool2 <- norm2
+I0906 13:30:12.591676  7951 net.cpp:369] pool2 -> pool2
+I0906 13:30:12.591687  7951 net.cpp:121] Setting up pool2
+I0906 13:30:12.591706  7951 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:30:12.591709  7951 net.cpp:134] Memory required for data: 339782000
+I0906 13:30:12.591714  7951 layer_factory.hpp:74] Creating layer conv3
+I0906 13:30:12.591739  7951 net.cpp:91] Creating Layer conv3
+I0906 13:30:12.591744  7951 net.cpp:411] conv3 <- pool2
+I0906 13:30:12.591802  7951 net.cpp:369] conv3 -> conv3
+I0906 13:30:12.591814  7951 net.cpp:121] Setting up conv3
+I0906 13:30:12.640625  7956 data_layer.cpp:120] Prefetch batch: 94 ms.
+I0906 13:30:12.640658  7956 data_layer.cpp:121]      Read time: 12.07 ms.
+I0906 13:30:12.640666  7956 data_layer.cpp:122] Transform time: 81.163 ms.
+I0906 13:30:12.705313  7951 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:30:12.705337  7951 net.cpp:134] Memory required for data: 352761200
+I0906 13:30:12.705377  7951 layer_factory.hpp:74] Creating layer relu3
+I0906 13:30:12.705410  7951 net.cpp:91] Creating Layer relu3
+I0906 13:30:12.705425  7951 net.cpp:411] relu3 <- conv3
+I0906 13:30:12.705451  7951 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:30:12.705466  7951 net.cpp:121] Setting up relu3
+I0906 13:30:12.705476  7951 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:30:12.705479  7951 net.cpp:134] Memory required for data: 365740400
+I0906 13:30:12.705484  7951 layer_factory.hpp:74] Creating layer conv4
+I0906 13:30:12.705512  7951 net.cpp:91] Creating Layer conv4
+I0906 13:30:12.705518  7951 net.cpp:411] conv4 <- conv3
+I0906 13:30:12.705534  7951 net.cpp:369] conv4 -> conv4
+I0906 13:30:12.705549  7951 net.cpp:121] Setting up conv4
+I0906 13:30:12.789549  7951 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:30:12.789571  7951 net.cpp:134] Memory required for data: 378719600
+I0906 13:30:12.789597  7951 layer_factory.hpp:74] Creating layer relu4
+I0906 13:30:12.789631  7951 net.cpp:91] Creating Layer relu4
+I0906 13:30:12.789646  7951 net.cpp:411] relu4 <- conv4
+I0906 13:30:12.789674  7951 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:30:12.789690  7951 net.cpp:121] Setting up relu4
+I0906 13:30:12.789698  7951 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:30:12.789701  7951 net.cpp:134] Memory required for data: 391698800
+I0906 13:30:12.789706  7951 layer_factory.hpp:74] Creating layer conv5
+I0906 13:30:12.789732  7951 net.cpp:91] Creating Layer conv5
+I0906 13:30:12.789738  7951 net.cpp:411] conv5 <- conv4
+I0906 13:30:12.789754  7951 net.cpp:369] conv5 -> conv5
+I0906 13:30:12.789770  7951 net.cpp:121] Setting up conv5
+I0906 13:30:12.846217  7951 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:30:12.846233  7951 net.cpp:134] Memory required for data: 400351600
+I0906 13:30:12.846271  7951 layer_factory.hpp:74] Creating layer relu5
+I0906 13:30:12.846298  7951 net.cpp:91] Creating Layer relu5
+I0906 13:30:12.846312  7951 net.cpp:411] relu5 <- conv5
+I0906 13:30:12.846335  7951 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:30:12.846350  7951 net.cpp:121] Setting up relu5
+I0906 13:30:12.846359  7951 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:30:12.846362  7951 net.cpp:134] Memory required for data: 409004400
+I0906 13:30:12.846367  7951 layer_factory.hpp:74] Creating layer pool5
+I0906 13:30:12.846397  7951 net.cpp:91] Creating Layer pool5
+I0906 13:30:12.846402  7951 net.cpp:411] pool5 <- conv5
+I0906 13:30:12.846417  7951 net.cpp:369] pool5 -> pool5
+I0906 13:30:12.846431  7951 net.cpp:121] Setting up pool5
+I0906 13:30:12.846451  7951 net.cpp:128] Top shape: 50 256 6 6 (460800)
+I0906 13:30:12.846454  7951 net.cpp:134] Memory required for data: 410847600
+I0906 13:30:12.846459  7951 layer_factory.hpp:74] Creating layer fc6
+I0906 13:30:12.846479  7951 net.cpp:91] Creating Layer fc6
+I0906 13:30:12.846485  7951 net.cpp:411] fc6 <- pool5
+I0906 13:30:12.846499  7951 net.cpp:369] fc6 -> fc6
+I0906 13:30:12.846513  7951 net.cpp:121] Setting up fc6
+I0906 13:30:17.661206  7951 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:30:17.661231  7951 net.cpp:134] Memory required for data: 411666800
+I0906 13:30:17.661259  7951 layer_factory.hpp:74] Creating layer relu6
+I0906 13:30:17.661293  7951 net.cpp:91] Creating Layer relu6
+I0906 13:30:17.661309  7951 net.cpp:411] relu6 <- fc6
+I0906 13:30:17.661334  7951 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:30:17.661350  7951 net.cpp:121] Setting up relu6
+I0906 13:30:17.661360  7951 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:30:17.661363  7951 net.cpp:134] Memory required for data: 412486000
+I0906 13:30:17.661412  7951 layer_factory.hpp:74] Creating layer drop6
+I0906 13:30:17.661428  7951 net.cpp:91] Creating Layer drop6
+I0906 13:30:17.661434  7951 net.cpp:411] drop6 <- fc6
+I0906 13:30:17.661447  7951 net.cpp:358] drop6 -> fc6 (in-place)
+I0906 13:30:17.661456  7951 net.cpp:121] Setting up drop6
+I0906 13:30:17.661470  7951 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:30:17.661475  7951 net.cpp:134] Memory required for data: 413305200
+I0906 13:30:17.661480  7951 layer_factory.hpp:74] Creating layer fc7
+I0906 13:30:17.661501  7951 net.cpp:91] Creating Layer fc7
+I0906 13:30:17.661507  7951 net.cpp:411] fc7 <- fc6
+I0906 13:30:17.661523  7951 net.cpp:369] fc7 -> fc7
+I0906 13:30:17.661540  7951 net.cpp:121] Setting up fc7
+I0906 13:30:19.790464  7951 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:30:19.790488  7951 net.cpp:134] Memory required for data: 414124400
+I0906 13:30:19.790514  7951 layer_factory.hpp:74] Creating layer relu7
+I0906 13:30:19.790547  7951 net.cpp:91] Creating Layer relu7
+I0906 13:30:19.790563  7951 net.cpp:411] relu7 <- fc7
+I0906 13:30:19.790591  7951 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:30:19.790607  7951 net.cpp:121] Setting up relu7
+I0906 13:30:19.790616  7951 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:30:19.790621  7951 net.cpp:134] Memory required for data: 414943600
+I0906 13:30:19.790624  7951 layer_factory.hpp:74] Creating layer drop7
+I0906 13:30:19.790639  7951 net.cpp:91] Creating Layer drop7
+I0906 13:30:19.790645  7951 net.cpp:411] drop7 <- fc7
+I0906 13:30:19.790657  7951 net.cpp:358] drop7 -> fc7 (in-place)
+I0906 13:30:19.790668  7951 net.cpp:121] Setting up drop7
+I0906 13:30:19.790683  7951 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:30:19.790688  7951 net.cpp:134] Memory required for data: 415762800
+I0906 13:30:19.790691  7951 layer_factory.hpp:74] Creating layer fc8
+I0906 13:30:19.790714  7951 net.cpp:91] Creating Layer fc8
+I0906 13:30:19.790719  7951 net.cpp:411] fc8 <- fc7
+I0906 13:30:19.790735  7951 net.cpp:369] fc8 -> fc8
+I0906 13:30:19.790760  7951 net.cpp:121] Setting up fc8
+I0906 13:30:20.310474  7951 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:30:20.310497  7951 net.cpp:134] Memory required for data: 415962800
+I0906 13:30:20.310523  7951 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
+I0906 13:30:20.310555  7951 net.cpp:91] Creating Layer fc8_fc8_0_split
+I0906 13:30:20.310570  7951 net.cpp:411] fc8_fc8_0_split <- fc8
+I0906 13:30:20.310598  7951 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
+I0906 13:30:20.310621  7951 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
+I0906 13:30:20.310633  7951 net.cpp:121] Setting up fc8_fc8_0_split
+I0906 13:30:20.310650  7951 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:30:20.310657  7951 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:30:20.310660  7951 net.cpp:134] Memory required for data: 416362800
+I0906 13:30:20.310665  7951 layer_factory.hpp:74] Creating layer accuracy
+I0906 13:30:20.310698  7951 net.cpp:91] Creating Layer accuracy
+I0906 13:30:20.310704  7951 net.cpp:411] accuracy <- fc8_fc8_0_split_0
+I0906 13:30:20.310715  7951 net.cpp:411] accuracy <- label_data_1_split_0
+I0906 13:30:20.310729  7951 net.cpp:369] accuracy -> accuracy
+I0906 13:30:20.310740  7951 net.cpp:121] Setting up accuracy
+I0906 13:30:20.310756  7951 net.cpp:128] Top shape: (1)
+I0906 13:30:20.310760  7951 net.cpp:134] Memory required for data: 416362804
+I0906 13:30:20.310765  7951 layer_factory.hpp:74] Creating layer loss
+I0906 13:30:20.310777  7951 net.cpp:91] Creating Layer loss
+I0906 13:30:20.310782  7951 net.cpp:411] loss <- fc8_fc8_0_split_1
+I0906 13:30:20.310793  7951 net.cpp:411] loss <- label_data_1_split_1
+I0906 13:30:20.310804  7951 net.cpp:369] loss -> loss
+I0906 13:30:20.310816  7951 net.cpp:121] Setting up loss
+I0906 13:30:20.310825  7951 layer_factory.hpp:74] Creating layer loss
+I0906 13:30:20.311178  7951 net.cpp:128] Top shape: (1)
+I0906 13:30:20.311183  7951 net.cpp:130]     with loss weight 1
+I0906 13:30:20.311200  7951 net.cpp:134] Memory required for data: 416362808
+I0906 13:30:20.311250  7951 net.cpp:193] loss needs backward computation.
+I0906 13:30:20.311259  7951 net.cpp:195] accuracy does not need backward computation.
+I0906 13:30:20.311265  7951 net.cpp:193] fc8_fc8_0_split needs backward computation.
+I0906 13:30:20.311271  7951 net.cpp:193] fc8 needs backward computation.
+I0906 13:30:20.311277  7951 net.cpp:193] drop7 needs backward computation.
+I0906 13:30:20.311282  7951 net.cpp:193] relu7 needs backward computation.
+I0906 13:30:20.311288  7951 net.cpp:193] fc7 needs backward computation.
+I0906 13:30:20.311295  7951 net.cpp:193] drop6 needs backward computation.
+I0906 13:30:20.311300  7951 net.cpp:193] relu6 needs backward computation.
+I0906 13:30:20.311305  7951 net.cpp:193] fc6 needs backward computation.
+I0906 13:30:20.311311  7951 net.cpp:193] pool5 needs backward computation.
+I0906 13:30:20.311317  7951 net.cpp:193] relu5 needs backward computation.
+I0906 13:30:20.311322  7951 net.cpp:193] conv5 needs backward computation.
+I0906 13:30:20.311328  7951 net.cpp:193] relu4 needs backward computation.
+I0906 13:30:20.311333  7951 net.cpp:193] conv4 needs backward computation.
+I0906 13:30:20.311339  7951 net.cpp:193] relu3 needs backward computation.
+I0906 13:30:20.311345  7951 net.cpp:193] conv3 needs backward computation.
+I0906 13:30:20.311352  7951 net.cpp:193] pool2 needs backward computation.
+I0906 13:30:20.311357  7951 net.cpp:193] norm2 needs backward computation.
+I0906 13:30:20.311363  7951 net.cpp:193] relu2 needs backward computation.
+I0906 13:30:20.311368  7951 net.cpp:193] conv2 needs backward computation.
+I0906 13:30:20.311374  7951 net.cpp:193] pool1 needs backward computation.
+I0906 13:30:20.311380  7951 net.cpp:193] norm1 needs backward computation.
+I0906 13:30:20.311386  7951 net.cpp:193] relu1 needs backward computation.
+I0906 13:30:20.311391  7951 net.cpp:193] conv1 needs backward computation.
+I0906 13:30:20.311399  7951 net.cpp:195] label_data_1_split does not need backward computation.
+I0906 13:30:20.311406  7951 net.cpp:195] data does not need backward computation.
+I0906 13:30:20.311411  7951 net.cpp:236] This network produces output accuracy
+I0906 13:30:20.311419  7951 net.cpp:236] This network produces output loss
+I0906 13:30:20.311455  7951 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:30:20.311468  7951 net.cpp:248] Network initialization done.
+I0906 13:30:20.311472  7951 net.cpp:249] Memory required for data: 416362808
+I0906 13:30:20.311663  7951 solver.cpp:53] Solver scaffolding done.
+I0906 13:30:20.311787  7951 solver.cpp:270] Solving AlexNet
+I0906 13:30:20.311791  7951 solver.cpp:271] Learning Rate Policy: step
+I0906 13:30:20.313592  7951 solver.cpp:314] Iteration 0, Testing net (#0)
+I0906 13:30:20.313630  7951 net.cpp:696] Copying source layer data
+I0906 13:30:20.313635  7951 net.cpp:696] Copying source layer conv1
+I0906 13:30:20.316704  7951 net.cpp:696] Copying source layer relu1
+I0906 13:30:20.316743  7951 net.cpp:696] Copying source layer norm1
+I0906 13:30:20.316756  7951 net.cpp:696] Copying source layer pool1
+I0906 13:30:20.316766  7951 net.cpp:696] Copying source layer conv2
+I0906 13:30:20.317158  7951 net.cpp:696] Copying source layer relu2
+I0906 13:30:20.317173  7951 net.cpp:696] Copying source layer norm2
+I0906 13:30:20.317183  7951 net.cpp:696] Copying source layer pool2
+I0906 13:30:20.317193  7951 net.cpp:696] Copying source layer conv3
+I0906 13:30:20.317970  7951 net.cpp:696] Copying source layer relu3
+I0906 13:30:20.317983  7951 net.cpp:696] Copying source layer conv4
+I0906 13:30:20.318357  7951 net.cpp:696] Copying source layer relu4
+I0906 13:30:20.318372  7951 net.cpp:696] Copying source layer conv5
+I0906 13:30:20.318827  7951 net.cpp:696] Copying source layer relu5
+I0906 13:30:20.318840  7951 net.cpp:696] Copying source layer pool5
+I0906 13:30:20.318850  7951 net.cpp:696] Copying source layer fc6
+I0906 13:30:20.336436  7951 net.cpp:696] Copying source layer relu6
+I0906 13:30:20.336460  7951 net.cpp:696] Copying source layer drop6
+I0906 13:30:20.336467  7951 net.cpp:696] Copying sou
\ No newline at end of file
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300
new file mode 100644
index 00000000..b99da3d4
--- /dev/null
+++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300
@@ -0,0 +1,1208 @@
+Log file created at: 2015/09/06 13:33:58
+Running on machine: AMD-RESEARCH
+Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
+I0906 13:33:58.858449  8300 caffe.cpp:114] Use GPU with device ID 0
+I0906 13:33:58.896994  8300 device.cpp:230] Number of platforms found:1
+I0906 13:33:58.897037  8300 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
+I0906 13:33:58.897054  8300 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
+I0906 13:33:58.897061  8300 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
+I0906 13:33:58.897068  8300 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
+I0906 13:33:58.897075  8300 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
+I0906 13:33:58.897086  8300 device.cpp:286] Number of devices found:1
+I0906 13:33:58.897092  8300 device.cpp:288] 	DeviceID:	0x163a250
+I0906 13:33:58.897126  8300 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
+I0906 13:33:58.897137  8300 device.cpp:393] 	Is it integrated GPU?:	0
+I0906 13:33:58.897145  8300 device.cpp:393] 	Max clock frequency MHz:	930
+I0906 13:33:58.897151  8300 device.cpp:393] 	Host-Device unified mem:	0
+I0906 13:33:58.897157  8300 device.cpp:393] 	ECC support:	0
+I0906 13:33:58.897164  8300 device.cpp:393] 	Endian little:	1
+I0906 13:33:58.897171  8300 device.cpp:393] 	Max compute units:	44
+I0906 13:33:58.897177  8300 device.cpp:393] 	Max work group size:	256
+I0906 13:33:58.897186  8300 device.cpp:393] 	Max work item dimensions:	3
+I0906 13:33:58.897192  8300 device.cpp:393] 	Max work item sizes:	0x100
+I0906 13:33:58.897202  8300 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
+I0906 13:33:58.897209  8300 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
+I0906 13:33:58.897215  8300 device.cpp:393] 	Max mem alloc size:	4244635648
+I0906 13:33:58.897222  8300 device.cpp:393] 	Global mem size:	16878927872
+I0906 13:33:58.897228  8300 device.cpp:393] 	Local mem size:	32768
+I0906 13:33:58.897241  8300 device.cpp:96] Picked device type : GPU 0
+I0906 13:34:01.301823  8300 device.cpp:152] Build Program
+I0906 13:34:01.302049  8300 caffe.cpp:122] Starting Optimization
+I0906 13:34:01.302139  8300 solver.cpp:40] Initializing solver from parameters: 
+test_iter: 1
+test_interval: 1000
+base_lr: 0.01
+display: 1
+max_iter: 10
+lr_policy: "step"
+gamma: 0.1
+momentum: 0.9
+weight_decay: 0.0005
+stepsize: 100000
+snapshot: 10000
+snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
+solver_mode: GPU
+net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
+I0906 13:34:01.302249  8300 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
+I0906 13:34:01.303269  8300 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
+I0906 13:34:01.303316  8300 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
+I0906 13:34:01.303493  8300 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TRAIN
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
+    batch_size: 100
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:34:01.303913  8300 net.cpp:68] Memory required for data: 0
+I0906 13:34:01.304132  8300 layer_factory.hpp:74] Creating layer data
+I0906 13:34:01.304185  8300 net.cpp:91] Creating Layer data
+I0906 13:34:01.304205  8300 net.cpp:369] data -> data
+I0906 13:34:01.304306  8300 net.cpp:369] data -> label
+I0906 13:34:01.304328  8300 net.cpp:121] Setting up data
+I0906 13:34:01.304342  8300 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:34:01.318087  8300 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
+I0906 13:34:01.318596  8300 data_layer.cpp:53] output data size: 100,3,227,227
+I0906 13:34:01.351816  8300 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:34:01.352555  8300 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:34:01.352643  8300 net.cpp:128] Top shape: 100 3 227 227 (15458700)
+I0906 13:34:01.352655  8300 net.cpp:128] Top shape: 100 (100)
+I0906 13:34:01.352660  8300 net.cpp:134] Memory required for data: 61835200
+I0906 13:34:01.352697  8300 layer_factory.hpp:74] Creating layer conv1
+I0906 13:34:01.352783  8300 net.cpp:91] Creating Layer conv1
+I0906 13:34:01.352808  8300 net.cpp:411] conv1 <- data
+I0906 13:34:01.352902  8300 net.cpp:369] conv1 -> conv1
+I0906 13:34:01.352937  8300 net.cpp:121] Setting up conv1
+I0906 13:34:01.357744  8300 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:34:01.357751  8300 net.cpp:134] Memory required for data: 177995200
+I0906 13:34:01.357791  8300 layer_factory.hpp:74] Creating layer relu1
+I0906 13:34:01.357815  8300 net.cpp:91] Creating Layer relu1
+I0906 13:34:01.357820  8300 net.cpp:411] relu1 <- conv1
+I0906 13:34:01.357833  8300 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:34:01.357843  8300 net.cpp:121] Setting up relu1
+I0906 13:34:01.357851  8300 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:34:01.357856  8300 net.cpp:134] Memory required for data: 294155200
+I0906 13:34:01.357861  8300 layer_factory.hpp:74] Creating layer norm1
+I0906 13:34:01.357890  8300 net.cpp:91] Creating Layer norm1
+I0906 13:34:01.357895  8300 net.cpp:411] norm1 <- conv1
+I0906 13:34:01.357908  8300 net.cpp:369] norm1 -> norm1
+I0906 13:34:01.357920  8300 net.cpp:121] Setting up norm1
+I0906 13:34:01.357944  8300 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:34:01.357949  8300 net.cpp:134] Memory required for data: 410315200
+I0906 13:34:01.357954  8300 layer_factory.hpp:74] Creating layer pool1
+I0906 13:34:01.357978  8300 net.cpp:91] Creating Layer pool1
+I0906 13:34:01.357985  8300 net.cpp:411] pool1 <- norm1
+I0906 13:34:01.357996  8300 net.cpp:369] pool1 -> pool1
+I0906 13:34:01.358010  8300 net.cpp:121] Setting up pool1
+I0906 13:34:01.358038  8300 net.cpp:128] Top shape: 100 96 27 27 (6998400)
+I0906 13:34:01.358042  8300 net.cpp:134] Memory required for data: 438308800
+I0906 13:34:01.358047  8300 layer_factory.hpp:74] Creating layer conv2
+I0906 13:34:01.358060  8300 net.cpp:91] Creating Layer conv2
+I0906 13:34:01.358067  8300 net.cpp:411] conv2 <- pool1
+I0906 13:34:01.358079  8300 net.cpp:369] conv2 -> conv2
+I0906 13:34:01.358091  8300 net.cpp:121] Setting up conv2
+I0906 13:34:01.397493  8300 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:34:01.397511  8300 net.cpp:134] Memory required for data: 512958400
+I0906 13:34:01.397541  8300 layer_factory.hpp:74] Creating layer relu2
+I0906 13:34:01.397567  8300 net.cpp:91] Creating Layer relu2
+I0906 13:34:01.397578  8300 net.cpp:411] relu2 <- conv2
+I0906 13:34:01.397599  8300 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:34:01.397613  8300 net.cpp:121] Setting up relu2
+I0906 13:34:01.397621  8300 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:34:01.397626  8300 net.cpp:134] Memory required for data: 587608000
+I0906 13:34:01.397631  8300 layer_factory.hpp:74] Creating layer norm2
+I0906 13:34:01.397649  8300 net.cpp:91] Creating Layer norm2
+I0906 13:34:01.397655  8300 net.cpp:411] norm2 <- conv2
+I0906 13:34:01.397667  8300 net.cpp:369] norm2 -> norm2
+I0906 13:34:01.397680  8300 net.cpp:121] Setting up norm2
+I0906 13:34:01.397699  8300 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:34:01.397704  8300 net.cpp:134] Memory required for data: 662257600
+I0906 13:34:01.397709  8300 layer_factory.hpp:74] Creating layer pool2
+I0906 13:34:01.397729  8300 net.cpp:91] Creating Layer pool2
+I0906 13:34:01.397735  8300 net.cpp:411] pool2 <- norm2
+I0906 13:34:01.397748  8300 net.cpp:369] pool2 -> pool2
+I0906 13:34:01.397758  8300 net.cpp:121] Setting up pool2
+I0906 13:34:01.397776  8300 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:34:01.397780  8300 net.cpp:134] Memory required for data: 679563200
+I0906 13:34:01.397830  8300 layer_factory.hpp:74] Creating layer conv3
+I0906 13:34:01.397851  8300 net.cpp:91] Creating Layer conv3
+I0906 13:34:01.397857  8300 net.cpp:411] conv3 <- pool2
+I0906 13:34:01.397871  8300 net.cpp:369] conv3 -> conv3
+I0906 13:34:01.397886  8300 net.cpp:121] Setting up conv3
+I0906 13:34:01.513005  8300 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:34:01.513030  8300 net.cpp:134] Memory required for data: 705521600
+I0906 13:34:01.513072  8300 layer_factory.hpp:74] Creating layer relu3
+I0906 13:34:01.513104  8300 net.cpp:91] Creating Layer relu3
+I0906 13:34:01.513120  8300 net.cpp:411] relu3 <- conv3
+I0906 13:34:01.513149  8300 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:34:01.513164  8300 net.cpp:121] Setting up relu3
+I0906 13:34:01.513173  8300 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:34:01.513177  8300 net.cpp:134] Memory required for data: 731480000
+I0906 13:34:01.513182  8300 layer_factory.hpp:74] Creating layer conv4
+I0906 13:34:01.513208  8300 net.cpp:91] Creating Layer conv4
+I0906 13:34:01.513214  8300 net.cpp:411] conv4 <- conv3
+I0906 13:34:01.513229  8300 net.cpp:369] conv4 -> conv4
+I0906 13:34:01.513244  8300 net.cpp:121] Setting up conv4
+I0906 13:34:01.539248  8304 data_layer.cpp:120] Prefetch batch: 186 ms.
+I0906 13:34:01.539295  8304 data_layer.cpp:121]      Read time: 22.695 ms.
+I0906 13:34:01.539304  8304 data_layer.cpp:122] Transform time: 161.707 ms.
+I0906 13:34:01.598980  8300 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:34:01.599004  8300 net.cpp:134] Memory required for data: 757438400
+I0906 13:34:01.599028  8300 layer_factory.hpp:74] Creating layer relu4
+I0906 13:34:01.599059  8300 net.cpp:91] Creating Layer relu4
+I0906 13:34:01.599074  8300 net.cpp:411] relu4 <- conv4
+I0906 13:34:01.599100  8300 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:34:01.599117  8300 net.cpp:121] Setting up relu4
+I0906 13:34:01.599125  8300 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:34:01.599129  8300 net.cpp:134] Memory required for data: 783396800
+I0906 13:34:01.599134  8300 layer_factory.hpp:74] Creating layer conv5
+I0906 13:34:01.599158  8300 net.cpp:91] Creating Layer conv5
+I0906 13:34:01.599164  8300 net.cpp:411] conv5 <- conv4
+I0906 13:34:01.599177  8300 net.cpp:369] conv5 -> conv5
+I0906 13:34:01.599191  8300 net.cpp:121] Setting up conv5
+I0906 13:34:01.658185  8300 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:34:01.658205  8300 net.cpp:134] Memory required for data: 800702400
+I0906 13:34:01.658242  8300 layer_factory.hpp:74] Creating layer relu5
+I0906 13:34:01.658269  8300 net.cpp:91] Creating Layer relu5
+I0906 13:34:01.658283  8300 net.cpp:411] relu5 <- conv5
+I0906 13:34:01.658308  8300 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:34:01.658321  8300 net.cpp:121] Setting up relu5
+I0906 13:34:01.658330  8300 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:34:01.658334  8300 net.cpp:134] Memory required for data: 818008000
+I0906 13:34:01.658339  8300 layer_factory.hpp:74] Creating layer pool5
+I0906 13:34:01.658357  8300 net.cpp:91] Creating Layer pool5
+I0906 13:34:01.658362  8300 net.cpp:411] pool5 <- conv5
+I0906 13:34:01.658375  8300 net.cpp:369] pool5 -> pool5
+I0906 13:34:01.658390  8300 net.cpp:121] Setting up pool5
+I0906 13:34:01.658407  8300 net.cpp:128] Top shape: 100 256 6 6 (921600)
+I0906 13:34:01.658412  8300 net.cpp:134] Memory required for data: 821694400
+I0906 13:34:01.658416  8300 layer_factory.hpp:74] Creating layer fc6
+I0906 13:34:01.658447  8300 net.cpp:91] Creating Layer fc6
+I0906 13:34:01.658453  8300 net.cpp:411] fc6 <- pool5
+I0906 13:34:01.658466  8300 net.cpp:369] fc6 -> fc6
+I0906 13:34:01.658480  8300 net.cpp:121] Setting up fc6
+I0906 13:34:06.571331  8300 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:34:06.571354  8300 net.cpp:134] Memory required for data: 823332800
+I0906 13:34:06.571382  8300 layer_factory.hpp:74] Creating layer relu6
+I0906 13:34:06.571415  8300 net.cpp:91] Creating Layer relu6
+I0906 13:34:06.571430  8300 net.cpp:411] relu6 <- fc6
+I0906 13:34:06.571456  8300 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:34:06.571521  8300 net.cpp:121] Setting up relu6
+I0906 13:34:06.571529  8300 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:34:06.571533  8300 net.cpp:134] Memory required for data: 824971200
+I0906 13:34:06.571538  8300 layer_factory.hpp:74] Creating layer fc7
+I0906 13:34:06.571558  8300 net.cpp:91] Creating Layer fc7
+I0906 13:34:06.571563  8300 net.cpp:411] fc7 <- fc6
+I0906 13:34:06.571578  8300 net.cpp:369] fc7 -> fc7
+I0906 13:34:06.571593  8300 net.cpp:121] Setting up fc7
+I0906 13:34:08.751106  8300 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:34:08.751129  8300 net.cpp:134] Memory required for data: 826609600
+I0906 13:34:08.751155  8300 layer_factory.hpp:74] Creating layer relu7
+I0906 13:34:08.751186  8300 net.cpp:91] Creating Layer relu7
+I0906 13:34:08.751202  8300 net.cpp:411] relu7 <- fc7
+I0906 13:34:08.751229  8300 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:34:08.751243  8300 net.cpp:121] Setting up relu7
+I0906 13:34:08.751251  8300 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:34:08.751255  8300 net.cpp:134] Memory required for data: 828248000
+I0906 13:34:08.751260  8300 layer_factory.hpp:74] Creating layer fc8
+I0906 13:34:08.751281  8300 net.cpp:91] Creating Layer fc8
+I0906 13:34:08.751286  8300 net.cpp:411] fc8 <- fc7
+I0906 13:34:08.751301  8300 net.cpp:369] fc8 -> fc8
+I0906 13:34:08.751315  8300 net.cpp:121] Setting up fc8
+I0906 13:34:09.287158  8300 net.cpp:128] Top shape: 100 1000 (100000)
+I0906 13:34:09.287181  8300 net.cpp:134] Memory required for data: 828648000
+I0906 13:34:09.287209  8300 layer_factory.hpp:74] Creating layer loss
+I0906 13:34:09.287257  8300 net.cpp:91] Creating Layer loss
+I0906 13:34:09.287272  8300 net.cpp:411] loss <- fc8
+I0906 13:34:09.287295  8300 net.cpp:411] loss <- label
+I0906 13:34:09.287313  8300 net.cpp:369] loss -> loss
+I0906 13:34:09.287333  8300 net.cpp:121] Setting up loss
+I0906 13:34:09.287349  8300 layer_factory.hpp:74] Creating layer loss
+I0906 13:34:09.287860  8300 net.cpp:128] Top shape: (1)
+I0906 13:34:09.287865  8300 net.cpp:130]     with loss weight 1
+I0906 13:34:09.287881  8300 net.cpp:134] Memory required for data: 828648004
+I0906 13:34:09.287890  8300 net.cpp:193] loss needs backward computation.
+I0906 13:34:09.287899  8300 net.cpp:193] fc8 needs backward computation.
+I0906 13:34:09.287904  8300 net.cpp:193] relu7 needs backward computation.
+I0906 13:34:09.287910  8300 net.cpp:193] fc7 needs backward computation.
+I0906 13:34:09.287916  8300 net.cpp:193] relu6 needs backward computation.
+I0906 13:34:09.287921  8300 net.cpp:193] fc6 needs backward computation.
+I0906 13:34:09.287935  8300 net.cpp:193] pool5 needs backward computation.
+I0906 13:34:09.287940  8300 net.cpp:193] relu5 needs backward computation.
+I0906 13:34:09.287946  8300 net.cpp:193] conv5 needs backward computation.
+I0906 13:34:09.287952  8300 net.cpp:193] relu4 needs backward computation.
+I0906 13:34:09.287958  8300 net.cpp:193] conv4 needs backward computation.
+I0906 13:34:09.287964  8300 net.cpp:193] relu3 needs backward computation.
+I0906 13:34:09.287969  8300 net.cpp:193] conv3 needs backward computation.
+I0906 13:34:09.287977  8300 net.cpp:193] pool2 needs backward computation.
+I0906 13:34:09.287983  8300 net.cpp:193] norm2 needs backward computation.
+I0906 13:34:09.287989  8300 net.cpp:193] relu2 needs backward computation.
+I0906 13:34:09.287996  8300 net.cpp:193] conv2 needs backward computation.
+I0906 13:34:09.288002  8300 net.cpp:193] pool1 needs backward computation.
+I0906 13:34:09.288007  8300 net.cpp:193] norm1 needs backward computation.
+I0906 13:34:09.288014  8300 net.cpp:193] relu1 needs backward computation.
+I0906 13:34:09.288019  8300 net.cpp:193] conv1 needs backward computation.
+I0906 13:34:09.288028  8300 net.cpp:195] data does not need backward computation.
+I0906 13:34:09.288034  8300 net.cpp:236] This network produces output loss
+I0906 13:34:09.288067  8300 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:34:09.288084  8300 net.cpp:248] Network initialization done.
+I0906 13:34:09.288087  8300 net.cpp:249] Memory required for data: 828648004
+I0906 13:34:09.289022  8300 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
+I0906 13:34:09.289130  8300 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
+I0906 13:34:09.289348  8300 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TEST
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    mirror: false
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "fc8"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:34:09.289656  8300 net.cpp:68] Memory required for data: 0
+I0906 13:34:09.289702  8300 layer_factory.hpp:74] Creating layer data
+I0906 13:34:09.289721  8300 net.cpp:91] Creating Layer data
+I0906 13:34:09.289731  8300 net.cpp:369] data -> data
+I0906 13:34:09.289752  8300 net.cpp:369] data -> label
+I0906 13:34:09.289764  8300 net.cpp:121] Setting up data
+I0906 13:34:09.289772  8300 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:34:09.298058  8300 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
+I0906 13:34:09.298318  8300 data_layer.cpp:53] output data size: 50,3,227,227
+I0906 13:34:09.314699  8300 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:34:09.314806  8300 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:34:09.314834  8300 net.cpp:128] Top shape: 50 3 227 227 (7729350)
+I0906 13:34:09.314843  8300 net.cpp:128] Top shape: 50 (50)
+I0906 13:34:09.314848  8300 net.cpp:134] Memory required for data: 30917600
+I0906 13:34:09.314882  8300 layer_factory.hpp:74] Creating layer label_data_1_split
+I0906 13:34:09.314973  8300 net.cpp:91] Creating Layer label_data_1_split
+I0906 13:34:09.314997  8300 net.cpp:411] label_data_1_split <- label
+I0906 13:34:09.315035  8300 net.cpp:369] label_data_1_split -> label_data_1_split_0
+I0906 13:34:09.315073  8300 net.cpp:369] label_data_1_split -> label_data_1_split_1
+I0906 13:34:09.315085  8300 net.cpp:121] Setting up label_data_1_split
+I0906 13:34:09.315116  8300 net.cpp:128] Top shape: 50 (50)
+I0906 13:34:09.315124  8300 net.cpp:128] Top shape: 50 (50)
+I0906 13:34:09.315127  8300 net.cpp:134] Memory required for data: 30918000
+I0906 13:34:09.315131  8300 layer_factory.hpp:74] Creating layer conv1
+I0906 13:34:09.315165  8300 net.cpp:91] Creating Layer conv1
+I0906 13:34:09.315171  8300 net.cpp:411] conv1 <- data
+I0906 13:34:09.315183  8300 net.cpp:369] conv1 -> conv1
+I0906 13:34:09.315198  8300 net.cpp:121] Setting up conv1
+I0906 13:34:09.319859  8300 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:34:09.319864  8300 net.cpp:134] Memory required for data: 88998000
+I0906 13:34:09.319883  8300 layer_factory.hpp:74] Creating layer relu1
+I0906 13:34:09.319895  8300 net.cpp:91] Creating Layer relu1
+I0906 13:34:09.319901  8300 net.cpp:411] relu1 <- conv1
+I0906 13:34:09.319913  8300 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:34:09.319926  8300 net.cpp:121] Setting up relu1
+I0906 13:34:09.319933  8300 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:34:09.319937  8300 net.cpp:134] Memory required for data: 147078000
+I0906 13:34:09.319942  8300 layer_factory.hpp:74] Creating layer norm1
+I0906 13:34:09.319962  8300 net.cpp:91] Creating Layer norm1
+I0906 13:34:09.319968  8300 net.cpp:411] norm1 <- conv1
+I0906 13:34:09.319980  8300 net.cpp:369] norm1 -> norm1
+I0906 13:34:09.319991  8300 net.cpp:121] Setting up norm1
+I0906 13:34:09.320009  8300 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:34:09.320053  8300 net.cpp:134] Memory required for data: 205158000
+I0906 13:34:09.320060  8300 layer_factory.hpp:74] Creating layer pool1
+I0906 13:34:09.320075  8300 net.cpp:91] Creating Layer pool1
+I0906 13:34:09.320081  8300 net.cpp:411] pool1 <- norm1
+I0906 13:34:09.320093  8300 net.cpp:369] pool1 -> pool1
+I0906 13:34:09.320103  8300 net.cpp:121] Setting up pool1
+I0906 13:34:09.320122  8300 net.cpp:128] Top shape: 50 96 27 27 (3499200)
+I0906 13:34:09.320125  8300 net.cpp:134] Memory required for data: 219154800
+I0906 13:34:09.320130  8300 layer_factory.hpp:74] Creating layer conv2
+I0906 13:34:09.320143  8300 net.cpp:91] Creating Layer conv2
+I0906 13:34:09.320149  8300 net.cpp:411] conv2 <- pool1
+I0906 13:34:09.320163  8300 net.cpp:369] conv2 -> conv2
+I0906 13:34:09.320174  8300 net.cpp:121] Setting up conv2
+I0906 13:34:09.359275  8300 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:34:09.359290  8300 net.cpp:134] Memory required for data: 256479600
+I0906 13:34:09.359316  8300 layer_factory.hpp:74] Creating layer relu2
+I0906 13:34:09.359336  8300 net.cpp:91] Creating Layer relu2
+I0906 13:34:09.359346  8300 net.cpp:411] relu2 <- conv2
+I0906 13:34:09.359365  8300 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:34:09.359395  8300 net.cpp:121] Setting up relu2
+I0906 13:34:09.359403  8300 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:34:09.359407  8300 net.cpp:134] Memory required for data: 293804400
+I0906 13:34:09.359412  8300 layer_factory.hpp:74] Creating layer norm2
+I0906 13:34:09.359433  8300 net.cpp:91] Creating Layer norm2
+I0906 13:34:09.359438  8300 net.cpp:411] norm2 <- conv2
+I0906 13:34:09.359452  8300 net.cpp:369] norm2 -> norm2
+I0906 13:34:09.359467  8300 net.cpp:121] Setting up norm2
+I0906 13:34:09.359486  8300 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:34:09.359490  8300 net.cpp:134] Memory required for data: 331129200
+I0906 13:34:09.359495  8300 layer_factory.hpp:74] Creating layer pool2
+I0906 13:34:09.359508  8300 net.cpp:91] Creating Layer pool2
+I0906 13:34:09.359514  8300 net.cpp:411] pool2 <- norm2
+I0906 13:34:09.359526  8300 net.cpp:369] pool2 -> pool2
+I0906 13:34:09.359537  8300 net.cpp:121] Setting up pool2
+I0906 13:34:09.359555  8300 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:34:09.359558  8300 net.cpp:134] Memory required for data: 339782000
+I0906 13:34:09.359563  8300 layer_factory.hpp:74] Creating layer conv3
+I0906 13:34:09.359581  8300 net.cpp:91] Creating Layer conv3
+I0906 13:34:09.359587  8300 net.cpp:411] conv3 <- pool2
+I0906 13:34:09.359601  8300 net.cpp:369] conv3 -> conv3
+I0906 13:34:09.359613  8300 net.cpp:121] Setting up conv3
+I0906 13:34:09.410833  8305 data_layer.cpp:120] Prefetch batch: 95 ms.
+I0906 13:34:09.410863  8305 data_layer.cpp:121]      Read time: 11.984 ms.
+I0906 13:34:09.410871  8305 data_layer.cpp:122] Transform time: 82.885 ms.
+I0906 13:34:09.474556  8300 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:34:09.474578  8300 net.cpp:134] Memory required for data: 352761200
+I0906 13:34:09.474618  8300 layer_factory.hpp:74] Creating layer relu3
+I0906 13:34:09.474648  8300 net.cpp:91] Creating Layer relu3
+I0906 13:34:09.474663  8300 net.cpp:411] relu3 <- conv3
+I0906 13:34:09.474689  8300 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:34:09.474704  8300 net.cpp:121] Setting up relu3
+I0906 13:34:09.474714  8300 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:34:09.474717  8300 net.cpp:134] Memory required for data: 365740400
+I0906 13:34:09.474721  8300 layer_factory.hpp:74] Creating layer conv4
+I0906 13:34:09.474745  8300 net.cpp:91] Creating Layer conv4
+I0906 13:34:09.474751  8300 net.cpp:411] conv4 <- conv3
+I0906 13:34:09.474766  8300 net.cpp:369] conv4 -> conv4
+I0906 13:34:09.474781  8300 net.cpp:121] Setting up conv4
+I0906 13:34:09.562909  8300 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:34:09.562930  8300 net.cpp:134] Memory required for data: 378719600
+I0906 13:34:09.562957  8300 layer_factory.hpp:74] Creating layer relu4
+I0906 13:34:09.562988  8300 net.cpp:91] Creating Layer relu4
+I0906 13:34:09.563051  8300 net.cpp:411] relu4 <- conv4
+I0906 13:34:09.563086  8300 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:34:09.563102  8300 net.cpp:121] Setting up relu4
+I0906 13:34:09.563112  8300 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:34:09.563117  8300 net.cpp:134] Memory required for data: 391698800
+I0906 13:34:09.563122  8300 layer_factory.hpp:74] Creating layer conv5
+I0906 13:34:09.563146  8300 net.cpp:91] Creating Layer conv5
+I0906 13:34:09.563153  8300 net.cpp:411] conv5 <- conv4
+I0906 13:34:09.563168  8300 net.cpp:369] conv5 -> conv5
+I0906 13:34:09.563182  8300 net.cpp:121] Setting up conv5
+I0906 13:34:09.619202  8300 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:34:09.619220  8300 net.cpp:134] Memory required for data: 400351600
+I0906 13:34:09.619256  8300 layer_factory.hpp:74] Creating layer relu5
+I0906 13:34:09.619284  8300 net.cpp:91] Creating Layer relu5
+I0906 13:34:09.619298  8300 net.cpp:411] relu5 <- conv5
+I0906 13:34:09.619321  8300 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:34:09.619336  8300 net.cpp:121] Setting up relu5
+I0906 13:34:09.619344  8300 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:34:09.619349  8300 net.cpp:134] Memory required for data: 409004400
+I0906 13:34:09.619354  8300 layer_factory.hpp:74] Creating layer pool5
+I0906 13:34:09.619380  8300 net.cpp:91] Creating Layer pool5
+I0906 13:34:09.619386  8300 net.cpp:411] pool5 <- conv5
+I0906 13:34:09.619398  8300 net.cpp:369] pool5 -> pool5
+I0906 13:34:09.619411  8300 net.cpp:121] Setting up pool5
+I0906 13:34:09.619431  8300 net.cpp:128] Top shape: 50 256 6 6 (460800)
+I0906 13:34:09.619434  8300 net.cpp:134] Memory required for data: 410847600
+I0906 13:34:09.619439  8300 layer_factory.hpp:74] Creating layer fc6
+I0906 13:34:09.619457  8300 net.cpp:91] Creating Layer fc6
+I0906 13:34:09.619463  8300 net.cpp:411] fc6 <- pool5
+I0906 13:34:09.619477  8300 net.cpp:369] fc6 -> fc6
+I0906 13:34:09.619488  8300 net.cpp:121] Setting up fc6
+I0906 13:34:15.320122  8300 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:34:15.320147  8300 net.cpp:134] Memory required for data: 411666800
+I0906 13:34:15.320174  8300 layer_factory.hpp:74] Creating layer relu6
+I0906 13:34:15.320206  8300 net.cpp:91] Creating Layer relu6
+I0906 13:34:15.320222  8300 net.cpp:411] relu6 <- fc6
+I0906 13:34:15.320248  8300 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:34:15.320263  8300 net.cpp:121] Setting up relu6
+I0906 13:34:15.320272  8300 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:34:15.320276  8300 net.cpp:134] Memory required for data: 412486000
+I0906 13:34:15.320281  8300 layer_factory.hpp:74] Creating layer fc7
+I0906 13:34:15.320302  8300 net.cpp:91] Creating Layer fc7
+I0906 13:34:15.320308  8300 net.cpp:411] fc7 <- fc6
+I0906 13:34:15.320322  8300 net.cpp:369] fc7 -> fc7
+I0906 13:34:15.320338  8300 net.cpp:121] Setting up fc7
+I0906 13:34:17.700968  8300 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:34:17.700994  8300 net.cpp:134] Memory required for data: 413305200
+I0906 13:34:17.701020  8300 layer_factory.hpp:74] Creating layer relu7
+I0906 13:34:17.701052  8300 net.cpp:91] Creating Layer relu7
+I0906 13:34:17.701067  8300 net.cpp:411] relu7 <- fc7
+I0906 13:34:17.701093  8300 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:34:17.701109  8300 net.cpp:121] Setting up relu7
+I0906 13:34:17.701117  8300 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:34:17.701122  8300 net.cpp:134] Memory required for data: 414124400
+I0906 13:34:17.701125  8300 layer_factory.hpp:74] Creating layer fc8
+I0906 13:34:17.701146  8300 net.cpp:91] Creating Layer fc8
+I0906 13:34:17.701153  8300 net.cpp:411] fc8 <- fc7
+I0906 13:34:17.701166  8300 net.cpp:369] fc8 -> fc8
+I0906 13:34:17.701191  8300 net.cpp:121] Setting up fc8
+I0906 13:34:18.224659  8300 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:34:18.224681  8300 net.cpp:134] Memory required for data: 414324400
+I0906 13:34:18.224707  8300 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
+I0906 13:34:18.224737  8300 net.cpp:91] Creating Layer fc8_fc8_0_split
+I0906 13:34:18.224798  8300 net.cpp:411] fc8_fc8_0_split <- fc8
+I0906 13:34:18.224828  8300 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
+I0906 13:34:18.224848  8300 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
+I0906 13:34:18.224860  8300 net.cpp:121] Setting up fc8_fc8_0_split
+I0906 13:34:18.224876  8300 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:34:18.224882  8300 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:34:18.224886  8300 net.cpp:134] Memory required for data: 414724400
+I0906 13:34:18.224891  8300 layer_factory.hpp:74] Creating layer accuracy
+I0906 13:34:18.224922  8300 net.cpp:91] Creating Layer accuracy
+I0906 13:34:18.224927  8300 net.cpp:411] accuracy <- fc8_fc8_0_split_0
+I0906 13:34:18.224938  8300 net.cpp:411] accuracy <- label_data_1_split_0
+I0906 13:34:18.224949  8300 net.cpp:369] accuracy -> accuracy
+I0906 13:34:18.224961  8300 net.cpp:121] Setting up accuracy
+I0906 13:34:18.224977  8300 net.cpp:128] Top shape: (1)
+I0906 13:34:18.224980  8300 net.cpp:134] Memory required for data: 414724404
+I0906 13:34:18.224985  8300 layer_factory.hpp:74] Creating layer loss
+I0906 13:34:18.224997  8300 net.cpp:91] Creating Layer loss
+I0906 13:34:18.225003  8300 net.cpp:411] loss <- fc8_fc8_0_split_1
+I0906 13:34:18.225013  8300 net.cpp:411] loss <- label_data_1_split_1
+I0906 13:34:18.225023  8300 net.cpp:369] loss -> loss
+I0906 13:34:18.225033  8300 net.cpp:121] Setting up loss
+I0906 13:34:18.225044  8300 layer_factory.hpp:74] Creating layer loss
+I0906 13:34:18.225343  8300 net.cpp:128] Top shape: (1)
+I0906 13:34:18.225348  8300 net.cpp:130]     with loss weight 1
+I0906 13:34:18.225364  8300 net.cpp:134] Memory required for data: 414724408
+I0906 13:34:18.225371  8300 net.cpp:193] loss needs backward computation.
+I0906 13:34:18.225378  8300 net.cpp:195] accuracy does not need backward computation.
+I0906 13:34:18.225386  8300 net.cpp:193] fc8_fc8_0_split needs backward computation.
+I0906 13:34:18.225391  8300 net.cpp:193] fc8 needs backward computation.
+I0906 13:34:18.225397  8300 net.cpp:193] relu7 needs backward computation.
+I0906 13:34:18.225404  8300 net.cpp:193] fc7 needs backward computation.
+I0906 13:34:18.225409  8300 net.cpp:193] relu6 needs backward computation.
+I0906 13:34:18.225414  8300 net.cpp:193] fc6 needs backward computation.
+I0906 13:34:18.225420  8300 net.cpp:193] pool5 needs backward computation.
+I0906 13:34:18.225426  8300 net.cpp:193] relu5 needs backward computation.
+I0906 13:34:18.225431  8300 net.cpp:193] conv5 needs backward computation.
+I0906 13:34:18.225438  8300 net.cpp:193] relu4 needs backward computation.
+I0906 13:34:18.225443  8300 net.cpp:193] conv4 needs backward computation.
+I0906 13:34:18.225450  8300 net.cpp:193] relu3 needs backward computation.
+I0906 13:34:18.225455  8300 net.cpp:193] conv3 needs backward computation.
+I0906 13:34:18.225461  8300 net.cpp:193] pool2 needs backward computation.
+I0906 13:34:18.225466  8300 net.cpp:193] norm2 needs backward computation.
+I0906 13:34:18.225472  8300 net.cpp:193] relu2 needs backward computation.
+I0906 13:34:18.225477  8300 net.cpp:193] conv2 needs backward computation.
+I0906 13:34:18.225484  8300 net.cpp:193] pool1 needs backward computation.
+I0906 13:34:18.225491  8300 net.cpp:193] norm1 needs backward computation.
+I0906 13:34:18.225496  8300 net.cpp:193] relu1 needs backward computation.
+I0906 13:34:18.225502  8300 net.cpp:193] conv1 needs backward computation.
+I0906 13:34:18.225508  8300 net.cpp:195] label_data_1_split does not need backward computation.
+I0906 13:34:18.225515  8300 net.cpp:195] data does not need backward computation.
+I0906 13:34:18.225520  8300 net.cpp:236] This network produces output accuracy
+I0906 13:34:18.225527  8300 net.cpp:236] This network produces output loss
+I0906 13:34:18.225561  8300 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:34:18.225574  8300 net.cpp:248] Network initialization done.
+I0906 13:34:18.225579  8300 net.cpp:249] Memory required for data: 414724408
+I0906 13:34:18.225764  8300 solver.cpp:53] Solver scaffolding done.
+I0906 13:34:18.225879  8300 solver.cpp:270] Solving AlexNet
+I0906 13:34:18.225898  8300 solver.cpp:271] Learning Rate Policy: step
+I0906 13:34:18.227551  8300 solver.cpp:314] Iteration 0, Testing net (#0)
+I0906 13:34:18.227571  8300 net.cpp:696] Copying source layer data
+I0906 13:34:18.227577  8300 net.cpp:696] Copying source layer conv1
+I0906 13:34:18.230358  8300 net.cpp:696] Copying source layer relu1
+I0906 13:34:18.230398  8300 net.cpp:696] Copying source layer norm1
+I0906 13:34:18.230409  8300 net.cpp:696] Copying source layer pool1
+I0906 13:34:18.230419  8300 net.cpp:696] Copying source layer conv2
+I0906 13:34:18.230605  8300 net.cpp:696] Copying source layer relu2
+I0906 13:34:18.230624  8300 net.cpp:696] Copying source layer norm2
+I0906 13:34:18.230634  8300 net.cpp:696] Copying source layer pool2
+I0906 13:34:18.230644  8300 net.cpp:696] Copying source layer conv3
+I0906 13:34:18.231482  8300 net.cpp:696] Copying source layer relu3
+I0906 13:34:18.231510  8300 net.cpp:696] Copying source layer conv4
+I0906 13:34:18.232178  8300 net.cpp:696] Copying source layer relu4
+I0906 13:34:18.232195  8300 net.cpp:696] Copying source layer conv5
+I0906 13:34:18.232681  8300 net.cpp:696] Copying source layer relu5
+I0906 13:34:18.232697  8300 net.cpp:696] Copying source layer pool5
+I0906 13:34:18.232708  8300 net.cpp:696] Copying source layer fc6
+I0906 13:34:18.250728  8300 net.cpp:696] Copying source layer relu6
+I0906 13:34:18.250753  8300 net.cpp:696] Copying source layer fc7
+I0906 13:34:18.257216  8300 net.cpp:696] Copying source layer relu7
+I0906 13:34:18.257241  8300 net.cpp:696] Copying source layer fc8
+I0906 13:34:18.258977  8300 net.cpp:696] Copying source layer loss
+I0906 13:34:18.259091  8300 base_data_layer.cpp:89] Thread joined
+I0906 13:34:18.263509  8300 base_data_layer.cpp:93] Prefetch copied
+I0906 13:34:18.263875  8300 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:34:18.362475  8306 data_layer.cpp:120] Prefetch batch: 98 ms.
+I0906 13:34:18.362507  8306 data_layer.cpp:121]      Read time: 12.694 ms.
+I0906 13:34:18.362515  8306 data_layer.cpp:122] Transform time: 84.611 ms.
+I0906 13:34:21.291707  8300 solver.cpp:363]     Test net output #0: accuracy = 0
+I0906 13:34:21.291733  8300 solver.cpp:363]     Test net output #1: loss = 6.91228 (* 1 = 6.91228 loss)
+I0906 13:34:21.291775  8300 base_data_layer.cpp:89] Thread joined
+I0906 13:34:21.300678  8300 base_data_layer.cpp:93] Prefetch copied
+I0906 13:34:21.301050  8300 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:34:21.491194  8310 data_layer.cpp:120] Prefetch batch: 189 ms.
+I0906 13:34:21.491225  8310 data_layer.cpp:121]      Read time: 24.533 ms.
+I0906 13:34:21.491231  8310 data_layer.cpp:122] Transform time: 163.65 ms.
+I0906 13:34:28.088075  8300 solver.cpp:234] Iteration 0, loss = 0
+I0906 13:34:28.088134  8300 solver.cpp:249]     Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss)
+I0906 13:34:28.088184  8300 solver.cpp:506] Iteration 0, lr = 0.01
+I0906 13:34:28.203598  8300 base_data_layer.cpp:89] Thread joined
+I0906 13:34:28.212023  8300 base_data_layer.cpp:93] Prefetch copied
+I0906 13:34:28.212162  8300 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:34:28.397155  8312 data_layer.cpp:120] Prefetch batch: 184 ms.
+I0906 13:34:28.397193  8312 data_layer.cpp:121]      Read time: 23.16 ms.
+I0906 13:34:28.397200  8312 data_layer.cpp:122] Transform time: 159.902 ms.
+I0906 13:34:30.978493  8300 solver.cpp:234] Iteration 1, loss = 0
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316
new file mode 100644
index 00000000..93afd4cf
--- /dev/null
+++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316
@@ -0,0 +1,1208 @@
+Log file created at: 2015/09/06 13:34:37
+Running on machine: AMD-RESEARCH
+Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
+I0906 13:34:37.585557  8316 caffe.cpp:114] Use GPU with device ID 0
+I0906 13:34:37.621670  8316 device.cpp:230] Number of platforms found:1
+I0906 13:34:37.621708  8316 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
+I0906 13:34:37.621721  8316 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
+I0906 13:34:37.621724  8316 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
+I0906 13:34:37.621728  8316 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
+I0906 13:34:37.621732  8316 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
+I0906 13:34:37.621739  8316 device.cpp:286] Number of devices found:1
+I0906 13:34:37.621743  8316 device.cpp:288] 	DeviceID:	0x22ed250
+I0906 13:34:37.621760  8316 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
+I0906 13:34:37.621767  8316 device.cpp:393] 	Is it integrated GPU?:	0
+I0906 13:34:37.621772  8316 device.cpp:393] 	Max clock frequency MHz:	930
+I0906 13:34:37.621775  8316 device.cpp:393] 	Host-Device unified mem:	0
+I0906 13:34:37.621779  8316 device.cpp:393] 	ECC support:	0
+I0906 13:34:37.621783  8316 device.cpp:393] 	Endian little:	1
+I0906 13:34:37.621788  8316 device.cpp:393] 	Max compute units:	44
+I0906 13:34:37.621791  8316 device.cpp:393] 	Max work group size:	256
+I0906 13:34:37.621796  8316 device.cpp:393] 	Max work item dimensions:	3
+I0906 13:34:37.621801  8316 device.cpp:393] 	Max work item sizes:	0x100
+I0906 13:34:37.621806  8316 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
+I0906 13:34:37.621811  8316 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
+I0906 13:34:37.621815  8316 device.cpp:393] 	Max mem alloc size:	4244635648
+I0906 13:34:37.621819  8316 device.cpp:393] 	Global mem size:	16878927872
+I0906 13:34:37.621822  8316 device.cpp:393] 	Local mem size:	32768
+I0906 13:34:37.621830  8316 device.cpp:96] Picked device type : GPU 0
+I0906 13:34:40.036291  8316 device.cpp:152] Build Program
+I0906 13:34:40.036520  8316 caffe.cpp:122] Starting Optimization
+I0906 13:34:40.036612  8316 solver.cpp:40] Initializing solver from parameters: 
+test_iter: 1
+test_interval: 1000
+base_lr: 0.01
+display: 1
+max_iter: 10
+lr_policy: "step"
+gamma: 0.1
+momentum: 0.9
+weight_decay: 0.0005
+stepsize: 100000
+snapshot: 10000
+snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
+solver_mode: GPU
+net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
+I0906 13:34:40.036731  8316 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
+I0906 13:34:40.037874  8316 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
+I0906 13:34:40.037925  8316 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
+I0906 13:34:40.038099  8316 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TRAIN
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
+    batch_size: 100
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:34:40.038537  8316 net.cpp:68] Memory required for data: 0
+I0906 13:34:40.038749  8316 layer_factory.hpp:74] Creating layer data
+I0906 13:34:40.038802  8316 net.cpp:91] Creating Layer data
+I0906 13:34:40.038825  8316 net.cpp:369] data -> data
+I0906 13:34:40.038928  8316 net.cpp:369] data -> label
+I0906 13:34:40.038950  8316 net.cpp:121] Setting up data
+I0906 13:34:40.038962  8316 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:34:40.048738  8316 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
+I0906 13:34:40.049080  8316 data_layer.cpp:53] output data size: 100,3,227,227
+I0906 13:34:40.081225  8316 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:34:40.081426  8316 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:34:40.081490  8316 net.cpp:128] Top shape: 100 3 227 227 (15458700)
+I0906 13:34:40.081500  8316 net.cpp:128] Top shape: 100 (100)
+I0906 13:34:40.081504  8316 net.cpp:134] Memory required for data: 61835200
+I0906 13:34:40.081537  8316 layer_factory.hpp:74] Creating layer conv1
+I0906 13:34:40.081619  8316 net.cpp:91] Creating Layer conv1
+I0906 13:34:40.081641  8316 net.cpp:411] conv1 <- data
+I0906 13:34:40.081694  8316 net.cpp:369] conv1 -> conv1
+I0906 13:34:40.081758  8316 net.cpp:121] Setting up conv1
+I0906 13:34:40.088135  8316 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:34:40.088160  8316 net.cpp:134] Memory required for data: 177995200
+I0906 13:34:40.088239  8316 layer_factory.hpp:74] Creating layer relu1
+I0906 13:34:40.088297  8316 net.cpp:91] Creating Layer relu1
+I0906 13:34:40.088315  8316 net.cpp:411] relu1 <- conv1
+I0906 13:34:40.088351  8316 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:34:40.088372  8316 net.cpp:121] Setting up relu1
+I0906 13:34:40.088385  8316 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:34:40.088390  8316 net.cpp:134] Memory required for data: 294155200
+I0906 13:34:40.088397  8316 layer_factory.hpp:74] Creating layer norm1
+I0906 13:34:40.088435  8316 net.cpp:91] Creating Layer norm1
+I0906 13:34:40.088444  8316 net.cpp:411] norm1 <- conv1
+I0906 13:34:40.088466  8316 net.cpp:369] norm1 -> norm1
+I0906 13:34:40.088486  8316 net.cpp:121] Setting up norm1
+I0906 13:34:40.088531  8316 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:34:40.088537  8316 net.cpp:134] Memory required for data: 410315200
+I0906 13:34:40.088543  8316 layer_factory.hpp:74] Creating layer pool1
+I0906 13:34:40.088580  8316 net.cpp:91] Creating Layer pool1
+I0906 13:34:40.088590  8316 net.cpp:411] pool1 <- norm1
+I0906 13:34:40.088613  8316 net.cpp:369] pool1 -> pool1
+I0906 13:34:40.088637  8316 net.cpp:121] Setting up pool1
+I0906 13:34:40.088686  8316 net.cpp:128] Top shape: 100 96 27 27 (6998400)
+I0906 13:34:40.088691  8316 net.cpp:134] Memory required for data: 438308800
+I0906 13:34:40.088701  8316 layer_factory.hpp:74] Creating layer conv2
+I0906 13:34:40.088739  8316 net.cpp:91] Creating Layer conv2
+I0906 13:34:40.088750  8316 net.cpp:411] conv2 <- pool1
+I0906 13:34:40.088783  8316 net.cpp:369] conv2 -> conv2
+I0906 13:34:40.088804  8316 net.cpp:121] Setting up conv2
+I0906 13:34:40.129534  8316 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:34:40.129550  8316 net.cpp:134] Memory required for data: 512958400
+I0906 13:34:40.129585  8316 layer_factory.hpp:74] Creating layer relu2
+I0906 13:34:40.129613  8316 net.cpp:91] Creating Layer relu2
+I0906 13:34:40.129624  8316 net.cpp:411] relu2 <- conv2
+I0906 13:34:40.129647  8316 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:34:40.129662  8316 net.cpp:121] Setting up relu2
+I0906 13:34:40.129670  8316 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:34:40.129674  8316 net.cpp:134] Memory required for data: 587608000
+I0906 13:34:40.129679  8316 layer_factory.hpp:74] Creating layer norm2
+I0906 13:34:40.129698  8316 net.cpp:91] Creating Layer norm2
+I0906 13:34:40.129703  8316 net.cpp:411] norm2 <- conv2
+I0906 13:34:40.129717  8316 net.cpp:369] norm2 -> norm2
+I0906 13:34:40.129730  8316 net.cpp:121] Setting up norm2
+I0906 13:34:40.129750  8316 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:34:40.129755  8316 net.cpp:134] Memory required for data: 662257600
+I0906 13:34:40.129760  8316 layer_factory.hpp:74] Creating layer pool2
+I0906 13:34:40.129783  8316 net.cpp:91] Creating Layer pool2
+I0906 13:34:40.129789  8316 net.cpp:411] pool2 <- norm2
+I0906 13:34:40.129802  8316 net.cpp:369] pool2 -> pool2
+I0906 13:34:40.129813  8316 net.cpp:121] Setting up pool2
+I0906 13:34:40.129832  8316 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:34:40.129837  8316 net.cpp:134] Memory required for data: 679563200
+I0906 13:34:40.129887  8316 layer_factory.hpp:74] Creating layer conv3
+I0906 13:34:40.129910  8316 net.cpp:91] Creating Layer conv3
+I0906 13:34:40.129916  8316 net.cpp:411] conv3 <- pool2
+I0906 13:34:40.129933  8316 net.cpp:369] conv3 -> conv3
+I0906 13:34:40.129948  8316 net.cpp:121] Setting up conv3
+I0906 13:34:40.246141  8316 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:34:40.246165  8316 net.cpp:134] Memory required for data: 705521600
+I0906 13:34:40.246211  8316 layer_factory.hpp:74] Creating layer relu3
+I0906 13:34:40.246247  8316 net.cpp:91] Creating Layer relu3
+I0906 13:34:40.246261  8316 net.cpp:411] relu3 <- conv3
+I0906 13:34:40.246287  8316 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:34:40.246304  8316 net.cpp:121] Setting up relu3
+I0906 13:34:40.246314  8316 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:34:40.246317  8316 net.cpp:134] Memory required for data: 731480000
+I0906 13:34:40.246322  8316 layer_factory.hpp:74] Creating layer conv4
+I0906 13:34:40.246351  8316 net.cpp:91] Creating Layer conv4
+I0906 13:34:40.246356  8316 net.cpp:411] conv4 <- conv3
+I0906 13:34:40.246372  8316 net.cpp:369] conv4 -> conv4
+I0906 13:34:40.246387  8316 net.cpp:121] Setting up conv4
+I0906 13:34:40.273671  8320 data_layer.cpp:120] Prefetch batch: 191 ms.
+I0906 13:34:40.273718  8320 data_layer.cpp:121]      Read time: 24.494 ms.
+I0906 13:34:40.273727  8320 data_layer.cpp:122] Transform time: 165.29 ms.
+I0906 13:34:40.332166  8316 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:34:40.332187  8316 net.cpp:134] Memory required for data: 757438400
+I0906 13:34:40.332214  8316 layer_factory.hpp:74] Creating layer relu4
+I0906 13:34:40.332247  8316 net.cpp:91] Creating Layer relu4
+I0906 13:34:40.332262  8316 net.cpp:411] relu4 <- conv4
+I0906 13:34:40.332288  8316 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:34:40.332304  8316 net.cpp:121] Setting up relu4
+I0906 13:34:40.332314  8316 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:34:40.332317  8316 net.cpp:134] Memory required for data: 783396800
+I0906 13:34:40.332321  8316 layer_factory.hpp:74] Creating layer conv5
+I0906 13:34:40.332350  8316 net.cpp:91] Creating Layer conv5
+I0906 13:34:40.332355  8316 net.cpp:411] conv5 <- conv4
+I0906 13:34:40.332371  8316 net.cpp:369] conv5 -> conv5
+I0906 13:34:40.332386  8316 net.cpp:121] Setting up conv5
+I0906 13:34:40.388872  8316 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:34:40.388891  8316 net.cpp:134] Memory required for data: 800702400
+I0906 13:34:40.388931  8316 layer_factory.hpp:74] Creating layer relu5
+I0906 13:34:40.388959  8316 net.cpp:91] Creating Layer relu5
+I0906 13:34:40.388972  8316 net.cpp:411] relu5 <- conv5
+I0906 13:34:40.388995  8316 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:34:40.389010  8316 net.cpp:121] Setting up relu5
+I0906 13:34:40.389019  8316 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:34:40.389024  8316 net.cpp:134] Memory required for data: 818008000
+I0906 13:34:40.389029  8316 layer_factory.hpp:74] Creating layer pool5
+I0906 13:34:40.389049  8316 net.cpp:91] Creating Layer pool5
+I0906 13:34:40.389053  8316 net.cpp:411] pool5 <- conv5
+I0906 13:34:40.389067  8316 net.cpp:369] pool5 -> pool5
+I0906 13:34:40.389081  8316 net.cpp:121] Setting up pool5
+I0906 13:34:40.389102  8316 net.cpp:128] Top shape: 100 256 6 6 (921600)
+I0906 13:34:40.389107  8316 net.cpp:134] Memory required for data: 821694400
+I0906 13:34:40.389112  8316 layer_factory.hpp:74] Creating layer fc6
+I0906 13:34:40.389147  8316 net.cpp:91] Creating Layer fc6
+I0906 13:34:40.389153  8316 net.cpp:411] fc6 <- pool5
+I0906 13:34:40.389169  8316 net.cpp:369] fc6 -> fc6
+I0906 13:34:40.389183  8316 net.cpp:121] Setting up fc6
+I0906 13:34:45.208031  8316 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:34:45.208055  8316 net.cpp:134] Memory required for data: 823332800
+I0906 13:34:45.208081  8316 layer_factory.hpp:74] Creating layer relu6
+I0906 13:34:45.208112  8316 net.cpp:91] Creating Layer relu6
+I0906 13:34:45.208128  8316 net.cpp:411] relu6 <- fc6
+I0906 13:34:45.208154  8316 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:34:45.208210  8316 net.cpp:121] Setting up relu6
+I0906 13:34:45.208220  8316 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:34:45.208223  8316 net.cpp:134] Memory required for data: 824971200
+I0906 13:34:45.208228  8316 layer_factory.hpp:74] Creating layer fc7
+I0906 13:34:45.208250  8316 net.cpp:91] Creating Layer fc7
+I0906 13:34:45.208256  8316 net.cpp:411] fc7 <- fc6
+I0906 13:34:45.208273  8316 net.cpp:369] fc7 -> fc7
+I0906 13:34:45.208288  8316 net.cpp:121] Setting up fc7
+I0906 13:34:47.352208  8316 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:34:47.352234  8316 net.cpp:134] Memory required for data: 826609600
+I0906 13:34:47.352262  8316 layer_factory.hpp:74] Creating layer relu7
+I0906 13:34:47.352295  8316 net.cpp:91] Creating Layer relu7
+I0906 13:34:47.352311  8316 net.cpp:411] relu7 <- fc7
+I0906 13:34:47.352339  8316 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:34:47.352355  8316 net.cpp:121] Setting up relu7
+I0906 13:34:47.352363  8316 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:34:47.352368  8316 net.cpp:134] Memory required for data: 828248000
+I0906 13:34:47.352373  8316 layer_factory.hpp:74] Creating layer fc8
+I0906 13:34:47.352396  8316 net.cpp:91] Creating Layer fc8
+I0906 13:34:47.352402  8316 net.cpp:411] fc8 <- fc7
+I0906 13:34:47.352418  8316 net.cpp:369] fc8 -> fc8
+I0906 13:34:47.352433  8316 net.cpp:121] Setting up fc8
+I0906 13:34:47.878074  8316 net.cpp:128] Top shape: 100 1000 (100000)
+I0906 13:34:47.878098  8316 net.cpp:134] Memory required for data: 828648000
+I0906 13:34:47.878126  8316 layer_factory.hpp:74] Creating layer loss
+I0906 13:34:47.878178  8316 net.cpp:91] Creating Layer loss
+I0906 13:34:47.878195  8316 net.cpp:411] loss <- fc8
+I0906 13:34:47.878217  8316 net.cpp:411] loss <- label
+I0906 13:34:47.878237  8316 net.cpp:369] loss -> loss
+I0906 13:34:47.878255  8316 net.cpp:121] Setting up loss
+I0906 13:34:47.878273  8316 layer_factory.hpp:74] Creating layer loss
+I0906 13:34:47.878825  8316 net.cpp:128] Top shape: (1)
+I0906 13:34:47.878831  8316 net.cpp:130]     with loss weight 1
+I0906 13:34:47.878847  8316 net.cpp:134] Memory required for data: 828648004
+I0906 13:34:47.878856  8316 net.cpp:193] loss needs backward computation.
+I0906 13:34:47.878865  8316 net.cpp:193] fc8 needs backward computation.
+I0906 13:34:47.878870  8316 net.cpp:193] relu7 needs backward computation.
+I0906 13:34:47.878876  8316 net.cpp:193] fc7 needs backward computation.
+I0906 13:34:47.878882  8316 net.cpp:193] relu6 needs backward computation.
+I0906 13:34:47.878888  8316 net.cpp:193] fc6 needs backward computation.
+I0906 13:34:47.878895  8316 net.cpp:193] pool5 needs backward computation.
+I0906 13:34:47.878901  8316 net.cpp:193] relu5 needs backward computation.
+I0906 13:34:47.878906  8316 net.cpp:193] conv5 needs backward computation.
+I0906 13:34:47.878911  8316 net.cpp:193] relu4 needs backward computation.
+I0906 13:34:47.878917  8316 net.cpp:193] conv4 needs backward computation.
+I0906 13:34:47.878923  8316 net.cpp:193] relu3 needs backward computation.
+I0906 13:34:47.878928  8316 net.cpp:193] conv3 needs backward computation.
+I0906 13:34:47.878936  8316 net.cpp:193] pool2 needs backward computation.
+I0906 13:34:47.878942  8316 net.cpp:193] norm2 needs backward computation.
+I0906 13:34:47.878948  8316 net.cpp:193] relu2 needs backward computation.
+I0906 13:34:47.878953  8316 net.cpp:193] conv2 needs backward computation.
+I0906 13:34:47.878959  8316 net.cpp:193] pool1 needs backward computation.
+I0906 13:34:47.878965  8316 net.cpp:193] norm1 needs backward computation.
+I0906 13:34:47.878972  8316 net.cpp:193] relu1 needs backward computation.
+I0906 13:34:47.878978  8316 net.cpp:193] conv1 needs backward computation.
+I0906 13:34:47.878984  8316 net.cpp:195] data does not need backward computation.
+I0906 13:34:47.878993  8316 net.cpp:236] This network produces output loss
+I0906 13:34:47.879026  8316 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:34:47.879042  8316 net.cpp:248] Network initialization done.
+I0906 13:34:47.879045  8316 net.cpp:249] Memory required for data: 828648004
+I0906 13:34:47.880003  8316 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
+I0906 13:34:47.880131  8316 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
+I0906 13:34:47.880362  8316 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TEST
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    mirror: false
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "fc8"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:34:47.880718  8316 net.cpp:68] Memory required for data: 0
+I0906 13:34:47.880764  8316 layer_factory.hpp:74] Creating layer data
+I0906 13:34:47.880786  8316 net.cpp:91] Creating Layer data
+I0906 13:34:47.880797  8316 net.cpp:369] data -> data
+I0906 13:34:47.880820  8316 net.cpp:369] data -> label
+I0906 13:34:47.880832  8316 net.cpp:121] Setting up data
+I0906 13:34:47.880839  8316 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:34:47.890487  8316 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
+I0906 13:34:47.890738  8316 data_layer.cpp:53] output data size: 50,3,227,227
+I0906 13:34:47.907624  8316 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:34:47.907733  8316 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:34:47.907762  8316 net.cpp:128] Top shape: 50 3 227 227 (7729350)
+I0906 13:34:47.907769  8316 net.cpp:128] Top shape: 50 (50)
+I0906 13:34:47.907773  8316 net.cpp:134] Memory required for data: 30917600
+I0906 13:34:47.907805  8316 layer_factory.hpp:74] Creating layer label_data_1_split
+I0906 13:34:47.907896  8316 net.cpp:91] Creating Layer label_data_1_split
+I0906 13:34:47.907917  8316 net.cpp:411] label_data_1_split <- label
+I0906 13:34:47.907979  8316 net.cpp:369] label_data_1_split -> label_data_1_split_0
+I0906 13:34:47.908016  8316 net.cpp:369] label_data_1_split -> label_data_1_split_1
+I0906 13:34:47.908028  8316 net.cpp:121] Setting up label_data_1_split
+I0906 13:34:47.908057  8316 net.cpp:128] Top shape: 50 (50)
+I0906 13:34:47.908064  8316 net.cpp:128] Top shape: 50 (50)
+I0906 13:34:47.908068  8316 net.cpp:134] Memory required for data: 30918000
+I0906 13:34:47.908073  8316 layer_factory.hpp:74] Creating layer conv1
+I0906 13:34:47.908112  8316 net.cpp:91] Creating Layer conv1
+I0906 13:34:47.908118  8316 net.cpp:411] conv1 <- data
+I0906 13:34:47.908133  8316 net.cpp:369] conv1 -> conv1
+I0906 13:34:47.908148  8316 net.cpp:121] Setting up conv1
+I0906 13:34:47.912806  8316 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:34:47.912811  8316 net.cpp:134] Memory required for data: 88998000
+I0906 13:34:47.912832  8316 layer_factory.hpp:74] Creating layer relu1
+I0906 13:34:47.912844  8316 net.cpp:91] Creating Layer relu1
+I0906 13:34:47.912850  8316 net.cpp:411] relu1 <- conv1
+I0906 13:34:47.912863  8316 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:34:47.912873  8316 net.cpp:121] Setting up relu1
+I0906 13:34:47.912880  8316 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:34:47.912883  8316 net.cpp:134] Memory required for data: 147078000
+I0906 13:34:47.912889  8316 layer_factory.hpp:74] Creating layer norm1
+I0906 13:34:47.912907  8316 net.cpp:91] Creating Layer norm1
+I0906 13:34:47.912912  8316 net.cpp:411] norm1 <- conv1
+I0906 13:34:47.912925  8316 net.cpp:369] norm1 -> norm1
+I0906 13:34:47.912936  8316 net.cpp:121] Setting up norm1
+I0906 13:34:47.912955  8316 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:34:47.912999  8316 net.cpp:134] Memory required for data: 205158000
+I0906 13:34:47.913004  8316 layer_factory.hpp:74] Creating layer pool1
+I0906 13:34:47.913022  8316 net.cpp:91] Creating Layer pool1
+I0906 13:34:47.913027  8316 net.cpp:411] pool1 <- norm1
+I0906 13:34:47.913040  8316 net.cpp:369] pool1 -> pool1
+I0906 13:34:47.913050  8316 net.cpp:121] Setting up pool1
+I0906 13:34:47.913069  8316 net.cpp:128] Top shape: 50 96 27 27 (3499200)
+I0906 13:34:47.913074  8316 net.cpp:134] Memory required for data: 219154800
+I0906 13:34:47.913079  8316 layer_factory.hpp:74] Creating layer conv2
+I0906 13:34:47.913091  8316 net.cpp:91] Creating Layer conv2
+I0906 13:34:47.913096  8316 net.cpp:411] conv2 <- pool1
+I0906 13:34:47.913111  8316 net.cpp:369] conv2 -> conv2
+I0906 13:34:47.913123  8316 net.cpp:121] Setting up conv2
+I0906 13:34:47.952414  8316 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:34:47.952428  8316 net.cpp:134] Memory required for data: 256479600
+I0906 13:34:47.952455  8316 layer_factory.hpp:74] Creating layer relu2
+I0906 13:34:47.952477  8316 net.cpp:91] Creating Layer relu2
+I0906 13:34:47.952487  8316 net.cpp:411] relu2 <- conv2
+I0906 13:34:47.952507  8316 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:34:47.952518  8316 net.cpp:121] Setting up relu2
+I0906 13:34:47.952527  8316 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:34:47.952532  8316 net.cpp:134] Memory required for data: 293804400
+I0906 13:34:47.952536  8316 layer_factory.hpp:74] Creating layer norm2
+I0906 13:34:47.952558  8316 net.cpp:91] Creating Layer norm2
+I0906 13:34:47.952564  8316 net.cpp:411] norm2 <- conv2
+I0906 13:34:47.952577  8316 net.cpp:369] norm2 -> norm2
+I0906 13:34:47.952591  8316 net.cpp:121] Setting up norm2
+I0906 13:34:47.952610  8316 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:34:47.952615  8316 net.cpp:134] Memory required for data: 331129200
+I0906 13:34:47.952620  8316 layer_factory.hpp:74] Creating layer pool2
+I0906 13:34:47.952635  8316 net.cpp:91] Creating Layer pool2
+I0906 13:34:47.952641  8316 net.cpp:411] pool2 <- norm2
+I0906 13:34:47.952653  8316 net.cpp:369] pool2 -> pool2
+I0906 13:34:47.952663  8316 net.cpp:121] Setting up pool2
+I0906 13:34:47.952682  8316 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:34:47.952685  8316 net.cpp:134] Memory required for data: 339782000
+I0906 13:34:47.952690  8316 layer_factory.hpp:74] Creating layer conv3
+I0906 13:34:47.952713  8316 net.cpp:91] Creating Layer conv3
+I0906 13:34:47.952718  8316 net.cpp:411] conv3 <- pool2
+I0906 13:34:47.952733  8316 net.cpp:369] conv3 -> conv3
+I0906 13:34:47.952744  8316 net.cpp:121] Setting up conv3
+I0906 13:34:48.002686  8321 data_layer.cpp:120] Prefetch batch: 94 ms.
+I0906 13:34:48.002718  8321 data_layer.cpp:121]      Read time: 12.003 ms.
+I0906 13:34:48.002725  8321 data_layer.cpp:122] Transform time: 81.802 ms.
+I0906 13:34:48.066742  8316 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:34:48.066764  8316 net.cpp:134] Memory required for data: 352761200
+I0906 13:34:48.066805  8316 layer_factory.hpp:74] Creating layer relu3
+I0906 13:34:48.066839  8316 net.cpp:91] Creating Layer relu3
+I0906 13:34:48.066854  8316 net.cpp:411] relu3 <- conv3
+I0906 13:34:48.066880  8316 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:34:48.066897  8316 net.cpp:121] Setting up relu3
+I0906 13:34:48.066906  8316 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:34:48.066910  8316 net.cpp:134] Memory required for data: 365740400
+I0906 13:34:48.066915  8316 layer_factory.hpp:74] Creating layer conv4
+I0906 13:34:48.066942  8316 net.cpp:91] Creating Layer conv4
+I0906 13:34:48.066947  8316 net.cpp:411] conv4 <- conv3
+I0906 13:34:48.066964  8316 net.cpp:369] conv4 -> conv4
+I0906 13:34:48.066979  8316 net.cpp:121] Setting up conv4
+I0906 13:34:48.151291  8316 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:34:48.151312  8316 net.cpp:134] Memory required for data: 378719600
+I0906 13:34:48.151340  8316 layer_factory.hpp:74] Creating layer relu4
+I0906 13:34:48.151372  8316 net.cpp:91] Creating Layer relu4
+I0906 13:34:48.151430  8316 net.cpp:411] relu4 <- conv4
+I0906 13:34:48.151458  8316 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:34:48.151473  8316 net.cpp:121] Setting up relu4
+I0906 13:34:48.151482  8316 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:34:48.151486  8316 net.cpp:134] Memory required for data: 391698800
+I0906 13:34:48.151491  8316 layer_factory.hpp:74] Creating layer conv5
+I0906 13:34:48.151517  8316 net.cpp:91] Creating Layer conv5
+I0906 13:34:48.151523  8316 net.cpp:411] conv5 <- conv4
+I0906 13:34:48.151540  8316 net.cpp:369] conv5 -> conv5
+I0906 13:34:48.151554  8316 net.cpp:121] Setting up conv5
+I0906 13:34:48.208228  8316 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:34:48.208250  8316 net.cpp:134] Memory required for data: 400351600
+I0906 13:34:48.208292  8316 layer_factory.hpp:74] Creating layer relu5
+I0906 13:34:48.208322  8316 net.cpp:91] Creating Layer relu5
+I0906 13:34:48.208336  8316 net.cpp:411] relu5 <- conv5
+I0906 13:34:48.208360  8316 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:34:48.208376  8316 net.cpp:121] Setting up relu5
+I0906 13:34:48.208385  8316 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:34:48.208389  8316 net.cpp:134] Memory required for data: 409004400
+I0906 13:34:48.208395  8316 layer_factory.hpp:74] Creating layer pool5
+I0906 13:34:48.208425  8316 net.cpp:91] Creating Layer pool5
+I0906 13:34:48.208431  8316 net.cpp:411] pool5 <- conv5
+I0906 13:34:48.208446  8316 net.cpp:369] pool5 -> pool5
+I0906 13:34:48.208459  8316 net.cpp:121] Setting up pool5
+I0906 13:34:48.208479  8316 net.cpp:128] Top shape: 50 256 6 6 (460800)
+I0906 13:34:48.208483  8316 net.cpp:134] Memory required for data: 410847600
+I0906 13:34:48.208488  8316 layer_factory.hpp:74] Creating layer fc6
+I0906 13:34:48.208510  8316 net.cpp:91] Creating Layer fc6
+I0906 13:34:48.208516  8316 net.cpp:411] fc6 <- pool5
+I0906 13:34:48.208530  8316 net.cpp:369] fc6 -> fc6
+I0906 13:34:48.208544  8316 net.cpp:121] Setting up fc6
+I0906 13:34:52.951850  8316 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:34:52.951876  8316 net.cpp:134] Memory required for data: 411666800
+I0906 13:34:52.951903  8316 layer_factory.hpp:74] Creating layer relu6
+I0906 13:34:52.951944  8316 net.cpp:91] Creating Layer relu6
+I0906 13:34:52.951961  8316 net.cpp:411] relu6 <- fc6
+I0906 13:34:52.951987  8316 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:34:52.952003  8316 net.cpp:121] Setting up relu6
+I0906 13:34:52.952010  8316 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:34:52.952014  8316 net.cpp:134] Memory required for data: 412486000
+I0906 13:34:52.952019  8316 layer_factory.hpp:74] Creating layer fc7
+I0906 13:34:52.952044  8316 net.cpp:91] Creating Layer fc7
+I0906 13:34:52.952049  8316 net.cpp:411] fc7 <- fc6
+I0906 13:34:52.952065  8316 net.cpp:369] fc7 -> fc7
+I0906 13:34:52.952080  8316 net.cpp:121] Setting up fc7
+I0906 13:34:55.059911  8316 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:34:55.059948  8316 net.cpp:134] Memory required for data: 413305200
+I0906 13:34:55.059976  8316 layer_factory.hpp:74] Creating layer relu7
+I0906 13:34:55.060010  8316 net.cpp:91] Creating Layer relu7
+I0906 13:34:55.060025  8316 net.cpp:411] relu7 <- fc7
+I0906 13:34:55.060053  8316 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:34:55.060070  8316 net.cpp:121] Setting up relu7
+I0906 13:34:55.060078  8316 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:34:55.060082  8316 net.cpp:134] Memory required for data: 414124400
+I0906 13:34:55.060087  8316 layer_factory.hpp:74] Creating layer fc8
+I0906 13:34:55.060109  8316 net.cpp:91] Creating Layer fc8
+I0906 13:34:55.060116  8316 net.cpp:411] fc8 <- fc7
+I0906 13:34:55.060132  8316 net.cpp:369] fc8 -> fc8
+I0906 13:34:55.060156  8316 net.cpp:121] Setting up fc8
+I0906 13:34:55.576926  8316 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:34:55.576946  8316 net.cpp:134] Memory required for data: 414324400
+I0906 13:34:55.576972  8316 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
+I0906 13:34:55.577006  8316 net.cpp:91] Creating Layer fc8_fc8_0_split
+I0906 13:34:55.577097  8316 net.cpp:411] fc8_fc8_0_split <- fc8
+I0906 13:34:55.577136  8316 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
+I0906 13:34:55.577162  8316 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
+I0906 13:34:55.577173  8316 net.cpp:121] Setting up fc8_fc8_0_split
+I0906 13:34:55.577191  8316 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:34:55.577198  8316 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:34:55.577201  8316 net.cpp:134] Memory required for data: 414724400
+I0906 13:34:55.577206  8316 layer_factory.hpp:74] Creating layer accuracy
+I0906 13:34:55.577237  8316 net.cpp:91] Creating Layer accuracy
+I0906 13:34:55.577244  8316 net.cpp:411] accuracy <- fc8_fc8_0_split_0
+I0906 13:34:55.577255  8316 net.cpp:411] accuracy <- label_data_1_split_0
+I0906 13:34:55.577266  8316 net.cpp:369] accuracy -> accuracy
+I0906 13:34:55.577277  8316 net.cpp:121] Setting up accuracy
+I0906 13:34:55.577293  8316 net.cpp:128] Top shape: (1)
+I0906 13:34:55.577297  8316 net.cpp:134] Memory required for data: 414724404
+I0906 13:34:55.577302  8316 layer_factory.hpp:74] Creating layer loss
+I0906 13:34:55.577314  8316 net.cpp:91] Creating Layer loss
+I0906 13:34:55.577321  8316 net.cpp:411] loss <- fc8_fc8_0_split_1
+I0906 13:34:55.577332  8316 net.cpp:411] loss <- label_data_1_split_1
+I0906 13:34:55.577342  8316 net.cpp:369] loss -> loss
+I0906 13:34:55.577353  8316 net.cpp:121] Setting up loss
+I0906 13:34:55.577363  8316 layer_factory.hpp:74] Creating layer loss
+I0906 13:34:55.577759  8316 net.cpp:128] Top shape: (1)
+I0906 13:34:55.577764  8316 net.cpp:130]     with loss weight 1
+I0906 13:34:55.577780  8316 net.cpp:134] Memory required for data: 414724408
+I0906 13:34:55.577786  8316 net.cpp:193] loss needs backward computation.
+I0906 13:34:55.577795  8316 net.cpp:195] accuracy does not need backward computation.
+I0906 13:34:55.577801  8316 net.cpp:193] fc8_fc8_0_split needs backward computation.
+I0906 13:34:55.577807  8316 net.cpp:193] fc8 needs backward computation.
+I0906 13:34:55.577813  8316 net.cpp:193] relu7 needs backward computation.
+I0906 13:34:55.577818  8316 net.cpp:193] fc7 needs backward computation.
+I0906 13:34:55.577824  8316 net.cpp:193] relu6 needs backward computation.
+I0906 13:34:55.577831  8316 net.cpp:193] fc6 needs backward computation.
+I0906 13:34:55.577836  8316 net.cpp:193] pool5 needs backward computation.
+I0906 13:34:55.577842  8316 net.cpp:193] relu5 needs backward computation.
+I0906 13:34:55.577847  8316 net.cpp:193] conv5 needs backward computation.
+I0906 13:34:55.577853  8316 net.cpp:193] relu4 needs backward computation.
+I0906 13:34:55.577859  8316 net.cpp:193] conv4 needs backward computation.
+I0906 13:34:55.577864  8316 net.cpp:193] relu3 needs backward computation.
+I0906 13:34:55.577870  8316 net.cpp:193] conv3 needs backward computation.
+I0906 13:34:55.577877  8316 net.cpp:193] pool2 needs backward computation.
+I0906 13:34:55.577883  8316 net.cpp:193] norm2 needs backward computation.
+I0906 13:34:55.577888  8316 net.cpp:193] relu2 needs backward computation.
+I0906 13:34:55.577893  8316 net.cpp:193] conv2 needs backward computation.
+I0906 13:34:55.577899  8316 net.cpp:193] pool1 needs backward computation.
+I0906 13:34:55.577905  8316 net.cpp:193] norm1 needs backward computation.
+I0906 13:34:55.577911  8316 net.cpp:193] relu1 needs backward computation.
+I0906 13:34:55.577916  8316 net.cpp:193] conv1 needs backward computation.
+I0906 13:34:55.577924  8316 net.cpp:195] label_data_1_split does not need backward computation.
+I0906 13:34:55.577931  8316 net.cpp:195] data does not need backward computation.
+I0906 13:34:55.577936  8316 net.cpp:236] This network produces output accuracy
+I0906 13:34:55.577942  8316 net.cpp:236] This network produces output loss
+I0906 13:34:55.577977  8316 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:34:55.577991  8316 net.cpp:248] Network initialization done.
+I0906 13:34:55.577996  8316 net.cpp:249] Memory required for data: 414724408
+I0906 13:34:55.578182  8316 solver.cpp:53] Solver scaffolding done.
+I0906 13:34:55.578306  8316 solver.cpp:270] Solving AlexNet
+I0906 13:34:55.578330  8316 solver.cpp:271] Learning Rate Policy: step
+I0906 13:34:55.580096  8316 solver.cpp:314] Iteration 0, Testing net (#0)
+I0906 13:34:55.580111  8316 net.cpp:696] Copying source layer data
+I0906 13:34:55.580116  8316 net.cpp:696] Copying source layer conv1
+I0906 13:34:55.583168  8316 net.cpp:696] Copying source layer relu1
+I0906 13:34:55.583199  8316 net.cpp:696] Copying source layer norm1
+I0906 13:34:55.583204  8316 net.cpp:696] Copying source layer pool1
+I0906 13:34:55.583209  8316 net.cpp:696] Copying source layer conv2
+I0906 13:34:55.583320  8316 net.cpp:696] Copying source layer relu2
+I0906 13:34:55.583326  8316 net.cpp:696] Copying source layer norm2
+I0906 13:34:55.583331  8316 net.cpp:696] Copying source layer pool2
+I0906 13:34:55.583335  8316 net.cpp:696] Copying source layer conv3
+I0906 13:34:55.583690  8316 net.cpp:696] Copying source layer relu3
+I0906 13:34:55.583698  8316 net.cpp:696] Copying source layer conv4
+I0906 13:34:55.583895  8316 net.cpp:696] Copying source layer relu4
+I0906 13:34:55.583902  8316 net.cpp:696] Copying source layer conv5
+I0906 13:34:55.584177  8316 net.cpp:696] Copying source layer relu5
+I0906 13:34:55.584185  8316 net.cpp:696] Copying source layer pool5
+I0906 13:34:55.584189  8316 net.cpp:696] Copying source layer fc6
+I0906 13:34:55.589432  8316 net.cpp:696] Copying source layer relu6
+I0906 13:34:55.589460  8316 net.cpp:696] Copying source layer fc7
+I0906 13:34:55.592273  8316 net.cpp:696] Copying source layer relu7
+I0906 13:34:55.592288  8316 net.cpp:696] Copying source layer fc8
+I0906 13:34:55.593138  8316 net.cpp:696] Copying source layer loss
+I0906 13:34:55.593260  8316 base_data_layer.cpp:89] Thread joined
+I0906 13:34:55.597589  8316 base_data_layer.cpp:93] Prefetch copied
+I0906 13:34:55.597887  8316 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:34:55.695569  8322 data_layer.cpp:120] Prefetch batch: 97 ms.
+I0906 13:34:55.695600  8322 data_layer.cpp:121]      Read time: 13.209 ms.
+I0906 13:34:55.695606  8322 data_layer.cpp:122] Transform time: 83.025 ms.
+I0906 13:34:58.623245  8316 solver.cpp:363]     Test net output #0: accuracy = 0
+I0906 13:34:58.623273  8316 solver.cpp:363]     Test net output #1: loss = 6.91124 (* 1 = 6.91124 loss)
+I0906 13:34:58.623322  8316 base_data_layer.cpp:89] Thread joined
+I0906 13:34:58.632244  8316 base_data_layer.cpp:93] Prefetch copied
+I0906 13:34:58.632606  8316 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:34:58.819707  8323 data_layer.cpp:120] Prefetch batch: 186 ms.
+I0906 13:34:58.819741  8323 data_layer.cpp:121]      Read time: 24.148 ms.
+I0906 13:34:58.819747  8323 data_layer.cpp:122] Transform time: 161.152 ms.
+I0906 13:35:05.407784  8316 solver.cpp:234] Iteration 0, loss = 0
+I0906 13:35:05.407842  8316 solver.cpp:249]     Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss)
+I0906 13:35:05.407891  8316 solver.cpp:506] Iteration 0, lr = 0.01
+I0906 13:35:05.525874  8316 base_data_layer.cpp:89] Thread joined
+I0906 13:35:05.533869  8316 base_data_layer.cpp:93] Prefetch copied
+I0906 13:35:05.534140  8316 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:35:05.722632  8328 data_layer.cpp:120] Prefetch batch: 188 ms.
+I0906 13:35:05.722664  8328 data_layer.cpp:121]      Read time: 24.184 ms.
+I0906 13:35:05.722672  8328 data_layer.cpp:122] Transform time: 162.257 ms.
+I0906 13:35:08.300590  8316 solver.cpp:234] Iteration 1, loss = 0
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515
new file mode 100644
index 00000000..6ec81c82
--- /dev/null
+++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515
@@ -0,0 +1,1160 @@
+Log file created at: 2015/09/06 13:58:05
+Running on machine: AMD-RESEARCH
+Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
+I0906 13:58:05.835170 16515 caffe.cpp:114] Use GPU with device ID 0
+I0906 13:58:05.875704 16515 device.cpp:230] Number of platforms found:1
+I0906 13:58:05.875743 16515 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
+I0906 13:58:05.875757 16515 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
+I0906 13:58:05.875763 16515 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
+I0906 13:58:05.875769 16515 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
+I0906 13:58:05.875774 16515 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
+I0906 13:58:05.875783 16515 device.cpp:286] Number of devices found:1
+I0906 13:58:05.875788 16515 device.cpp:288] 	DeviceID:	0x18ab2f0
+I0906 13:58:05.875809 16515 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
+I0906 13:58:05.875818 16515 device.cpp:393] 	Is it integrated GPU?:	0
+I0906 13:58:05.875823 16515 device.cpp:393] 	Max clock frequency MHz:	930
+I0906 13:58:05.875829 16515 device.cpp:393] 	Host-Device unified mem:	0
+I0906 13:58:05.875834 16515 device.cpp:393] 	ECC support:	0
+I0906 13:58:05.875839 16515 device.cpp:393] 	Endian little:	1
+I0906 13:58:05.875844 16515 device.cpp:393] 	Max compute units:	44
+I0906 13:58:05.875849 16515 device.cpp:393] 	Max work group size:	256
+I0906 13:58:05.875856 16515 device.cpp:393] 	Max work item dimensions:	3
+I0906 13:58:05.875862 16515 device.cpp:393] 	Max work item sizes:	0x100
+I0906 13:58:05.875869 16515 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
+I0906 13:58:05.875875 16515 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
+I0906 13:58:05.875881 16515 device.cpp:393] 	Max mem alloc size:	4244635648
+I0906 13:58:05.875886 16515 device.cpp:393] 	Global mem size:	16878927872
+I0906 13:58:05.875891 16515 device.cpp:393] 	Local mem size:	32768
+I0906 13:58:05.875902 16515 device.cpp:96] Picked device type : GPU 0
+I0906 13:58:08.267483 16515 device.cpp:152] Build Program
+I0906 13:58:08.267706 16515 caffe.cpp:122] Starting Optimization
+I0906 13:58:08.267797 16515 solver.cpp:40] Initializing solver from parameters: 
+test_iter: 1
+test_interval: 1000
+base_lr: 0.01
+display: 1
+max_iter: 10
+lr_policy: "step"
+gamma: 0.1
+momentum: 0.9
+weight_decay: 0.0005
+stepsize: 100000
+snapshot: 10000
+snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
+solver_mode: GPU
+net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
+I0906 13:58:08.267910 16515 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
+I0906 13:58:08.269042 16515 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
+I0906 13:58:08.269093 16515 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
+I0906 13:58:08.269273 16515 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TRAIN
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
+    batch_size: 100
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:58:08.269708 16515 net.cpp:68] Memory required for data: 0
+I0906 13:58:08.269917 16515 layer_factory.hpp:74] Creating layer data
+I0906 13:58:08.269971 16515 net.cpp:91] Creating Layer data
+I0906 13:58:08.269992 16515 net.cpp:369] data -> data
+I0906 13:58:08.270097 16515 net.cpp:369] data -> label
+I0906 13:58:08.270122 16515 net.cpp:121] Setting up data
+I0906 13:58:08.270134 16515 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:58:08.279337 16515 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
+I0906 13:58:08.279680 16515 data_layer.cpp:53] output data size: 100,3,227,227
+I0906 13:58:08.311036 16515 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:58:08.311240 16515 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:58:08.311303 16515 net.cpp:128] Top shape: 100 3 227 227 (15458700)
+I0906 13:58:08.311313 16515 net.cpp:128] Top shape: 100 (100)
+I0906 13:58:08.311318 16515 net.cpp:134] Memory required for data: 61835200
+I0906 13:58:08.311352 16515 layer_factory.hpp:74] Creating layer conv1
+I0906 13:58:08.311431 16515 net.cpp:91] Creating Layer conv1
+I0906 13:58:08.311453 16515 net.cpp:411] conv1 <- data
+I0906 13:58:08.311504 16515 net.cpp:369] conv1 -> conv1
+I0906 13:58:08.311569 16515 net.cpp:121] Setting up conv1
+I0906 13:58:08.316509 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:58:08.316515 16515 net.cpp:134] Memory required for data: 177995200
+I0906 13:58:08.316555 16515 layer_factory.hpp:74] Creating layer relu1
+I0906 13:58:08.316577 16515 net.cpp:91] Creating Layer relu1
+I0906 13:58:08.316583 16515 net.cpp:411] relu1 <- conv1
+I0906 13:58:08.316597 16515 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:58:08.316606 16515 net.cpp:121] Setting up relu1
+I0906 13:58:08.316615 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:58:08.316619 16515 net.cpp:134] Memory required for data: 294155200
+I0906 13:58:08.316623 16515 layer_factory.hpp:74] Creating layer norm1
+I0906 13:58:08.316653 16515 net.cpp:91] Creating Layer norm1
+I0906 13:58:08.316659 16515 net.cpp:411] norm1 <- conv1
+I0906 13:58:08.316673 16515 net.cpp:369] norm1 -> norm1
+I0906 13:58:08.316686 16515 net.cpp:121] Setting up norm1
+I0906 13:58:08.316710 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:58:08.316715 16515 net.cpp:134] Memory required for data: 410315200
+I0906 13:58:08.316720 16515 layer_factory.hpp:74] Creating layer pool1
+I0906 13:58:08.316745 16515 net.cpp:91] Creating Layer pool1
+I0906 13:58:08.316750 16515 net.cpp:411] pool1 <- norm1
+I0906 13:58:08.316763 16515 net.cpp:369] pool1 -> pool1
+I0906 13:58:08.316776 16515 net.cpp:121] Setting up pool1
+I0906 13:58:08.316805 16515 net.cpp:128] Top shape: 100 96 27 27 (6998400)
+I0906 13:58:08.316809 16515 net.cpp:134] Memory required for data: 438308800
+I0906 13:58:08.316814 16515 layer_factory.hpp:74] Creating layer conv2
+I0906 13:58:08.316829 16515 net.cpp:91] Creating Layer conv2
+I0906 13:58:08.316834 16515 net.cpp:411] conv2 <- pool1
+I0906 13:58:08.316850 16515 net.cpp:369] conv2 -> conv2
+I0906 13:58:08.316862 16515 net.cpp:121] Setting up conv2
+I0906 13:58:08.356899 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:58:08.356914 16515 net.cpp:134] Memory required for data: 512958400
+I0906 13:58:08.356945 16515 layer_factory.hpp:74] Creating layer relu2
+I0906 13:58:08.356967 16515 net.cpp:91] Creating Layer relu2
+I0906 13:58:08.356978 16515 net.cpp:411] relu2 <- conv2
+I0906 13:58:08.356998 16515 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:58:08.357012 16515 net.cpp:121] Setting up relu2
+I0906 13:58:08.357022 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:58:08.357025 16515 net.cpp:134] Memory required for data: 587608000
+I0906 13:58:08.357030 16515 layer_factory.hpp:74] Creating layer norm2
+I0906 13:58:08.357046 16515 net.cpp:91] Creating Layer norm2
+I0906 13:58:08.357053 16515 net.cpp:411] norm2 <- conv2
+I0906 13:58:08.357066 16515 net.cpp:369] norm2 -> norm2
+I0906 13:58:08.357079 16515 net.cpp:121] Setting up norm2
+I0906 13:58:08.357108 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:58:08.357113 16515 net.cpp:134] Memory required for data: 662257600
+I0906 13:58:08.357118 16515 layer_factory.hpp:74] Creating layer pool2
+I0906 13:58:08.357146 16515 net.cpp:91] Creating Layer pool2
+I0906 13:58:08.357152 16515 net.cpp:411] pool2 <- norm2
+I0906 13:58:08.357166 16515 net.cpp:369] pool2 -> pool2
+I0906 13:58:08.357177 16515 net.cpp:121] Setting up pool2
+I0906 13:58:08.357200 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:58:08.357204 16515 net.cpp:134] Memory required for data: 679563200
+I0906 13:58:08.357259 16515 layer_factory.hpp:74] Creating layer conv3
+I0906 13:58:08.357281 16515 net.cpp:91] Creating Layer conv3
+I0906 13:58:08.357287 16515 net.cpp:411] conv3 <- pool2
+I0906 13:58:08.357303 16515 net.cpp:369] conv3 -> conv3
+I0906 13:58:08.357318 16515 net.cpp:121] Setting up conv3
+I0906 13:58:08.475977 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:58:08.475999 16515 net.cpp:134] Memory required for data: 705521600
+I0906 13:58:08.476043 16515 layer_factory.hpp:74] Creating layer relu3
+I0906 13:58:08.476078 16515 net.cpp:91] Creating Layer relu3
+I0906 13:58:08.476093 16515 net.cpp:411] relu3 <- conv3
+I0906 13:58:08.476120 16515 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:58:08.476137 16515 net.cpp:121] Setting up relu3
+I0906 13:58:08.476147 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:58:08.476151 16515 net.cpp:134] Memory required for data: 731480000
+I0906 13:58:08.476156 16515 layer_factory.hpp:74] Creating layer conv4
+I0906 13:58:08.476184 16515 net.cpp:91] Creating Layer conv4
+I0906 13:58:08.476191 16515 net.cpp:411] conv4 <- conv3
+I0906 13:58:08.476207 16515 net.cpp:369] conv4 -> conv4
+I0906 13:58:08.476222 16515 net.cpp:121] Setting up conv4
+I0906 13:58:08.500998 16519 data_layer.cpp:120] Prefetch batch: 189 ms.
+I0906 13:58:08.501045 16519 data_layer.cpp:121]      Read time: 23.893 ms.
+I0906 13:58:08.501054 16519 data_layer.cpp:122] Transform time: 163.51 ms.
+I0906 13:58:08.563753 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:58:08.563774 16515 net.cpp:134] Memory required for data: 757438400
+I0906 13:58:08.563802 16515 layer_factory.hpp:74] Creating layer relu4
+I0906 13:58:08.563835 16515 net.cpp:91] Creating Layer relu4
+I0906 13:58:08.563849 16515 net.cpp:411] relu4 <- conv4
+I0906 13:58:08.563876 16515 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:58:08.563892 16515 net.cpp:121] Setting up relu4
+I0906 13:58:08.563902 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:58:08.563906 16515 net.cpp:134] Memory required for data: 783396800
+I0906 13:58:08.563911 16515 layer_factory.hpp:74] Creating layer conv5
+I0906 13:58:08.563946 16515 net.cpp:91] Creating Layer conv5
+I0906 13:58:08.563951 16515 net.cpp:411] conv5 <- conv4
+I0906 13:58:08.563968 16515 net.cpp:369] conv5 -> conv5
+I0906 13:58:08.563982 16515 net.cpp:121] Setting up conv5
+I0906 13:58:08.621495 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:58:08.621512 16515 net.cpp:134] Memory required for data: 800702400
+I0906 13:58:08.621553 16515 layer_factory.hpp:74] Creating layer relu5
+I0906 13:58:08.621584 16515 net.cpp:91] Creating Layer relu5
+I0906 13:58:08.621598 16515 net.cpp:411] relu5 <- conv5
+I0906 13:58:08.621623 16515 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:58:08.621639 16515 net.cpp:121] Setting up relu5
+I0906 13:58:08.621649 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:58:08.621652 16515 net.cpp:134] Memory required for data: 818008000
+I0906 13:58:08.621657 16515 layer_factory.hpp:74] Creating layer pool5
+I0906 13:58:08.621677 16515 net.cpp:91] Creating Layer pool5
+I0906 13:58:08.621683 16515 net.cpp:411] pool5 <- conv5
+I0906 13:58:08.621697 16515 net.cpp:369] pool5 -> pool5
+I0906 13:58:08.621711 16515 net.cpp:121] Setting up pool5
+I0906 13:58:08.621732 16515 net.cpp:128] Top shape: 100 256 6 6 (921600)
+I0906 13:58:08.621737 16515 net.cpp:134] Memory required for data: 821694400
+I0906 13:58:08.621742 16515 layer_factory.hpp:74] Creating layer fc6
+I0906 13:58:08.621778 16515 net.cpp:91] Creating Layer fc6
+I0906 13:58:08.621783 16515 net.cpp:411] fc6 <- pool5
+I0906 13:58:08.621798 16515 net.cpp:369] fc6 -> fc6
+I0906 13:58:08.621812 16515 net.cpp:121] Setting up fc6
+I0906 13:58:13.492439 16515 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:58:13.492465 16515 net.cpp:134] Memory required for data: 823332800
+I0906 13:58:13.492493 16515 layer_factory.hpp:74] Creating layer relu6
+I0906 13:58:13.492527 16515 net.cpp:91] Creating Layer relu6
+I0906 13:58:13.492542 16515 net.cpp:411] relu6 <- fc6
+I0906 13:58:13.492568 16515 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:58:13.492630 16515 net.cpp:121] Setting up relu6
+I0906 13:58:13.492640 16515 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:58:13.492643 16515 net.cpp:134] Memory required for data: 824971200
+I0906 13:58:13.492648 16515 layer_factory.hpp:74] Creating layer fc7
+I0906 13:58:13.492671 16515 net.cpp:91] Creating Layer fc7
+I0906 13:58:13.492677 16515 net.cpp:411] fc7 <- fc6
+I0906 13:58:13.492693 16515 net.cpp:369] fc7 -> fc7
+I0906 13:58:13.492708 16515 net.cpp:121] Setting up fc7
+I0906 13:58:15.661120 16515 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:58:15.661144 16515 net.cpp:134] Memory required for data: 826609600
+I0906 13:58:15.661171 16515 layer_factory.hpp:74] Creating layer relu7
+I0906 13:58:15.661205 16515 net.cpp:91] Creating Layer relu7
+I0906 13:58:15.661221 16515 net.cpp:411] relu7 <- fc7
+I0906 13:58:15.661247 16515 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:58:15.661263 16515 net.cpp:121] Setting up relu7
+I0906 13:58:15.661273 16515 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:58:15.661276 16515 net.cpp:134] Memory required for data: 828248000
+I0906 13:58:15.661281 16515 layer_factory.hpp:74] Creating layer fc8
+I0906 13:58:15.661304 16515 net.cpp:91] Creating Layer fc8
+I0906 13:58:15.661310 16515 net.cpp:411] fc8 <- fc7
+I0906 13:58:15.661325 16515 net.cpp:369] fc8 -> fc8
+I0906 13:58:15.661340 16515 net.cpp:121] Setting up fc8
+I0906 13:58:16.190832 16515 net.cpp:128] Top shape: 100 1000 (100000)
+I0906 13:58:16.190855 16515 net.cpp:134] Memory required for data: 828648000
+I0906 13:58:16.190881 16515 layer_factory.hpp:74] Creating layer loss
+I0906 13:58:16.190932 16515 net.cpp:91] Creating Layer loss
+I0906 13:58:16.190946 16515 net.cpp:411] loss <- fc8
+I0906 13:58:16.190969 16515 net.cpp:411] loss <- label
+I0906 13:58:16.190989 16515 net.cpp:369] loss -> loss
+I0906 13:58:16.191009 16515 net.cpp:121] Setting up loss
+I0906 13:58:16.191030 16515 layer_factory.hpp:74] Creating layer loss
+I0906 13:58:16.191588 16515 net.cpp:128] Top shape: (1)
+I0906 13:58:16.191593 16515 net.cpp:130]     with loss weight 1
+I0906 13:58:16.191611 16515 net.cpp:134] Memory required for data: 828648004
+I0906 13:58:16.191619 16515 net.cpp:193] loss needs backward computation.
+I0906 13:58:16.191627 16515 net.cpp:193] fc8 needs backward computation.
+I0906 13:58:16.191633 16515 net.cpp:193] relu7 needs backward computation.
+I0906 13:58:16.191639 16515 net.cpp:193] fc7 needs backward computation.
+I0906 13:58:16.191644 16515 net.cpp:193] relu6 needs backward computation.
+I0906 13:58:16.191650 16515 net.cpp:193] fc6 needs backward computation.
+I0906 13:58:16.191655 16515 net.cpp:193] pool5 needs backward computation.
+I0906 13:58:16.191661 16515 net.cpp:193] relu5 needs backward computation.
+I0906 13:58:16.191666 16515 net.cpp:193] conv5 needs backward computation.
+I0906 13:58:16.191673 16515 net.cpp:193] relu4 needs backward computation.
+I0906 13:58:16.191678 16515 net.cpp:193] conv4 needs backward computation.
+I0906 13:58:16.191684 16515 net.cpp:193] relu3 needs backward computation.
+I0906 13:58:16.191689 16515 net.cpp:193] conv3 needs backward computation.
+I0906 13:58:16.191696 16515 net.cpp:193] pool2 needs backward computation.
+I0906 13:58:16.191702 16515 net.cpp:193] norm2 needs backward computation.
+I0906 13:58:16.191709 16515 net.cpp:193] relu2 needs backward computation.
+I0906 13:58:16.191714 16515 net.cpp:193] conv2 needs backward computation.
+I0906 13:58:16.191720 16515 net.cpp:193] pool1 needs backward computation.
+I0906 13:58:16.191725 16515 net.cpp:193] norm1 needs backward computation.
+I0906 13:58:16.191731 16515 net.cpp:193] relu1 needs backward computation.
+I0906 13:58:16.191737 16515 net.cpp:193] conv1 needs backward computation.
+I0906 13:58:16.191745 16515 net.cpp:195] data does not need backward computation.
+I0906 13:58:16.191753 16515 net.cpp:236] This network produces output loss
+I0906 13:58:16.191787 16515 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:58:16.191803 16515 net.cpp:248] Network initialization done.
+I0906 13:58:16.191807 16515 net.cpp:249] Memory required for data: 828648004
+I0906 13:58:16.192769 16515 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
+I0906 13:58:16.192881 16515 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
+I0906 13:58:16.193114 16515 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TEST
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    mirror: false
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "fc8"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:58:16.193480 16515 net.cpp:68] Memory required for data: 0
+I0906 13:58:16.193527 16515 layer_factory.hpp:74] Creating layer data
+I0906 13:58:16.193549 16515 net.cpp:91] Creating Layer data
+I0906 13:58:16.193559 16515 net.cpp:369] data -> data
+I0906 13:58:16.193583 16515 net.cpp:369] data -> label
+I0906 13:58:16.193595 16515 net.cpp:121] Setting up data
+I0906 13:58:16.193603 16515 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:58:16.202100 16515 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
+I0906 13:58:16.202343 16515 data_layer.cpp:53] output data size: 50,3,227,227
+I0906 13:58:16.219017 16515 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:58:16.219137 16515 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:58:16.219171 16515 net.cpp:128] Top shape: 50 3 227 227 (7729350)
+I0906 13:58:16.219179 16515 net.cpp:128] Top shape: 50 (50)
+I0906 13:58:16.219183 16515 net.cpp:134] Memory required for data: 30917600
+I0906 13:58:16.219214 16515 layer_factory.hpp:74] Creating layer label_data_1_split
+I0906 13:58:16.219279 16515 net.cpp:91] Creating Layer label_data_1_split
+I0906 13:58:16.219293 16515 net.cpp:411] label_data_1_split <- label
+I0906 13:58:16.219367 16515 net.cpp:369] label_data_1_split -> label_data_1_split_0
+I0906 13:58:16.219409 16515 net.cpp:369] label_data_1_split -> label_data_1_split_1
+I0906 13:58:16.219420 16515 net.cpp:121] Setting up label_data_1_split
+I0906 13:58:16.219455 16515 net.cpp:128] Top shape: 50 (50)
+I0906 13:58:16.219462 16515 net.cpp:128] Top shape: 50 (50)
+I0906 13:58:16.219466 16515 net.cpp:134] Memory required for data: 30918000
+I0906 13:58:16.219471 16515 layer_factory.hpp:74] Creating layer conv1
+I0906 13:58:16.219508 16515 net.cpp:91] Creating Layer conv1
+I0906 13:58:16.219513 16515 net.cpp:411] conv1 <- data
+I0906 13:58:16.219530 16515 net.cpp:369] conv1 -> conv1
+I0906 13:58:16.219545 16515 net.cpp:121] Setting up conv1
+I0906 13:58:16.224315 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:58:16.224321 16515 net.cpp:134] Memory required for data: 88998000
+I0906 13:58:16.224341 16515 layer_factory.hpp:74] Creating layer relu1
+I0906 13:58:16.224354 16515 net.cpp:91] Creating Layer relu1
+I0906 13:58:16.224360 16515 net.cpp:411] relu1 <- conv1
+I0906 13:58:16.224372 16515 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:58:16.224382 16515 net.cpp:121] Setting up relu1
+I0906 13:58:16.224390 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:58:16.224393 16515 net.cpp:134] Memory required for data: 147078000
+I0906 13:58:16.224398 16515 layer_factory.hpp:74] Creating layer norm1
+I0906 13:58:16.224417 16515 net.cpp:91] Creating Layer norm1
+I0906 13:58:16.224423 16515 net.cpp:411] norm1 <- conv1
+I0906 13:58:16.224436 16515 net.cpp:369] norm1 -> norm1
+I0906 13:58:16.224447 16515 net.cpp:121] Setting up norm1
+I0906 13:58:16.224465 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:58:16.224508 16515 net.cpp:134] Memory required for data: 205158000
+I0906 13:58:16.224514 16515 layer_factory.hpp:74] Creating layer pool1
+I0906 13:58:16.224529 16515 net.cpp:91] Creating Layer pool1
+I0906 13:58:16.224534 16515 net.cpp:411] pool1 <- norm1
+I0906 13:58:16.224547 16515 net.cpp:369] pool1 -> pool1
+I0906 13:58:16.224558 16515 net.cpp:121] Setting up pool1
+I0906 13:58:16.224576 16515 net.cpp:128] Top shape: 50 96 27 27 (3499200)
+I0906 13:58:16.224581 16515 net.cpp:134] Memory required for data: 219154800
+I0906 13:58:16.224586 16515 layer_factory.hpp:74] Creating layer conv2
+I0906 13:58:16.224601 16515 net.cpp:91] Creating Layer conv2
+I0906 13:58:16.224606 16515 net.cpp:411] conv2 <- pool1
+I0906 13:58:16.224620 16515 net.cpp:369] conv2 -> conv2
+I0906 13:58:16.224632 16515 net.cpp:121] Setting up conv2
+I0906 13:58:16.264878 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:58:16.264889 16515 net.cpp:134] Memory required for data: 256479600
+I0906 13:58:16.264916 16515 layer_factory.hpp:74] Creating layer relu2
+I0906 13:58:16.264937 16515 net.cpp:91] Creating Layer relu2
+I0906 13:58:16.264946 16515 net.cpp:411] relu2 <- conv2
+I0906 13:58:16.264966 16515 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:58:16.264978 16515 net.cpp:121] Setting up relu2
+I0906 13:58:16.264987 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:58:16.264991 16515 net.cpp:134] Memory required for data: 293804400
+I0906 13:58:16.264997 16515 layer_factory.hpp:74] Creating layer norm2
+I0906 13:58:16.265015 16515 net.cpp:91] Creating Layer norm2
+I0906 13:58:16.265022 16515 net.cpp:411] norm2 <- conv2
+I0906 13:58:16.265035 16515 net.cpp:369] norm2 -> norm2
+I0906 13:58:16.265050 16515 net.cpp:121] Setting up norm2
+I0906 13:58:16.265072 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:58:16.265077 16515 net.cpp:134] Memory required for data: 331129200
+I0906 13:58:16.265082 16515 layer_factory.hpp:74] Creating layer pool2
+I0906 13:58:16.265097 16515 net.cpp:91] Creating Layer pool2
+I0906 13:58:16.265103 16515 net.cpp:411] pool2 <- norm2
+I0906 13:58:16.265116 16515 net.cpp:369] pool2 -> pool2
+I0906 13:58:16.265127 16515 net.cpp:121] Setting up pool2
+I0906 13:58:16.265149 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:58:16.265153 16515 net.cpp:134] Memory required for data: 339782000
+I0906 13:58:16.265158 16515 layer_factory.hpp:74] Creating layer conv3
+I0906 13:58:16.265179 16515 net.cpp:91] Creating Layer conv3
+I0906 13:58:16.265184 16515 net.cpp:411] conv3 <- pool2
+I0906 13:58:16.265200 16515 net.cpp:369] conv3 -> conv3
+I0906 13:58:16.265213 16515 net.cpp:121] Setting up conv3
+I0906 13:58:16.312928 16520 data_layer.cpp:120] Prefetch batch: 93 ms.
+I0906 13:58:16.312959 16520 data_layer.cpp:121]      Read time: 12.075 ms.
+I0906 13:58:16.312966 16520 data_layer.cpp:122] Transform time: 80.513 ms.
+I0906 13:58:16.381564 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:58:16.381587 16515 net.cpp:134] Memory required for data: 352761200
+I0906 13:58:16.381628 16515 layer_factory.hpp:74] Creating layer relu3
+I0906 13:58:16.381660 16515 net.cpp:91] Creating Layer relu3
+I0906 13:58:16.381675 16515 net.cpp:411] relu3 <- conv3
+I0906 13:58:16.381700 16515 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:58:16.381717 16515 net.cpp:121] Setting up relu3
+I0906 13:58:16.381726 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:58:16.381731 16515 net.cpp:134] Memory required for data: 365740400
+I0906 13:58:16.381734 16515 layer_factory.hpp:74] Creating layer conv4
+I0906 13:58:16.381762 16515 net.cpp:91] Creating Layer conv4
+I0906 13:58:16.381767 16515 net.cpp:411] conv4 <- conv3
+I0906 13:58:16.381783 16515 net.cpp:369] conv4 -> conv4
+I0906 13:58:16.381798 16515 net.cpp:121] Setting up conv4
+I0906 13:58:16.468471 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:58:16.468492 16515 net.cpp:134] Memory required for data: 378719600
+I0906 13:58:16.468518 16515 layer_factory.hpp:74] Creating layer relu4
+I0906 13:58:16.468550 16515 net.cpp:91] Creating Layer relu4
+I0906 13:58:16.468605 16515 net.cpp:411] relu4 <- conv4
+I0906 13:58:16.468633 16515 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:58:16.468649 16515 net.cpp:121] Setting up relu4
+I0906 13:58:16.468658 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:58:16.468662 16515 net.cpp:134] Memory required for data: 391698800
+I0906 13:58:16.468667 16515 layer_factory.hpp:74] Creating layer conv5
+I0906 13:58:16.468694 16515 net.cpp:91] Creating Layer conv5
+I0906 13:58:16.468700 16515 net.cpp:411] conv5 <- conv4
+I0906 13:58:16.468716 16515 net.cpp:369] conv5 -> conv5
+I0906 13:58:16.468731 16515 net.cpp:121] Setting up conv5
+I0906 13:58:16.526487 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:58:16.526507 16515 net.cpp:134] Memory required for data: 400351600
+I0906 13:58:16.526547 16515 layer_factory.hpp:74] Creating layer relu5
+I0906 13:58:16.526577 16515 net.cpp:91] Creating Layer relu5
+I0906 13:58:16.526590 16515 net.cpp:411] relu5 <- conv5
+I0906 13:58:16.526614 16515 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:58:16.526630 16515 net.cpp:121] Setting up relu5
+I0906 13:58:16.526639 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:58:16.526643 16515 net.cpp:134] Memory required for data: 409004400
+I0906 13:58:16.526648 16515 layer_factory.hpp:74] Creating layer pool5
+I0906 13:58:16.526676 16515 net.cpp:91] Creating Layer pool5
+I0906 13:58:16.526682 16515 net.cpp:411] pool5 <- conv5
+I0906 13:58:16.526696 16515 net.cpp:369] pool5 -> pool5
+I0906 13:58:16.526710 16515 net.cpp:121] Setting up pool5
+I0906 13:58:16.526731 16515 net.cpp:128] Top shape: 50 256 6 6 (460800)
+I0906 13:58:16.526734 16515 net.cpp:134] Memory required for data: 410847600
+I0906 13:58:16.526739 16515 layer_factory.hpp:74] Creating layer fc6
+I0906 13:58:16.526762 16515 net.cpp:91] Creating Layer fc6
+I0906 13:58:16.526767 16515 net.cpp:411] fc6 <- pool5
+I0906 13:58:16.526782 16515 net.cpp:369] fc6 -> fc6
+I0906 13:58:16.526794 16515 net.cpp:121] Setting up fc6
+I0906 13:58:21.365124 16515 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:58:21.365149 16515 net.cpp:134] Memory required for data: 411666800
+I0906 13:58:21.365176 16515 layer_factory.hpp:74] Creating layer relu6
+I0906 13:58:21.365211 16515 net.cpp:91] Creating Layer relu6
+I0906 13:58:21.365226 16515 net.cpp:411] relu6 <- fc6
+I0906 13:58:21.365250 16515 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:58:21.365267 16515 net.cpp:121] Setting up relu6
+I0906 13:58:21.365277 16515 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:58:21.365280 16515 net.cpp:134] Memory required for data: 412486000
+I0906 13:58:21.365285 16515 layer_factory.hpp:74] Creating layer fc7
+I0906 13:58:21.365309 16515 net.cpp:91] Creating Layer fc7
+I0906 13:58:21.365314 16515 net.cpp:411] fc7 <- fc6
+I0906 13:58:21.365330 16515 net.cpp:369] fc7 -> fc7
+I0906 13:58:21.365345 16515 net.cpp:121] Setting up fc7
+I0906 13:58:23.510701 16515 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:58:23.510725 16515 net.cpp:134] Memory required for data: 413305200
+I0906 13:58:23.510752 16515 layer_factory.hpp:74] Creating layer relu7
+I0906 13:58:23.510785 16515 net.cpp:91] Creating Layer relu7
+I0906 13:58:23.510800 16515 net.cpp:411] relu7 <- fc7
+I0906 13:58:23.510828 16515 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:58:23.510844 16515 net.cpp:121] Setting up relu7
+I0906 13:58:23.510854 16515 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:58:23.510857 16515 net.cpp:134] Memory required for data: 414124400
+I0906 13:58:23.510862 16515 layer_factory.hpp:74] Creating layer fc8
+I0906 13:58:23.510885 16515 net.cpp:91] Creating Layer fc8
+I0906 13:58:23.510890 16515 net.cpp:411] fc8 <- fc7
+I0906 13:58:23.510906 16515 net.cpp:369] fc8 -> fc8
+I0906 13:58:23.510932 16515 net.cpp:121] Setting up fc8
+I0906 13:58:24.034812 16515 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:58:24.034833 16515 net.cpp:134] Memory required for data: 414324400
+I0906 13:58:24.034860 16515 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
+I0906 13:58:24.034893 16515 net.cpp:91] Creating Layer fc8_fc8_0_split
+I0906 13:58:24.034958 16515 net.cpp:411] fc8_fc8_0_split <- fc8
+I0906 13:58:24.034988 16515 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
+I0906 13:58:24.035012 16515 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
+I0906 13:58:24.035023 16515 net.cpp:121] Setting up fc8_fc8_0_split
+I0906 13:58:24.035040 16515 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:58:24.035046 16515 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:58:24.035050 16515 net.cpp:134] Memory required for data: 414724400
+I0906 13:58:24.035055 16515 layer_factory.hpp:74] Creating layer accuracy
+I0906 13:58:24.035086 16515 net.cpp:91] Creating Layer accuracy
+I0906 13:58:24.035092 16515 net.cpp:411] accuracy <- fc8_fc8_0_split_0
+I0906 13:58:24.035104 16515 net.cpp:411] accuracy <- label_data_1_split_0
+I0906 13:58:24.035115 16515 net.cpp:369] accuracy -> accuracy
+I0906 13:58:24.035126 16515 net.cpp:121] Setting up accuracy
+I0906 13:58:24.035143 16515 net.cpp:128] Top shape: (1)
+I0906 13:58:24.035147 16515 net.cpp:134] Memory required for data: 414724404
+I0906 13:58:24.035152 16515 layer_factory.hpp:74] Creating layer loss
+I0906 13:58:24.035163 16515 net.cpp:91] Creating Layer loss
+I0906 13:58:24.035168 16515 net.cpp:411] loss <- fc8_fc8_0_split_1
+I0906 13:58:24.035179 16515 net.cpp:411] loss <- label_data_1_split_1
+I0906 13:58:24.035190 16515 net.cpp:369] loss -> loss
+I0906 13:58:24.035202 16515 net.cpp:121] Setting up loss
+I0906 13:58:24.035212 16515 layer_factory.hpp:74] Creating layer loss
+I0906 13:58:24.035562 16515 net.cpp:128] Top shape: (1)
+I0906 13:58:24.035567 16515 net.cpp:130]     with loss weight 1
+I0906 13:58:24.035583 16515 net.cpp:134] Memory required for data: 414724408
+I0906 13:58:24.035591 16515 net.cpp:193] loss needs backward computation.
+I0906 13:58:24.035598 16515 net.cpp:195] accuracy does not need backward computation.
+I0906 13:58:24.035605 16515 net.cpp:193] fc8_fc8_0_split needs backward computation.
+I0906 13:58:24.035610 16515 net.cpp:193] fc8 needs backward computation.
+I0906 13:58:24.035616 16515 net.cpp:193] relu7 needs backward computation.
+I0906 13:58:24.035621 16515 net.cpp:193] fc7 needs backward computation.
+I0906 13:58:24.035627 16515 net.cpp:193] relu6 needs backward computation.
+I0906 13:58:24.035634 16515 net.cpp:193] fc6 needs backward computation.
+I0906 13:58:24.035640 16515 net.cpp:193] pool5 needs backward computation.
+I0906 13:58:24.035645 16515 net.cpp:193] relu5 needs backward computation.
+I0906 13:58:24.035651 16515 net.cpp:193] conv5 needs backward computation.
+I0906 13:58:24.035656 16515 net.cpp:193] relu4 needs backward computation.
+I0906 13:58:24.035662 16515 net.cpp:193] conv4 needs backward computation.
+I0906 13:58:24.035668 16515 net.cpp:193] relu3 needs backward computation.
+I0906 13:58:24.035673 16515 net.cpp:193] conv3 needs backward computation.
+I0906 13:58:24.035679 16515 net.cpp:193] pool2 needs backward computation.
+I0906 13:58:24.035686 16515 net.cpp:193] norm2 needs backward computation.
+I0906 13:58:24.035692 16515 net.cpp:193] relu2 needs backward computation.
+I0906 13:58:24.035697 16515 net.cpp:193] conv2 needs backward computation.
+I0906 13:58:24.035703 16515 net.cpp:193] pool1 needs backward computation.
+I0906 13:58:24.035709 16515 net.cpp:193] norm1 needs backward computation.
+I0906 13:58:24.035715 16515 net.cpp:193] relu1 needs backward computation.
+I0906 13:58:24.035720 16515 net.cpp:193] conv1 needs backward computation.
+I0906 13:58:24.035727 16515 net.cpp:195] label_data_1_split does not need backward computation.
+I0906 13:58:24.035734 16515 net.cpp:195] data does not need backward computation.
+I0906 13:58:24.035739 16515 net.cpp:236] This network produces output accuracy
+I0906 13:58:24.035745 16515 net.cpp:236] This network produces output loss
+I0906 13:58:24.035781 16515 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:58:24.035796 16515 net.cpp:248] Network initialization done.
+I0906 13:58:24.035799 16515 net.cpp:249] Memory required for data: 414724408
+I0906 13:58:24.036000 16515 solver.cpp:53] Solver scaffolding done.
+I0906 13:58:24.036130 16515 solver.cpp
\ No newline at end of file
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537
new file mode 100644
index 00000000..d142f7c0
--- /dev/null
+++ b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537
@@ -0,0 +1,1208 @@
+Log file created at: 2015/09/06 13:58:55
+Running on machine: AMD-RESEARCH
+Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
+I0906 13:58:55.707435 16537 caffe.cpp:114] Use GPU with device ID 0
+I0906 13:58:55.745967 16537 device.cpp:230] Number of platforms found:1
+I0906 13:58:55.746011 16537 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
+I0906 13:58:55.746028 16537 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
+I0906 13:58:55.746036 16537 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
+I0906 13:58:55.746042 16537 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
+I0906 13:58:55.746048 16537 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
+I0906 13:58:55.746059 16537 device.cpp:286] Number of devices found:1
+I0906 13:58:55.746064 16537 device.cpp:288] 	DeviceID:	0x18262f0
+I0906 13:58:55.746088 16537 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
+I0906 13:58:55.746098 16537 device.cpp:393] 	Is it integrated GPU?:	0
+I0906 13:58:55.746105 16537 device.cpp:393] 	Max clock frequency MHz:	930
+I0906 13:58:55.746111 16537 device.cpp:393] 	Host-Device unified mem:	0
+I0906 13:58:55.746117 16537 device.cpp:393] 	ECC support:	0
+I0906 13:58:55.746124 16537 device.cpp:393] 	Endian little:	1
+I0906 13:58:55.746130 16537 device.cpp:393] 	Max compute units:	44
+I0906 13:58:55.746136 16537 device.cpp:393] 	Max work group size:	256
+I0906 13:58:55.746145 16537 device.cpp:393] 	Max work item dimensions:	3
+I0906 13:58:55.746151 16537 device.cpp:393] 	Max work item sizes:	0x100
+I0906 13:58:55.746160 16537 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
+I0906 13:58:55.746167 16537 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
+I0906 13:58:55.746173 16537 device.cpp:393] 	Max mem alloc size:	4244635648
+I0906 13:58:55.746179 16537 device.cpp:393] 	Global mem size:	16878927872
+I0906 13:58:55.746186 16537 device.cpp:393] 	Local mem size:	32768
+I0906 13:58:55.746198 16537 device.cpp:96] Picked device type : GPU 0
+I0906 13:58:58.131669 16537 device.cpp:152] Build Program
+I0906 13:58:58.131891 16537 caffe.cpp:122] Starting Optimization
+I0906 13:58:58.132027 16537 solver.cpp:40] Initializing solver from parameters: 
+test_iter: 1
+test_interval: 1000
+base_lr: 0.01
+display: 1
+max_iter: 10
+lr_policy: "step"
+gamma: 0.1
+momentum: 0.9
+weight_decay: 0.0005
+stepsize: 100000
+snapshot: 10000
+snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
+solver_mode: GPU
+net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
+I0906 13:58:58.132150 16537 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
+I0906 13:58:58.133236 16537 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
+I0906 13:58:58.133285 16537 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
+I0906 13:58:58.133460 16537 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TRAIN
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
+    batch_size: 100
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:58:58.133894 16537 net.cpp:68] Memory required for data: 0
+I0906 13:58:58.134050 16537 layer_factory.hpp:74] Creating layer data
+I0906 13:58:58.134104 16537 net.cpp:91] Creating Layer data
+I0906 13:58:58.134125 16537 net.cpp:369] data -> data
+I0906 13:58:58.134229 16537 net.cpp:369] data -> label
+I0906 13:58:58.134253 16537 net.cpp:121] Setting up data
+I0906 13:58:58.134266 16537 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:58:58.143668 16537 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
+I0906 13:58:58.144057 16537 data_layer.cpp:53] output data size: 100,3,227,227
+I0906 13:58:58.175259 16537 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:58:58.175475 16537 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:58:58.175534 16537 net.cpp:128] Top shape: 100 3 227 227 (15458700)
+I0906 13:58:58.175544 16537 net.cpp:128] Top shape: 100 (100)
+I0906 13:58:58.175547 16537 net.cpp:134] Memory required for data: 61835200
+I0906 13:58:58.175582 16537 layer_factory.hpp:74] Creating layer conv1
+I0906 13:58:58.175659 16537 net.cpp:91] Creating Layer conv1
+I0906 13:58:58.175683 16537 net.cpp:411] conv1 <- data
+I0906 13:58:58.175760 16537 net.cpp:369] conv1 -> conv1
+I0906 13:58:58.175793 16537 net.cpp:121] Setting up conv1
+I0906 13:58:58.180706 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:58:58.180712 16537 net.cpp:134] Memory required for data: 177995200
+I0906 13:58:58.180752 16537 layer_factory.hpp:74] Creating layer relu1
+I0906 13:58:58.180774 16537 net.cpp:91] Creating Layer relu1
+I0906 13:58:58.180780 16537 net.cpp:411] relu1 <- conv1
+I0906 13:58:58.180794 16537 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:58:58.180804 16537 net.cpp:121] Setting up relu1
+I0906 13:58:58.180811 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:58:58.180815 16537 net.cpp:134] Memory required for data: 294155200
+I0906 13:58:58.180821 16537 layer_factory.hpp:74] Creating layer norm1
+I0906 13:58:58.180848 16537 net.cpp:91] Creating Layer norm1
+I0906 13:58:58.180855 16537 net.cpp:411] norm1 <- conv1
+I0906 13:58:58.180867 16537 net.cpp:369] norm1 -> norm1
+I0906 13:58:58.180881 16537 net.cpp:121] Setting up norm1
+I0906 13:58:58.180905 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000)
+I0906 13:58:58.180909 16537 net.cpp:134] Memory required for data: 410315200
+I0906 13:58:58.180915 16537 layer_factory.hpp:74] Creating layer pool1
+I0906 13:58:58.180938 16537 net.cpp:91] Creating Layer pool1
+I0906 13:58:58.180944 16537 net.cpp:411] pool1 <- norm1
+I0906 13:58:58.180958 16537 net.cpp:369] pool1 -> pool1
+I0906 13:58:58.180970 16537 net.cpp:121] Setting up pool1
+I0906 13:58:58.180999 16537 net.cpp:128] Top shape: 100 96 27 27 (6998400)
+I0906 13:58:58.181004 16537 net.cpp:134] Memory required for data: 438308800
+I0906 13:58:58.181008 16537 layer_factory.hpp:74] Creating layer conv2
+I0906 13:58:58.181023 16537 net.cpp:91] Creating Layer conv2
+I0906 13:58:58.181030 16537 net.cpp:411] conv2 <- pool1
+I0906 13:58:58.181044 16537 net.cpp:369] conv2 -> conv2
+I0906 13:58:58.181056 16537 net.cpp:121] Setting up conv2
+I0906 13:58:58.221200 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:58:58.221215 16537 net.cpp:134] Memory required for data: 512958400
+I0906 13:58:58.221245 16537 layer_factory.hpp:74] Creating layer relu2
+I0906 13:58:58.221267 16537 net.cpp:91] Creating Layer relu2
+I0906 13:58:58.221277 16537 net.cpp:411] relu2 <- conv2
+I0906 13:58:58.221297 16537 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:58:58.221312 16537 net.cpp:121] Setting up relu2
+I0906 13:58:58.221320 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:58:58.221324 16537 net.cpp:134] Memory required for data: 587608000
+I0906 13:58:58.221329 16537 layer_factory.hpp:74] Creating layer norm2
+I0906 13:58:58.221346 16537 net.cpp:91] Creating Layer norm2
+I0906 13:58:58.221352 16537 net.cpp:411] norm2 <- conv2
+I0906 13:58:58.221366 16537 net.cpp:369] norm2 -> norm2
+I0906 13:58:58.221379 16537 net.cpp:121] Setting up norm2
+I0906 13:58:58.221397 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400)
+I0906 13:58:58.221402 16537 net.cpp:134] Memory required for data: 662257600
+I0906 13:58:58.221407 16537 layer_factory.hpp:74] Creating layer pool2
+I0906 13:58:58.221429 16537 net.cpp:91] Creating Layer pool2
+I0906 13:58:58.221436 16537 net.cpp:411] pool2 <- norm2
+I0906 13:58:58.221448 16537 net.cpp:369] pool2 -> pool2
+I0906 13:58:58.221460 16537 net.cpp:121] Setting up pool2
+I0906 13:58:58.221480 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:58:58.221484 16537 net.cpp:134] Memory required for data: 679563200
+I0906 13:58:58.221534 16537 layer_factory.hpp:74] Creating layer conv3
+I0906 13:58:58.221555 16537 net.cpp:91] Creating Layer conv3
+I0906 13:58:58.221561 16537 net.cpp:411] conv3 <- pool2
+I0906 13:58:58.221576 16537 net.cpp:369] conv3 -> conv3
+I0906 13:58:58.221592 16537 net.cpp:121] Setting up conv3
+I0906 13:58:58.338774 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:58:58.338798 16537 net.cpp:134] Memory required for data: 705521600
+I0906 13:58:58.338841 16537 layer_factory.hpp:74] Creating layer relu3
+I0906 13:58:58.338876 16537 net.cpp:91] Creating Layer relu3
+I0906 13:58:58.338891 16537 net.cpp:411] relu3 <- conv3
+I0906 13:58:58.338918 16537 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:58:58.338935 16537 net.cpp:121] Setting up relu3
+I0906 13:58:58.338944 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:58:58.338948 16537 net.cpp:134] Memory required for data: 731480000
+I0906 13:58:58.338953 16537 layer_factory.hpp:74] Creating layer conv4
+I0906 13:58:58.338979 16537 net.cpp:91] Creating Layer conv4
+I0906 13:58:58.338985 16537 net.cpp:411] conv4 <- conv3
+I0906 13:58:58.339002 16537 net.cpp:369] conv4 -> conv4
+I0906 13:58:58.339017 16537 net.cpp:121] Setting up conv4
+I0906 13:58:58.369153 16541 data_layer.cpp:120] Prefetch batch: 193 ms.
+I0906 13:58:58.369201 16541 data_layer.cpp:121]      Read time: 23.991 ms.
+I0906 13:58:58.369210 16541 data_layer.cpp:122] Transform time: 167.322 ms.
+I0906 13:58:58.426654 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:58:58.426676 16537 net.cpp:134] Memory required for data: 757438400
+I0906 13:58:58.426703 16537 layer_factory.hpp:74] Creating layer relu4
+I0906 13:58:58.426735 16537 net.cpp:91] Creating Layer relu4
+I0906 13:58:58.426749 16537 net.cpp:411] relu4 <- conv4
+I0906 13:58:58.426776 16537 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:58:58.426794 16537 net.cpp:121] Setting up relu4
+I0906 13:58:58.426802 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600)
+I0906 13:58:58.426806 16537 net.cpp:134] Memory required for data: 783396800
+I0906 13:58:58.426811 16537 layer_factory.hpp:74] Creating layer conv5
+I0906 13:58:58.426838 16537 net.cpp:91] Creating Layer conv5
+I0906 13:58:58.426843 16537 net.cpp:411] conv5 <- conv4
+I0906 13:58:58.426858 16537 net.cpp:369] conv5 -> conv5
+I0906 13:58:58.426873 16537 net.cpp:121] Setting up conv5
+I0906 13:58:58.484124 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:58:58.484143 16537 net.cpp:134] Memory required for data: 800702400
+I0906 13:58:58.484182 16537 layer_factory.hpp:74] Creating layer relu5
+I0906 13:58:58.484212 16537 net.cpp:91] Creating Layer relu5
+I0906 13:58:58.484225 16537 net.cpp:411] relu5 <- conv5
+I0906 13:58:58.484251 16537 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:58:58.484266 16537 net.cpp:121] Setting up relu5
+I0906 13:58:58.484274 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400)
+I0906 13:58:58.484278 16537 net.cpp:134] Memory required for data: 818008000
+I0906 13:58:58.484282 16537 layer_factory.hpp:74] Creating layer pool5
+I0906 13:58:58.484302 16537 net.cpp:91] Creating Layer pool5
+I0906 13:58:58.484308 16537 net.cpp:411] pool5 <- conv5
+I0906 13:58:58.484321 16537 net.cpp:369] pool5 -> pool5
+I0906 13:58:58.484335 16537 net.cpp:121] Setting up pool5
+I0906 13:58:58.484355 16537 net.cpp:128] Top shape: 100 256 6 6 (921600)
+I0906 13:58:58.484359 16537 net.cpp:134] Memory required for data: 821694400
+I0906 13:58:58.484364 16537 layer_factory.hpp:74] Creating layer fc6
+I0906 13:58:58.484400 16537 net.cpp:91] Creating Layer fc6
+I0906 13:58:58.484405 16537 net.cpp:411] fc6 <- pool5
+I0906 13:58:58.484421 16537 net.cpp:369] fc6 -> fc6
+I0906 13:58:58.484434 16537 net.cpp:121] Setting up fc6
+I0906 13:59:03.394265 16537 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:59:03.394289 16537 net.cpp:134] Memory required for data: 823332800
+I0906 13:59:03.394316 16537 layer_factory.hpp:74] Creating layer relu6
+I0906 13:59:03.394362 16537 net.cpp:91] Creating Layer relu6
+I0906 13:59:03.394378 16537 net.cpp:411] relu6 <- fc6
+I0906 13:59:03.394405 16537 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:59:03.394472 16537 net.cpp:121] Setting up relu6
+I0906 13:59:03.394482 16537 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:59:03.394486 16537 net.cpp:134] Memory required for data: 824971200
+I0906 13:59:03.394492 16537 layer_factory.hpp:74] Creating layer fc7
+I0906 13:59:03.394515 16537 net.cpp:91] Creating Layer fc7
+I0906 13:59:03.394521 16537 net.cpp:411] fc7 <- fc6
+I0906 13:59:03.394537 16537 net.cpp:369] fc7 -> fc7
+I0906 13:59:03.394558 16537 net.cpp:121] Setting up fc7
+I0906 13:59:05.554731 16537 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:59:05.554755 16537 net.cpp:134] Memory required for data: 826609600
+I0906 13:59:05.554782 16537 layer_factory.hpp:74] Creating layer relu7
+I0906 13:59:05.554815 16537 net.cpp:91] Creating Layer relu7
+I0906 13:59:05.554829 16537 net.cpp:411] relu7 <- fc7
+I0906 13:59:05.554855 16537 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:59:05.554870 16537 net.cpp:121] Setting up relu7
+I0906 13:59:05.554879 16537 net.cpp:128] Top shape: 100 4096 (409600)
+I0906 13:59:05.554883 16537 net.cpp:134] Memory required for data: 828248000
+I0906 13:59:05.554888 16537 layer_factory.hpp:74] Creating layer fc8
+I0906 13:59:05.554911 16537 net.cpp:91] Creating Layer fc8
+I0906 13:59:05.554916 16537 net.cpp:411] fc8 <- fc7
+I0906 13:59:05.554932 16537 net.cpp:369] fc8 -> fc8
+I0906 13:59:05.554946 16537 net.cpp:121] Setting up fc8
+I0906 13:59:06.080322 16537 net.cpp:128] Top shape: 100 1000 (100000)
+I0906 13:59:06.080343 16537 net.cpp:134] Memory required for data: 828648000
+I0906 13:59:06.080370 16537 layer_factory.hpp:74] Creating layer loss
+I0906 13:59:06.080420 16537 net.cpp:91] Creating Layer loss
+I0906 13:59:06.080435 16537 net.cpp:411] loss <- fc8
+I0906 13:59:06.080457 16537 net.cpp:411] loss <- label
+I0906 13:59:06.080476 16537 net.cpp:369] loss -> loss
+I0906 13:59:06.080497 16537 net.cpp:121] Setting up loss
+I0906 13:59:06.080515 16537 layer_factory.hpp:74] Creating layer loss
+I0906 13:59:06.081025 16537 net.cpp:128] Top shape: (1)
+I0906 13:59:06.081030 16537 net.cpp:130]     with loss weight 1
+I0906 13:59:06.081048 16537 net.cpp:134] Memory required for data: 828648004
+I0906 13:59:06.081055 16537 net.cpp:193] loss needs backward computation.
+I0906 13:59:06.081063 16537 net.cpp:193] fc8 needs backward computation.
+I0906 13:59:06.081069 16537 net.cpp:193] relu7 needs backward computation.
+I0906 13:59:06.081074 16537 net.cpp:193] fc7 needs backward computation.
+I0906 13:59:06.081080 16537 net.cpp:193] relu6 needs backward computation.
+I0906 13:59:06.081086 16537 net.cpp:193] fc6 needs backward computation.
+I0906 13:59:06.081091 16537 net.cpp:193] pool5 needs backward computation.
+I0906 13:59:06.081097 16537 net.cpp:193] relu5 needs backward computation.
+I0906 13:59:06.081102 16537 net.cpp:193] conv5 needs backward computation.
+I0906 13:59:06.081109 16537 net.cpp:193] relu4 needs backward computation.
+I0906 13:59:06.081114 16537 net.cpp:193] conv4 needs backward computation.
+I0906 13:59:06.081120 16537 net.cpp:193] relu3 needs backward computation.
+I0906 13:59:06.081125 16537 net.cpp:193] conv3 needs backward computation.
+I0906 13:59:06.081132 16537 net.cpp:193] pool2 needs backward computation.
+I0906 13:59:06.081138 16537 net.cpp:193] norm2 needs backward computation.
+I0906 13:59:06.081145 16537 net.cpp:193] relu2 needs backward computation.
+I0906 13:59:06.081149 16537 net.cpp:193] conv2 needs backward computation.
+I0906 13:59:06.081156 16537 net.cpp:193] pool1 needs backward computation.
+I0906 13:59:06.081161 16537 net.cpp:193] norm1 needs backward computation.
+I0906 13:59:06.081167 16537 net.cpp:193] relu1 needs backward computation.
+I0906 13:59:06.081173 16537 net.cpp:193] conv1 needs backward computation.
+I0906 13:59:06.081181 16537 net.cpp:195] data does not need backward computation.
+I0906 13:59:06.081187 16537 net.cpp:236] This network produces output loss
+I0906 13:59:06.081223 16537 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:59:06.081238 16537 net.cpp:248] Network initialization done.
+I0906 13:59:06.081241 16537 net.cpp:249] Memory required for data: 828648004
+I0906 13:59:06.082168 16537 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
+I0906 13:59:06.082299 16537 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
+I0906 13:59:06.082527 16537 net.cpp:43] Initializing net from parameters: 
+name: "AlexNet"
+state {
+  phase: TEST
+}
+layer {
+  name: "data"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    mirror: false
+    crop_size: 227
+    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
+  }
+  data_param {
+    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "conv1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "norm1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "conv2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "norm2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 0.1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "fc8"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "fc8"
+  bottom: "label"
+  top: "loss"
+}
+I0906 13:59:06.082866 16537 net.cpp:68] Memory required for data: 0
+I0906 13:59:06.082913 16537 layer_factory.hpp:74] Creating layer data
+I0906 13:59:06.082934 16537 net.cpp:91] Creating Layer data
+I0906 13:59:06.082944 16537 net.cpp:369] data -> data
+I0906 13:59:06.082967 16537 net.cpp:369] data -> label
+I0906 13:59:06.082981 16537 net.cpp:121] Setting up data
+I0906 13:59:06.082988 16537 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
+I0906 13:59:06.091397 16537 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
+I0906 13:59:06.091647 16537 data_layer.cpp:53] output data size: 50,3,227,227
+I0906 13:59:06.107939 16537 base_data_layer.cpp:43] Initializing prefetch
+I0906 13:59:06.108054 16537 base_data_layer.cpp:45] Prefetch initialized.
+I0906 13:59:06.108088 16537 net.cpp:128] Top shape: 50 3 227 227 (7729350)
+I0906 13:59:06.108098 16537 net.cpp:128] Top shape: 50 (50)
+I0906 13:59:06.108101 16537 net.cpp:134] Memory required for data: 30917600
+I0906 13:59:06.108135 16537 layer_factory.hpp:74] Creating layer label_data_1_split
+I0906 13:59:06.108201 16537 net.cpp:91] Creating Layer label_data_1_split
+I0906 13:59:06.108216 16537 net.cpp:411] label_data_1_split <- label
+I0906 13:59:06.108259 16537 net.cpp:369] label_data_1_split -> label_data_1_split_0
+I0906 13:59:06.108306 16537 net.cpp:369] label_data_1_split -> label_data_1_split_1
+I0906 13:59:06.108319 16537 net.cpp:121] Setting up label_data_1_split
+I0906 13:59:06.108353 16537 net.cpp:128] Top shape: 50 (50)
+I0906 13:59:06.108361 16537 net.cpp:128] Top shape: 50 (50)
+I0906 13:59:06.108364 16537 net.cpp:134] Memory required for data: 30918000
+I0906 13:59:06.108369 16537 layer_factory.hpp:74] Creating layer conv1
+I0906 13:59:06.108403 16537 net.cpp:91] Creating Layer conv1
+I0906 13:59:06.108409 16537 net.cpp:411] conv1 <- data
+I0906 13:59:06.108425 16537 net.cpp:369] conv1 -> conv1
+I0906 13:59:06.108440 16537 net.cpp:121] Setting up conv1
+I0906 13:59:06.113059 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:59:06.113065 16537 net.cpp:134] Memory required for data: 88998000
+I0906 13:59:06.113085 16537 layer_factory.hpp:74] Creating layer relu1
+I0906 13:59:06.113097 16537 net.cpp:91] Creating Layer relu1
+I0906 13:59:06.113103 16537 net.cpp:411] relu1 <- conv1
+I0906 13:59:06.113116 16537 net.cpp:358] relu1 -> conv1 (in-place)
+I0906 13:59:06.113126 16537 net.cpp:121] Setting up relu1
+I0906 13:59:06.113134 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:59:06.113138 16537 net.cpp:134] Memory required for data: 147078000
+I0906 13:59:06.113143 16537 layer_factory.hpp:74] Creating layer norm1
+I0906 13:59:06.113163 16537 net.cpp:91] Creating Layer norm1
+I0906 13:59:06.113169 16537 net.cpp:411] norm1 <- conv1
+I0906 13:59:06.113183 16537 net.cpp:369] norm1 -> norm1
+I0906 13:59:06.113193 16537 net.cpp:121] Setting up norm1
+I0906 13:59:06.113212 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000)
+I0906 13:59:06.113255 16537 net.cpp:134] Memory required for data: 205158000
+I0906 13:59:06.113260 16537 layer_factory.hpp:74] Creating layer pool1
+I0906 13:59:06.113277 16537 net.cpp:91] Creating Layer pool1
+I0906 13:59:06.113282 16537 net.cpp:411] pool1 <- norm1
+I0906 13:59:06.113296 16537 net.cpp:369] pool1 -> pool1
+I0906 13:59:06.113306 16537 net.cpp:121] Setting up pool1
+I0906 13:59:06.113325 16537 net.cpp:128] Top shape: 50 96 27 27 (3499200)
+I0906 13:59:06.113329 16537 net.cpp:134] Memory required for data: 219154800
+I0906 13:59:06.113334 16537 layer_factory.hpp:74] Creating layer conv2
+I0906 13:59:06.113348 16537 net.cpp:91] Creating Layer conv2
+I0906 13:59:06.113354 16537 net.cpp:411] conv2 <- pool1
+I0906 13:59:06.113369 16537 net.cpp:369] conv2 -> conv2
+I0906 13:59:06.113381 16537 net.cpp:121] Setting up conv2
+I0906 13:59:06.154265 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:59:06.154281 16537 net.cpp:134] Memory required for data: 256479600
+I0906 13:59:06.154316 16537 layer_factory.hpp:74] Creating layer relu2
+I0906 13:59:06.154345 16537 net.cpp:91] Creating Layer relu2
+I0906 13:59:06.154355 16537 net.cpp:411] relu2 <- conv2
+I0906 13:59:06.154374 16537 net.cpp:358] relu2 -> conv2 (in-place)
+I0906 13:59:06.154387 16537 net.cpp:121] Setting up relu2
+I0906 13:59:06.154397 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:59:06.154400 16537 net.cpp:134] Memory required for data: 293804400
+I0906 13:59:06.154405 16537 layer_factory.hpp:74] Creating layer norm2
+I0906 13:59:06.154427 16537 net.cpp:91] Creating Layer norm2
+I0906 13:59:06.154433 16537 net.cpp:411] norm2 <- conv2
+I0906 13:59:06.154446 16537 net.cpp:369] norm2 -> norm2
+I0906 13:59:06.154463 16537 net.cpp:121] Setting up norm2
+I0906 13:59:06.154484 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200)
+I0906 13:59:06.154503 16537 net.cpp:134] Memory required for data: 331129200
+I0906 13:59:06.154508 16537 layer_factory.hpp:74] Creating layer pool2
+I0906 13:59:06.154525 16537 net.cpp:91] Creating Layer pool2
+I0906 13:59:06.154531 16537 net.cpp:411] pool2 <- norm2
+I0906 13:59:06.154544 16537 net.cpp:369] pool2 -> pool2
+I0906 13:59:06.154556 16537 net.cpp:121] Setting up pool2
+I0906 13:59:06.154573 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:59:06.154578 16537 net.cpp:134] Memory required for data: 339782000
+I0906 13:59:06.154583 16537 layer_factory.hpp:74] Creating layer conv3
+I0906 13:59:06.154604 16537 net.cpp:91] Creating Layer conv3
+I0906 13:59:06.154610 16537 net.cpp:411] conv3 <- pool2
+I0906 13:59:06.154625 16537 net.cpp:369] conv3 -> conv3
+I0906 13:59:06.154638 16537 net.cpp:121] Setting up conv3
+I0906 13:59:06.204232 16545 data_layer.cpp:120] Prefetch batch: 96 ms.
+I0906 13:59:06.204263 16545 data_layer.cpp:121]      Read time: 12.163 ms.
+I0906 13:59:06.204272 16545 data_layer.cpp:122] Transform time: 82.876 ms.
+I0906 13:59:06.270438 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:59:06.270459 16537 net.cpp:134] Memory required for data: 352761200
+I0906 13:59:06.270499 16537 layer_factory.hpp:74] Creating layer relu3
+I0906 13:59:06.270532 16537 net.cpp:91] Creating Layer relu3
+I0906 13:59:06.270546 16537 net.cpp:411] relu3 <- conv3
+I0906 13:59:06.270571 16537 net.cpp:358] relu3 -> conv3 (in-place)
+I0906 13:59:06.270587 16537 net.cpp:121] Setting up relu3
+I0906 13:59:06.270596 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:59:06.270601 16537 net.cpp:134] Memory required for data: 365740400
+I0906 13:59:06.270606 16537 layer_factory.hpp:74] Creating layer conv4
+I0906 13:59:06.270630 16537 net.cpp:91] Creating Layer conv4
+I0906 13:59:06.270637 16537 net.cpp:411] conv4 <- conv3
+I0906 13:59:06.270651 16537 net.cpp:369] conv4 -> conv4
+I0906 13:59:06.270666 16537 net.cpp:121] Setting up conv4
+I0906 13:59:06.357051 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:59:06.357074 16537 net.cpp:134] Memory required for data: 378719600
+I0906 13:59:06.357100 16537 layer_factory.hpp:74] Creating layer relu4
+I0906 13:59:06.357132 16537 net.cpp:91] Creating Layer relu4
+I0906 13:59:06.357184 16537 net.cpp:411] relu4 <- conv4
+I0906 13:59:06.357210 16537 net.cpp:358] relu4 -> conv4 (in-place)
+I0906 13:59:06.357226 16537 net.cpp:121] Setting up relu4
+I0906 13:59:06.357235 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800)
+I0906 13:59:06.357239 16537 net.cpp:134] Memory required for data: 391698800
+I0906 13:59:06.357244 16537 layer_factory.hpp:74] Creating layer conv5
+I0906 13:59:06.357270 16537 net.cpp:91] Creating Layer conv5
+I0906 13:59:06.357276 16537 net.cpp:411] conv5 <- conv4
+I0906 13:59:06.357292 16537 net.cpp:369] conv5 -> conv5
+I0906 13:59:06.357308 16537 net.cpp:121] Setting up conv5
+I0906 13:59:06.414666 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:59:06.414685 16537 net.cpp:134] Memory required for data: 400351600
+I0906 13:59:06.414727 16537 layer_factory.hpp:74] Creating layer relu5
+I0906 13:59:06.414757 16537 net.cpp:91] Creating Layer relu5
+I0906 13:59:06.414770 16537 net.cpp:411] relu5 <- conv5
+I0906 13:59:06.414794 16537 net.cpp:358] relu5 -> conv5 (in-place)
+I0906 13:59:06.414808 16537 net.cpp:121] Setting up relu5
+I0906 13:59:06.414818 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200)
+I0906 13:59:06.414820 16537 net.cpp:134] Memory required for data: 409004400
+I0906 13:59:06.414825 16537 layer_factory.hpp:74] Creating layer pool5
+I0906 13:59:06.414855 16537 net.cpp:91] Creating Layer pool5
+I0906 13:59:06.414860 16537 net.cpp:411] pool5 <- conv5
+I0906 13:59:06.414875 16537 net.cpp:369] pool5 -> pool5
+I0906 13:59:06.414888 16537 net.cpp:121] Setting up pool5
+I0906 13:59:06.414908 16537 net.cpp:128] Top shape: 50 256 6 6 (460800)
+I0906 13:59:06.414912 16537 net.cpp:134] Memory required for data: 410847600
+I0906 13:59:06.414917 16537 layer_factory.hpp:74] Creating layer fc6
+I0906 13:59:06.414938 16537 net.cpp:91] Creating Layer fc6
+I0906 13:59:06.414944 16537 net.cpp:411] fc6 <- pool5
+I0906 13:59:06.414959 16537 net.cpp:369] fc6 -> fc6
+I0906 13:59:06.414971 16537 net.cpp:121] Setting up fc6
+I0906 13:59:11.292778 16537 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:59:11.292801 16537 net.cpp:134] Memory required for data: 411666800
+I0906 13:59:11.292829 16537 layer_factory.hpp:74] Creating layer relu6
+I0906 13:59:11.292860 16537 net.cpp:91] Creating Layer relu6
+I0906 13:59:11.292876 16537 net.cpp:411] relu6 <- fc6
+I0906 13:59:11.292902 16537 net.cpp:358] relu6 -> fc6 (in-place)
+I0906 13:59:11.292918 16537 net.cpp:121] Setting up relu6
+I0906 13:59:11.292927 16537 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:59:11.292932 16537 net.cpp:134] Memory required for data: 412486000
+I0906 13:59:11.292937 16537 layer_factory.hpp:74] Creating layer fc7
+I0906 13:59:11.292958 16537 net.cpp:91] Creating Layer fc7
+I0906 13:59:11.292964 16537 net.cpp:411] fc7 <- fc6
+I0906 13:59:11.292980 16537 net.cpp:369] fc7 -> fc7
+I0906 13:59:11.292995 16537 net.cpp:121] Setting up fc7
+I0906 13:59:13.449043 16537 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:59:13.449066 16537 net.cpp:134] Memory required for data: 413305200
+I0906 13:59:13.449095 16537 layer_factory.hpp:74] Creating layer relu7
+I0906 13:59:13.449126 16537 net.cpp:91] Creating Layer relu7
+I0906 13:59:13.449141 16537 net.cpp:411] relu7 <- fc7
+I0906 13:59:13.449167 16537 net.cpp:358] relu7 -> fc7 (in-place)
+I0906 13:59:13.449182 16537 net.cpp:121] Setting up relu7
+I0906 13:59:13.449192 16537 net.cpp:128] Top shape: 50 4096 (204800)
+I0906 13:59:13.449195 16537 net.cpp:134] Memory required for data: 414124400
+I0906 13:59:13.449200 16537 layer_factory.hpp:74] Creating layer fc8
+I0906 13:59:13.449223 16537 net.cpp:91] Creating Layer fc8
+I0906 13:59:13.449229 16537 net.cpp:411] fc8 <- fc7
+I0906 13:59:13.449244 16537 net.cpp:369] fc8 -> fc8
+I0906 13:59:13.449270 16537 net.cpp:121] Setting up fc8
+I0906 13:59:13.974771 16537 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:59:13.974793 16537 net.cpp:134] Memory required for data: 414324400
+I0906 13:59:13.974820 16537 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
+I0906 13:59:13.974851 16537 net.cpp:91] Creating Layer fc8_fc8_0_split
+I0906 13:59:13.974911 16537 net.cpp:411] fc8_fc8_0_split <- fc8
+I0906 13:59:13.974939 16537 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
+I0906 13:59:13.974962 16537 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
+I0906 13:59:13.974974 16537 net.cpp:121] Setting up fc8_fc8_0_split
+I0906 13:59:13.974992 16537 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:59:13.974998 16537 net.cpp:128] Top shape: 50 1000 (50000)
+I0906 13:59:13.975003 16537 net.cpp:134] Memory required for data: 414724400
+I0906 13:59:13.975006 16537 layer_factory.hpp:74] Creating layer accuracy
+I0906 13:59:13.975038 16537 net.cpp:91] Creating Layer accuracy
+I0906 13:59:13.975044 16537 net.cpp:411] accuracy <- fc8_fc8_0_split_0
+I0906 13:59:13.975054 16537 net.cpp:411] accuracy <- label_data_1_split_0
+I0906 13:59:13.975065 16537 net.cpp:369] accuracy -> accuracy
+I0906 13:59:13.975076 16537 net.cpp:121] Setting up accuracy
+I0906 13:59:13.975092 16537 net.cpp:128] Top shape: (1)
+I0906 13:59:13.975096 16537 net.cpp:134] Memory required for data: 414724404
+I0906 13:59:13.975101 16537 layer_factory.hpp:74] Creating layer loss
+I0906 13:59:13.975112 16537 net.cpp:91] Creating Layer loss
+I0906 13:59:13.975117 16537 net.cpp:411] loss <- fc8_fc8_0_split_1
+I0906 13:59:13.975128 16537 net.cpp:411] loss <- label_data_1_split_1
+I0906 13:59:13.975139 16537 net.cpp:369] loss -> loss
+I0906 13:59:13.975150 16537 net.cpp:121] Setting up loss
+I0906 13:59:13.975160 16537 layer_factory.hpp:74] Creating layer loss
+I0906 13:59:13.975487 16537 net.cpp:128] Top shape: (1)
+I0906 13:59:13.975492 16537 net.cpp:130]     with loss weight 1
+I0906 13:59:13.975507 16537 net.cpp:134] Memory required for data: 414724408
+I0906 13:59:13.975513 16537 net.cpp:193] loss needs backward computation.
+I0906 13:59:13.975520 16537 net.cpp:195] accuracy does not need backward computation.
+I0906 13:59:13.975528 16537 net.cpp:193] fc8_fc8_0_split needs backward computation.
+I0906 13:59:13.975533 16537 net.cpp:193] fc8 needs backward computation.
+I0906 13:59:13.975538 16537 net.cpp:193] relu7 needs backward computation.
+I0906 13:59:13.975544 16537 net.cpp:193] fc7 needs backward computation.
+I0906 13:59:13.975549 16537 net.cpp:193] relu6 needs backward computation.
+I0906 13:59:13.975555 16537 net.cpp:193] fc6 needs backward computation.
+I0906 13:59:13.975560 16537 net.cpp:193] pool5 needs backward computation.
+I0906 13:59:13.975566 16537 net.cpp:193] relu5 needs backward computation.
+I0906 13:59:13.975572 16537 net.cpp:193] conv5 needs backward computation.
+I0906 13:59:13.975577 16537 net.cpp:193] relu4 needs backward computation.
+I0906 13:59:13.975582 16537 net.cpp:193] conv4 needs backward computation.
+I0906 13:59:13.975589 16537 net.cpp:193] relu3 needs backward computation.
+I0906 13:59:13.975594 16537 net.cpp:193] conv3 needs backward computation.
+I0906 13:59:13.975600 16537 net.cpp:193] pool2 needs backward computation.
+I0906 13:59:13.975605 16537 net.cpp:193] norm2 needs backward computation.
+I0906 13:59:13.975611 16537 net.cpp:193] relu2 needs backward computation.
+I0906 13:59:13.975616 16537 net.cpp:193] conv2 needs backward computation.
+I0906 13:59:13.975622 16537 net.cpp:193] pool1 needs backward computation.
+I0906 13:59:13.975628 16537 net.cpp:193] norm1 needs backward computation.
+I0906 13:59:13.975635 16537 net.cpp:193] relu1 needs backward computation.
+I0906 13:59:13.975639 16537 net.cpp:193] conv1 needs backward computation.
+I0906 13:59:13.975646 16537 net.cpp:195] label_data_1_split does not need backward computation.
+I0906 13:59:13.975654 16537 net.cpp:195] data does not need backward computation.
+I0906 13:59:13.975658 16537 net.cpp:236] This network produces output accuracy
+I0906 13:59:13.975664 16537 net.cpp:236] This network produces output loss
+I0906 13:59:13.975702 16537 net.cpp:483] Collecting Learning Rate and Weight Decay.
+I0906 13:59:13.975714 16537 net.cpp:248] Network initialization done.
+I0906 13:59:13.975718 16537 net.cpp:249] Memory required for data: 414724408
+I0906 13:59:13.975903 16537 solver.cpp:53] Solver scaffolding done.
+I0906 13:59:13.976030 16537 solver.cpp:270] Solving AlexNet
+I0906 13:59:13.976050 16537 solver.cpp:271] Learning Rate Policy: step
+I0906 13:59:13.977635 16537 solver.cpp:314] Iteration 0, Testing net (#0)
+I0906 13:59:13.977653 16537 net.cpp:696] Copying source layer data
+I0906 13:59:13.977660 16537 net.cpp:696] Copying source layer conv1
+I0906 13:59:13.980556 16537 net.cpp:696] Copying source layer relu1
+I0906 13:59:13.980595 16537 net.cpp:696] Copying source layer norm1
+I0906 13:59:13.980607 16537 net.cpp:696] Copying source layer pool1
+I0906 13:59:13.980617 16537 net.cpp:696] Copying source layer conv2
+I0906 13:59:13.980785 16537 net.cpp:696] Copying source layer relu2
+I0906 13:59:13.980798 16537 net.cpp:696] Copying source layer norm2
+I0906 13:59:13.980808 16537 net.cpp:696] Copying source layer pool2
+I0906 13:59:13.980818 16537 net.cpp:696] Copying source layer conv3
+I0906 13:59:13.981422 16537 net.cpp:696] Copying source layer relu3
+I0906 13:59:13.981437 16537 net.cpp:696] Copying source layer conv4
+I0906 13:59:13.982098 16537 net.cpp:696] Copying source layer relu4
+I0906 13:59:13.982115 16537 net.cpp:696] Copying source layer conv5
+I0906 13:59:13.982612 16537 net.cpp:696] Copying source layer relu5
+I0906 13:59:13.982626 16537 net.cpp:696] Copying source layer pool5
+I0906 13:59:13.982636 16537 net.cpp:696] Copying source layer fc6
+I0906 13:59:13.993058 16537 net.cpp:696] Copying source layer relu6
+I0906 13:59:13.993091 16537 net.cpp:696] Copying source layer fc7
+I0906 13:59:13.997967 16537 net.cpp:696] Copying source layer relu7
+I0906 13:59:13.997984 16537 net.cpp:696] Copying source layer fc8
+I0906 13:59:13.998755 16537 net.cpp:696] Copying source layer loss
+I0906 13:59:13.998867 16537 base_data_layer.cpp:89] Thread joined
+I0906 13:59:14.003283 16537 base_data_layer.cpp:93] Prefetch copied
+I0906 13:59:14.003650 16537 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:59:14.096194 16546 data_layer.cpp:120] Prefetch batch: 92 ms.
+I0906 13:59:14.096225 16546 data_layer.cpp:121]      Read time: 12.131 ms.
+I0906 13:59:14.096233 16546 data_layer.cpp:122] Transform time: 79.106 ms.
+I0906 13:59:17.032117 16537 solver.cpp:363]     Test net output #0: accuracy = 0
+I0906 13:59:17.032146 16537 solver.cpp:363]     Test net output #1: loss = 6.91124 (* 1 = 6.91124 loss)
+I0906 13:59:17.032196 16537 base_data_layer.cpp:89] Thread joined
+I0906 13:59:17.041095 16537 base_data_layer.cpp:93] Prefetch copied
+I0906 13:59:17.041471 16537 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:59:17.232076 16547 data_layer.cpp:120] Prefetch batch: 190 ms.
+I0906 13:59:17.232108 16547 data_layer.cpp:121]      Read time: 24.399 ms.
+I0906 13:59:17.232116 16547 data_layer.cpp:122] Transform time: 164.272 ms.
+I0906 13:59:23.802855 16537 solver.cpp:234] Iteration 0, loss = 0
+I0906 13:59:23.802914 16537 solver.cpp:249]     Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss)
+I0906 13:59:23.802963 16537 solver.cpp:506] Iteration 0, lr = 0.01
+I0906 13:59:23.918314 16537 base_data_layer.cpp:89] Thread joined
+I0906 13:59:23.926301 16537 base_data_layer.cpp:93] Prefetch copied
+I0906 13:59:23.926447 16537 base_data_layer.cpp:104] CreatePrefetchThread
+I0906 13:59:24.110566 16549 data_layer.cpp:120] Prefetch batch: 183 ms.
+I0906 13:59:24.110599 16549 data_layer.cpp:121]      Read time: 23.839 ms.
+I0906 13:59:24.110605 16549 data_layer.cpp:122] Transform time: 158.415 ms.
+I0906 13:59:26.694295 16537 solver.cpp:234] Iteration 1, loss = 0
diff --git a/log/caffe.INFO b/log/caffe.INFO
new file mode 120000
index 00000000..65520a80
--- /dev/null
+++ b/log/caffe.INFO
@@ -0,0 +1 @@
+caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537
\ No newline at end of file
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index df2de2e0..7e745410 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -33,7 +33,7 @@
 #include <dirent.h>
 
 namespace caffe {
-char* buildOption = "-x clc++ ";
+string buildOption = "-x clc++ ";
 std::string oclKernelPath = "./src/caffe/ocl/";
 Device amdDevice;
 
@@ -148,7 +148,7 @@ void Device::BuildProgram(std::string kernel_dir)
     if(NULL == Program){
         fprintf(stderr,"Err: Failed to create program\n");
     }
-    cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption, NULL, NULL);
+    cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), NULL, NULL);
     LOG(INFO) << "Build Program";
     if(CL_SUCCESS != iStatus){
         fprintf(stderr,"Err: Failed to build program\n");
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 5ea9b6b5..cd9d2ef5 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -229,7 +229,7 @@ void Solver<Dtype>::Step(int iters) {
       losses[idx] = loss;
       printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, losses[idx], idx);
     }
-       printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %d \n", smoothed_loss,average_loss, losses.size());
+      printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n", smoothed_loss,average_loss, losses.size());
     if (display) {
       LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
       const vector<Blob<Dtype>*>& result = net_->output_blobs();
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 123b0053..8cf9bc7b 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -48,16 +48,6 @@ if (cpu_ptr_ && own_cpu_data_) {
   }
 
   clReleaseKernel(oclmem_kernel);
-/*  if (cpu_ptr_ && own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
-  }
-
-#ifndef CPU_ONLY
-  if (gpu_ptr_) {
-    CUDA_CHECK(cudaFree(gpu_ptr_));
-  }
-#endif  // CPU_ONLY
-*/
 }	
 
 void SyncedMemory::ocl_setup() {
@@ -69,13 +59,7 @@ void SyncedMemory::ocl_setup() {
 inline void SyncedMemory::to_cpu() {
 switch (head_) {
   case UNINITIALIZED:
-    //allocate pre-pinned memory
-    //pinned_buffer_ptr_
-   // if(data_layer_){
-   // gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_USE_PERSISTENT_MEM_AMD, size_, NULL, NULL);
-   // }
-   // else{
-      gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
+    gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
     //}
     cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL);
     memset(cpu_ptr_, 0, size_);
@@ -151,17 +135,6 @@ const void* SyncedMemory::cpu_data() {
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
-/*CHECK(data);
-  if (own_cpu_data_) {
-  OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL));
-  OCL_CHECK( clReleaseMemObject((cl_mem) gpu_cache_ptr_));
-  clFinish(amdDevice.CommandQueue); //is this necessary?
-  }
-  gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_USE_HOST_PTR, size_, data, NULL);
-  cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL);
-  head_ = HEAD_AT_CPU;
-  own_cpu_data_ = false;
-*/
   CHECK(data);
   if (own_cpu_data_) {
     CaffeFreeHost(cpu_ptr_);
@@ -196,8 +169,10 @@ void* SyncedMemory::mutable_gpu_data() {
 #endif
 }
 
-const void *SyncedMemory::gpu_cache_data()
-{
+const void *SyncedMemory::gpu_cache_data() {
+  return 0;
 }
+
+
 }  // namespace caffe
 

From f96ca7623084ed162e94f952f606a07d72e9956d Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 6 Sep 2015 14:02:55 +0800
Subject: [PATCH 058/124] Clean up the last two warings

---
 ...SEARCH.yugao.log.INFO.20150906-133002.7951 | 1250 -----------------
 ...SEARCH.yugao.log.INFO.20150906-133358.8300 | 1208 ----------------
 ...SEARCH.yugao.log.INFO.20150906-133437.8316 | 1208 ----------------
 ...EARCH.yugao.log.INFO.20150906-135805.16515 | 1160 ---------------
 ...EARCH.yugao.log.INFO.20150906-135855.16537 | 1208 ----------------
 log/caffe.INFO                                |    1 -
 6 files changed, 6035 deletions(-)
 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951
 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300
 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316
 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515
 delete mode 100644 log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537
 delete mode 120000 log/caffe.INFO

diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951
deleted file mode 100644
index c75e1aaa..00000000
--- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133002.7951
+++ /dev/null
@@ -1,1250 +0,0 @@
-Log file created at: 2015/09/06 13:30:02
-Running on machine: AMD-RESEARCH
-Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
-I0906 13:30:02.150327  7951 caffe.cpp:114] Use GPU with device ID 0
-I0906 13:30:02.187862  7951 device.cpp:230] Number of platforms found:1
-I0906 13:30:02.187903  7951 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
-I0906 13:30:02.187918  7951 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
-I0906 13:30:02.187973  7951 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
-I0906 13:30:02.187980  7951 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
-I0906 13:30:02.187991  7951 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
-I0906 13:30:02.188000  7951 device.cpp:286] Number of devices found:1
-I0906 13:30:02.188005  7951 device.cpp:288] 	DeviceID:	0x2171230
-I0906 13:30:02.188025  7951 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
-I0906 13:30:02.188033  7951 device.cpp:393] 	Is it integrated GPU?:	0
-I0906 13:30:02.188038  7951 device.cpp:393] 	Max clock frequency MHz:	930
-I0906 13:30:02.188043  7951 device.cpp:393] 	Host-Device unified mem:	0
-I0906 13:30:02.188048  7951 device.cpp:393] 	ECC support:	0
-I0906 13:30:02.188052  7951 device.cpp:393] 	Endian little:	1
-I0906 13:30:02.188056  7951 device.cpp:393] 	Max compute units:	44
-I0906 13:30:02.188061  7951 device.cpp:393] 	Max work group size:	256
-I0906 13:30:02.188066  7951 device.cpp:393] 	Max work item dimensions:	3
-I0906 13:30:02.188072  7951 device.cpp:393] 	Max work item sizes:	0x100
-I0906 13:30:02.188078  7951 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
-I0906 13:30:02.188083  7951 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
-I0906 13:30:02.188088  7951 device.cpp:393] 	Max mem alloc size:	4244635648
-I0906 13:30:02.188092  7951 device.cpp:393] 	Global mem size:	16878927872
-I0906 13:30:02.188097  7951 device.cpp:393] 	Local mem size:	32768
-I0906 13:30:02.188107  7951 device.cpp:96] Picked device type : GPU 0
-I0906 13:30:04.630481  7951 device.cpp:152] Build Program
-I0906 13:30:04.630708  7951 caffe.cpp:122] Starting Optimization
-I0906 13:30:04.630797  7951 solver.cpp:40] Initializing solver from parameters: 
-test_iter: 1
-test_interval: 1000
-base_lr: 0.01
-display: 1
-max_iter: 450000
-lr_policy: "step"
-gamma: 0.1
-momentum: 0.9
-weight_decay: 0.0005
-stepsize: 100000
-snapshot: 10000
-snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
-solver_mode: GPU
-net: "models/bvlc_alexnet/train_val.prototxt"
-I0906 13:30:04.630909  7951 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val.prototxt
-I0906 13:30:04.632081  7951 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
-I0906 13:30:04.632134  7951 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
-I0906 13:30:04.632319  7951 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TRAIN
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TRAIN
-  }
-  transform_param {
-    mirror: true
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
-    batch_size: 256
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "drop6"
-  type: "Dropout"
-  bottom: "fc6"
-  top: "fc6"
-  dropout_param {
-    dropout_ratio: 0.5
-  }
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "drop7"
-  type: "Dropout"
-  bottom: "fc7"
-  top: "fc7"
-  dropout_param {
-    dropout_ratio: 0.5
-  }
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:30:04.632813  7951 net.cpp:68] Memory required for data: 0
-I0906 13:30:04.632977  7951 layer_factory.hpp:74] Creating layer data
-I0906 13:30:04.633033  7951 net.cpp:91] Creating Layer data
-I0906 13:30:04.633055  7951 net.cpp:369] data -> data
-I0906 13:30:04.633160  7951 net.cpp:369] data -> label
-I0906 13:30:04.633183  7951 net.cpp:121] Setting up data
-I0906 13:30:04.633196  7951 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:30:04.642779  7951 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
-I0906 13:30:04.643064  7951 data_layer.cpp:53] output data size: 256,3,227,227
-I0906 13:30:04.723888  7951 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:30:04.724091  7951 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:30:04.724150  7951 net.cpp:128] Top shape: 256 3 227 227 (39574272)
-I0906 13:30:04.724161  7951 net.cpp:128] Top shape: 256 (256)
-I0906 13:30:04.724165  7951 net.cpp:134] Memory required for data: 158298112
-I0906 13:30:04.724201  7951 layer_factory.hpp:74] Creating layer conv1
-I0906 13:30:04.724283  7951 net.cpp:91] Creating Layer conv1
-I0906 13:30:04.724328  7951 net.cpp:411] conv1 <- data
-I0906 13:30:04.724383  7951 net.cpp:369] conv1 -> conv1
-I0906 13:30:04.724417  7951 net.cpp:121] Setting up conv1
-I0906 13:30:04.729287  7951 net.cpp:128] Top shape: 256 96 55 55 (74342400)
-I0906 13:30:04.729295  7951 net.cpp:134] Memory required for data: 455667712
-I0906 13:30:04.729333  7951 layer_factory.hpp:74] Creating layer relu1
-I0906 13:30:04.729357  7951 net.cpp:91] Creating Layer relu1
-I0906 13:30:04.729362  7951 net.cpp:411] relu1 <- conv1
-I0906 13:30:04.729377  7951 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:30:04.729385  7951 net.cpp:121] Setting up relu1
-I0906 13:30:04.729408  7951 net.cpp:128] Top shape: 256 96 55 55 (74342400)
-I0906 13:30:04.729411  7951 net.cpp:134] Memory required for data: 753037312
-I0906 13:30:04.729416  7951 layer_factory.hpp:74] Creating layer norm1
-I0906 13:30:04.729444  7951 net.cpp:91] Creating Layer norm1
-I0906 13:30:04.729450  7951 net.cpp:411] norm1 <- conv1
-I0906 13:30:04.729463  7951 net.cpp:369] norm1 -> norm1
-I0906 13:30:04.729476  7951 net.cpp:121] Setting up norm1
-I0906 13:30:04.729499  7951 net.cpp:128] Top shape: 256 96 55 55 (74342400)
-I0906 13:30:04.729504  7951 net.cpp:134] Memory required for data: 1050406912
-I0906 13:30:04.729509  7951 layer_factory.hpp:74] Creating layer pool1
-I0906 13:30:04.729532  7951 net.cpp:91] Creating Layer pool1
-I0906 13:30:04.729537  7951 net.cpp:411] pool1 <- norm1
-I0906 13:30:04.729550  7951 net.cpp:369] pool1 -> pool1
-I0906 13:30:04.729564  7951 net.cpp:121] Setting up pool1
-I0906 13:30:04.729591  7951 net.cpp:128] Top shape: 256 96 27 27 (17915904)
-I0906 13:30:04.729596  7951 net.cpp:134] Memory required for data: 1122070528
-I0906 13:30:04.729600  7951 layer_factory.hpp:74] Creating layer conv2
-I0906 13:30:04.729614  7951 net.cpp:91] Creating Layer conv2
-I0906 13:30:04.729619  7951 net.cpp:411] conv2 <- pool1
-I0906 13:30:04.729635  7951 net.cpp:369] conv2 -> conv2
-I0906 13:30:04.729647  7951 net.cpp:121] Setting up conv2
-I0906 13:30:04.769634  7951 net.cpp:128] Top shape: 256 256 27 27 (47775744)
-I0906 13:30:04.769649  7951 net.cpp:134] Memory required for data: 1313173504
-I0906 13:30:04.769673  7951 layer_factory.hpp:74] Creating layer relu2
-I0906 13:30:04.769695  7951 net.cpp:91] Creating Layer relu2
-I0906 13:30:04.769704  7951 net.cpp:411] relu2 <- conv2
-I0906 13:30:04.769722  7951 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:30:04.769736  7951 net.cpp:121] Setting up relu2
-I0906 13:30:04.769744  7951 net.cpp:128] Top shape: 256 256 27 27 (47775744)
-I0906 13:30:04.769748  7951 net.cpp:134] Memory required for data: 1504276480
-I0906 13:30:04.769752  7951 layer_factory.hpp:74] Creating layer norm2
-I0906 13:30:04.769769  7951 net.cpp:91] Creating Layer norm2
-I0906 13:30:04.769775  7951 net.cpp:411] norm2 <- conv2
-I0906 13:30:04.769788  7951 net.cpp:369] norm2 -> norm2
-I0906 13:30:04.769800  7951 net.cpp:121] Setting up norm2
-I0906 13:30:04.769820  7951 net.cpp:128] Top shape: 256 256 27 27 (47775744)
-I0906 13:30:04.769825  7951 net.cpp:134] Memory required for data: 1695379456
-I0906 13:30:04.769829  7951 layer_factory.hpp:74] Creating layer pool2
-I0906 13:30:04.769850  7951 net.cpp:91] Creating Layer pool2
-I0906 13:30:04.769856  7951 net.cpp:411] pool2 <- norm2
-I0906 13:30:04.769870  7951 net.cpp:369] pool2 -> pool2
-I0906 13:30:04.769927  7951 net.cpp:121] Setting up pool2
-I0906 13:30:04.769944  7951 net.cpp:128] Top shape: 256 256 13 13 (11075584)
-I0906 13:30:04.769949  7951 net.cpp:134] Memory required for data: 1739681792
-I0906 13:30:04.769953  7951 layer_factory.hpp:74] Creating layer conv3
-I0906 13:30:04.769975  7951 net.cpp:91] Creating Layer conv3
-I0906 13:30:04.769981  7951 net.cpp:411] conv3 <- pool2
-I0906 13:30:04.769996  7951 net.cpp:369] conv3 -> conv3
-I0906 13:30:04.770010  7951 net.cpp:121] Setting up conv3
-I0906 13:30:04.886401  7951 net.cpp:128] Top shape: 256 384 13 13 (16613376)
-I0906 13:30:04.886425  7951 net.cpp:134] Memory required for data: 1806135296
-I0906 13:30:04.886471  7951 layer_factory.hpp:74] Creating layer relu3
-I0906 13:30:04.886507  7951 net.cpp:91] Creating Layer relu3
-I0906 13:30:04.886521  7951 net.cpp:411] relu3 <- conv3
-I0906 13:30:04.886548  7951 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:30:04.886565  7951 net.cpp:121] Setting up relu3
-I0906 13:30:04.886575  7951 net.cpp:128] Top shape: 256 384 13 13 (16613376)
-I0906 13:30:04.886579  7951 net.cpp:134] Memory required for data: 1872588800
-I0906 13:30:04.886584  7951 layer_factory.hpp:74] Creating layer conv4
-I0906 13:30:04.886611  7951 net.cpp:91] Creating Layer conv4
-I0906 13:30:04.886617  7951 net.cpp:411] conv4 <- conv3
-I0906 13:30:04.886633  7951 net.cpp:369] conv4 -> conv4
-I0906 13:30:04.886648  7951 net.cpp:121] Setting up conv4
-I0906 13:30:04.973788  7951 net.cpp:128] Top shape: 256 384 13 13 (16613376)
-I0906 13:30:04.973810  7951 net.cpp:134] Memory required for data: 1939042304
-I0906 13:30:04.973840  7951 layer_factory.hpp:74] Creating layer relu4
-I0906 13:30:04.973875  7951 net.cpp:91] Creating Layer relu4
-I0906 13:30:04.973891  7951 net.cpp:411] relu4 <- conv4
-I0906 13:30:04.973918  7951 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:30:04.973935  7951 net.cpp:121] Setting up relu4
-I0906 13:30:04.973945  7951 net.cpp:128] Top shape: 256 384 13 13 (16613376)
-I0906 13:30:04.973949  7951 net.cpp:134] Memory required for data: 2005495808
-I0906 13:30:04.973954  7951 layer_factory.hpp:74] Creating layer conv5
-I0906 13:30:04.973980  7951 net.cpp:91] Creating Layer conv5
-I0906 13:30:04.973986  7951 net.cpp:411] conv5 <- conv4
-I0906 13:30:04.974004  7951 net.cpp:369] conv5 -> conv5
-I0906 13:30:04.974019  7951 net.cpp:121] Setting up conv5
-I0906 13:30:05.032649  7951 net.cpp:128] Top shape: 256 256 13 13 (11075584)
-I0906 13:30:05.032670  7951 net.cpp:134] Memory required for data: 2049798144
-I0906 13:30:05.032712  7951 layer_factory.hpp:74] Creating layer relu5
-I0906 13:30:05.032747  7951 net.cpp:91] Creating Layer relu5
-I0906 13:30:05.032763  7951 net.cpp:411] relu5 <- conv5
-I0906 13:30:05.032788  7951 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:30:05.032805  7951 net.cpp:121] Setting up relu5
-I0906 13:30:05.032814  7951 net.cpp:128] Top shape: 256 256 13 13 (11075584)
-I0906 13:30:05.032819  7951 net.cpp:134] Memory required for data: 2094100480
-I0906 13:30:05.032824  7951 layer_factory.hpp:74] Creating layer pool5
-I0906 13:30:05.032843  7951 net.cpp:91] Creating Layer pool5
-I0906 13:30:05.032850  7951 net.cpp:411] pool5 <- conv5
-I0906 13:30:05.032863  7951 net.cpp:369] pool5 -> pool5
-I0906 13:30:05.032877  7951 net.cpp:121] Setting up pool5
-I0906 13:30:05.032897  7951 net.cpp:128] Top shape: 256 256 6 6 (2359296)
-I0906 13:30:05.032902  7951 net.cpp:134] Memory required for data: 2103537664
-I0906 13:30:05.032907  7951 layer_factory.hpp:74] Creating layer fc6
-I0906 13:30:05.032945  7951 net.cpp:91] Creating Layer fc6
-I0906 13:30:05.032951  7951 net.cpp:411] fc6 <- pool5
-I0906 13:30:05.032966  7951 net.cpp:369] fc6 -> fc6
-I0906 13:30:05.032980  7951 net.cpp:121] Setting up fc6
-I0906 13:30:05.203193  7955 data_layer.cpp:120] Prefetch batch: 478 ms.
-I0906 13:30:05.203241  7955 data_layer.cpp:121]      Read time: 65.301 ms.
-I0906 13:30:05.203250  7955 data_layer.cpp:122] Transform time: 409.394 ms.
-I0906 13:30:09.817406  7951 net.cpp:128] Top shape: 256 4096 (1048576)
-I0906 13:30:09.817432  7951 net.cpp:134] Memory required for data: 2107731968
-I0906 13:30:09.817504  7951 layer_factory.hpp:74] Creating layer relu6
-I0906 13:30:09.817538  7951 net.cpp:91] Creating Layer relu6
-I0906 13:30:09.817553  7951 net.cpp:411] relu6 <- fc6
-I0906 13:30:09.817579  7951 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:30:09.817595  7951 net.cpp:121] Setting up relu6
-I0906 13:30:09.817605  7951 net.cpp:128] Top shape: 256 4096 (1048576)
-I0906 13:30:09.817608  7951 net.cpp:134] Memory required for data: 2111926272
-I0906 13:30:09.817613  7951 layer_factory.hpp:74] Creating layer drop6
-I0906 13:30:09.817643  7951 net.cpp:91] Creating Layer drop6
-I0906 13:30:09.817649  7951 net.cpp:411] drop6 <- fc6
-I0906 13:30:09.817662  7951 net.cpp:358] drop6 -> fc6 (in-place)
-I0906 13:30:09.817672  7951 net.cpp:121] Setting up drop6
-I0906 13:30:09.817692  7951 net.cpp:128] Top shape: 256 4096 (1048576)
-I0906 13:30:09.817695  7951 net.cpp:134] Memory required for data: 2116120576
-I0906 13:30:09.817700  7951 layer_factory.hpp:74] Creating layer fc7
-I0906 13:30:09.817721  7951 net.cpp:91] Creating Layer fc7
-I0906 13:30:09.817728  7951 net.cpp:411] fc7 <- fc6
-I0906 13:30:09.817744  7951 net.cpp:369] fc7 -> fc7
-I0906 13:30:09.817759  7951 net.cpp:121] Setting up fc7
-I0906 13:30:11.938176  7951 net.cpp:128] Top shape: 256 4096 (1048576)
-I0906 13:30:11.938201  7951 net.cpp:134] Memory required for data: 2120314880
-I0906 13:30:11.938230  7951 layer_factory.hpp:74] Creating layer relu7
-I0906 13:30:11.938263  7951 net.cpp:91] Creating Layer relu7
-I0906 13:30:11.938278  7951 net.cpp:411] relu7 <- fc7
-I0906 13:30:11.938305  7951 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:30:11.938321  7951 net.cpp:121] Setting up relu7
-I0906 13:30:11.938330  7951 net.cpp:128] Top shape: 256 4096 (1048576)
-I0906 13:30:11.938334  7951 net.cpp:134] Memory required for data: 2124509184
-I0906 13:30:11.938339  7951 layer_factory.hpp:74] Creating layer drop7
-I0906 13:30:11.938355  7951 net.cpp:91] Creating Layer drop7
-I0906 13:30:11.938360  7951 net.cpp:411] drop7 <- fc7
-I0906 13:30:11.938372  7951 net.cpp:358] drop7 -> fc7 (in-place)
-I0906 13:30:11.938382  7951 net.cpp:121] Setting up drop7
-I0906 13:30:11.938397  7951 net.cpp:128] Top shape: 256 4096 (1048576)
-I0906 13:30:11.938401  7951 net.cpp:134] Memory required for data: 2128703488
-I0906 13:30:11.938406  7951 layer_factory.hpp:74] Creating layer fc8
-I0906 13:30:11.938427  7951 net.cpp:91] Creating Layer fc8
-I0906 13:30:11.938433  7951 net.cpp:411] fc8 <- fc7
-I0906 13:30:11.938449  7951 net.cpp:369] fc8 -> fc8
-I0906 13:30:11.938464  7951 net.cpp:121] Setting up fc8
-I0906 13:30:12.468230  7951 net.cpp:128] Top shape: 256 1000 (256000)
-I0906 13:30:12.468251  7951 net.cpp:134] Memory required for data: 2129727488
-I0906 13:30:12.468279  7951 layer_factory.hpp:74] Creating layer loss
-I0906 13:30:12.468333  7951 net.cpp:91] Creating Layer loss
-I0906 13:30:12.468348  7951 net.cpp:411] loss <- fc8
-I0906 13:30:12.468370  7951 net.cpp:411] loss <- label
-I0906 13:30:12.468389  7951 net.cpp:369] loss -> loss
-I0906 13:30:12.468408  7951 net.cpp:121] Setting up loss
-I0906 13:30:12.468426  7951 layer_factory.hpp:74] Creating layer loss
-I0906 13:30:12.469732  7951 net.cpp:128] Top shape: (1)
-I0906 13:30:12.469740  7951 net.cpp:130]     with loss weight 1
-I0906 13:30:12.469756  7951 net.cpp:134] Memory required for data: 2129727492
-I0906 13:30:12.469769  7951 net.cpp:193] loss needs backward computation.
-I0906 13:30:12.469779  7951 net.cpp:193] fc8 needs backward computation.
-I0906 13:30:12.469784  7951 net.cpp:193] drop7 needs backward computation.
-I0906 13:30:12.469791  7951 net.cpp:193] relu7 needs backward computation.
-I0906 13:30:12.469796  7951 net.cpp:193] fc7 needs backward computation.
-I0906 13:30:12.469808  7951 net.cpp:193] drop6 needs backward computation.
-I0906 13:30:12.469815  7951 net.cpp:193] relu6 needs backward computation.
-I0906 13:30:12.469820  7951 net.cpp:193] fc6 needs backward computation.
-I0906 13:30:12.469825  7951 net.cpp:193] pool5 needs backward computation.
-I0906 13:30:12.469830  7951 net.cpp:193] relu5 needs backward computation.
-I0906 13:30:12.469835  7951 net.cpp:193] conv5 needs backward computation.
-I0906 13:30:12.469882  7951 net.cpp:193] relu4 needs backward computation.
-I0906 13:30:12.469887  7951 net.cpp:193] conv4 needs backward computation.
-I0906 13:30:12.469893  7951 net.cpp:193] relu3 needs backward computation.
-I0906 13:30:12.469899  7951 net.cpp:193] conv3 needs backward computation.
-I0906 13:30:12.469907  7951 net.cpp:193] pool2 needs backward computation.
-I0906 13:30:12.469913  7951 net.cpp:193] norm2 needs backward computation.
-I0906 13:30:12.469918  7951 net.cpp:193] relu2 needs backward computation.
-I0906 13:30:12.469924  7951 net.cpp:193] conv2 needs backward computation.
-I0906 13:30:12.469930  7951 net.cpp:193] pool1 needs backward computation.
-I0906 13:30:12.469936  7951 net.cpp:193] norm1 needs backward computation.
-I0906 13:30:12.469943  7951 net.cpp:193] relu1 needs backward computation.
-I0906 13:30:12.469949  7951 net.cpp:193] conv1 needs backward computation.
-I0906 13:30:12.469955  7951 net.cpp:195] data does not need backward computation.
-I0906 13:30:12.469962  7951 net.cpp:236] This network produces output loss
-I0906 13:30:12.470002  7951 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:30:12.470018  7951 net.cpp:248] Network initialization done.
-I0906 13:30:12.470022  7951 net.cpp:249] Memory required for data: 2129727492
-I0906 13:30:12.470949  7951 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val.prototxt
-I0906 13:30:12.471081  7951 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
-I0906 13:30:12.471318  7951 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TEST
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TEST
-  }
-  transform_param {
-    mirror: false
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
-    batch_size: 50
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "drop6"
-  type: "Dropout"
-  bottom: "fc6"
-  top: "fc6"
-  dropout_param {
-    dropout_ratio: 0.5
-  }
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "drop7"
-  type: "Dropout"
-  bottom: "fc7"
-  top: "fc7"
-  dropout_param {
-    dropout_ratio: 0.5
-  }
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "accuracy"
-  type: "Accuracy"
-  bottom: "fc8"
-  bottom: "label"
-  top: "accuracy"
-  include {
-    phase: TEST
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:30:12.471688  7951 net.cpp:68] Memory required for data: 0
-I0906 13:30:12.471739  7951 layer_factory.hpp:74] Creating layer data
-I0906 13:30:12.471761  7951 net.cpp:91] Creating Layer data
-I0906 13:30:12.471772  7951 net.cpp:369] data -> data
-I0906 13:30:12.471796  7951 net.cpp:369] data -> label
-I0906 13:30:12.471810  7951 net.cpp:121] Setting up data
-I0906 13:30:12.471817  7951 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:30:12.482815  7951 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
-I0906 13:30:12.483065  7951 data_layer.cpp:53] output data size: 50,3,227,227
-I0906 13:30:12.546061  7951 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:30:12.546188  7951 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:30:12.546222  7951 net.cpp:128] Top shape: 50 3 227 227 (7729350)
-I0906 13:30:12.546231  7951 net.cpp:128] Top shape: 50 (50)
-I0906 13:30:12.546236  7951 net.cpp:134] Memory required for data: 30917600
-I0906 13:30:12.546268  7951 layer_factory.hpp:74] Creating layer label_data_1_split
-I0906 13:30:12.546334  7951 net.cpp:91] Creating Layer label_data_1_split
-I0906 13:30:12.546380  7951 net.cpp:411] label_data_1_split <- label
-I0906 13:30:12.546419  7951 net.cpp:369] label_data_1_split -> label_data_1_split_0
-I0906 13:30:12.546460  7951 net.cpp:369] label_data_1_split -> label_data_1_split_1
-I0906 13:30:12.546520  7951 net.cpp:121] Setting up label_data_1_split
-I0906 13:30:12.546551  7951 net.cpp:128] Top shape: 50 (50)
-I0906 13:30:12.546558  7951 net.cpp:128] Top shape: 50 (50)
-I0906 13:30:12.546561  7951 net.cpp:134] Memory required for data: 30918000
-I0906 13:30:12.546567  7951 layer_factory.hpp:74] Creating layer conv1
-I0906 13:30:12.546602  7951 net.cpp:91] Creating Layer conv1
-I0906 13:30:12.546608  7951 net.cpp:411] conv1 <- data
-I0906 13:30:12.546624  7951 net.cpp:369] conv1 -> conv1
-I0906 13:30:12.546638  7951 net.cpp:121] Setting up conv1
-I0906 13:30:12.551349  7951 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:30:12.551354  7951 net.cpp:134] Memory required for data: 88998000
-I0906 13:30:12.551374  7951 layer_factory.hpp:74] Creating layer relu1
-I0906 13:30:12.551388  7951 net.cpp:91] Creating Layer relu1
-I0906 13:30:12.551393  7951 net.cpp:411] relu1 <- conv1
-I0906 13:30:12.551405  7951 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:30:12.551415  7951 net.cpp:121] Setting up relu1
-I0906 13:30:12.551422  7951 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:30:12.551426  7951 net.cpp:134] Memory required for data: 147078000
-I0906 13:30:12.551431  7951 layer_factory.hpp:74] Creating layer norm1
-I0906 13:30:12.551451  7951 net.cpp:91] Creating Layer norm1
-I0906 13:30:12.551457  7951 net.cpp:411] norm1 <- conv1
-I0906 13:30:12.551470  7951 net.cpp:369] norm1 -> norm1
-I0906 13:30:12.551481  7951 net.cpp:121] Setting up norm1
-I0906 13:30:12.551499  7951 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:30:12.551504  7951 net.cpp:134] Memory required for data: 205158000
-I0906 13:30:12.551508  7951 layer_factory.hpp:74] Creating layer pool1
-I0906 13:30:12.551524  7951 net.cpp:91] Creating Layer pool1
-I0906 13:30:12.551530  7951 net.cpp:411] pool1 <- norm1
-I0906 13:30:12.551543  7951 net.cpp:369] pool1 -> pool1
-I0906 13:30:12.551553  7951 net.cpp:121] Setting up pool1
-I0906 13:30:12.551571  7951 net.cpp:128] Top shape: 50 96 27 27 (3499200)
-I0906 13:30:12.551576  7951 net.cpp:134] Memory required for data: 219154800
-I0906 13:30:12.551580  7951 layer_factory.hpp:74] Creating layer conv2
-I0906 13:30:12.551594  7951 net.cpp:91] Creating Layer conv2
-I0906 13:30:12.551600  7951 net.cpp:411] conv2 <- pool1
-I0906 13:30:12.551615  7951 net.cpp:369] conv2 -> conv2
-I0906 13:30:12.551627  7951 net.cpp:121] Setting up conv2
-I0906 13:30:12.591382  7951 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:30:12.591404  7951 net.cpp:134] Memory required for data: 256479600
-I0906 13:30:12.591442  7951 layer_factory.hpp:74] Creating layer relu2
-I0906 13:30:12.591473  7951 net.cpp:91] Creating Layer relu2
-I0906 13:30:12.591486  7951 net.cpp:411] relu2 <- conv2
-I0906 13:30:12.591511  7951 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:30:12.591526  7951 net.cpp:121] Setting up relu2
-I0906 13:30:12.591536  7951 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:30:12.591539  7951 net.cpp:134] Memory required for data: 293804400
-I0906 13:30:12.591544  7951 layer_factory.hpp:74] Creating layer norm2
-I0906 13:30:12.591572  7951 net.cpp:91] Creating Layer norm2
-I0906 13:30:12.591578  7951 net.cpp:411] norm2 <- conv2
-I0906 13:30:12.591591  7951 net.cpp:369] norm2 -> norm2
-I0906 13:30:12.591609  7951 net.cpp:121] Setting up norm2
-I0906 13:30:12.591629  7951 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:30:12.591634  7951 net.cpp:134] Memory required for data: 331129200
-I0906 13:30:12.591639  7951 layer_factory.hpp:74] Creating layer pool2
-I0906 13:30:12.591657  7951 net.cpp:91] Creating Layer pool2
-I0906 13:30:12.591663  7951 net.cpp:411] pool2 <- norm2
-I0906 13:30:12.591676  7951 net.cpp:369] pool2 -> pool2
-I0906 13:30:12.591687  7951 net.cpp:121] Setting up pool2
-I0906 13:30:12.591706  7951 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:30:12.591709  7951 net.cpp:134] Memory required for data: 339782000
-I0906 13:30:12.591714  7951 layer_factory.hpp:74] Creating layer conv3
-I0906 13:30:12.591739  7951 net.cpp:91] Creating Layer conv3
-I0906 13:30:12.591744  7951 net.cpp:411] conv3 <- pool2
-I0906 13:30:12.591802  7951 net.cpp:369] conv3 -> conv3
-I0906 13:30:12.591814  7951 net.cpp:121] Setting up conv3
-I0906 13:30:12.640625  7956 data_layer.cpp:120] Prefetch batch: 94 ms.
-I0906 13:30:12.640658  7956 data_layer.cpp:121]      Read time: 12.07 ms.
-I0906 13:30:12.640666  7956 data_layer.cpp:122] Transform time: 81.163 ms.
-I0906 13:30:12.705313  7951 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:30:12.705337  7951 net.cpp:134] Memory required for data: 352761200
-I0906 13:30:12.705377  7951 layer_factory.hpp:74] Creating layer relu3
-I0906 13:30:12.705410  7951 net.cpp:91] Creating Layer relu3
-I0906 13:30:12.705425  7951 net.cpp:411] relu3 <- conv3
-I0906 13:30:12.705451  7951 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:30:12.705466  7951 net.cpp:121] Setting up relu3
-I0906 13:30:12.705476  7951 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:30:12.705479  7951 net.cpp:134] Memory required for data: 365740400
-I0906 13:30:12.705484  7951 layer_factory.hpp:74] Creating layer conv4
-I0906 13:30:12.705512  7951 net.cpp:91] Creating Layer conv4
-I0906 13:30:12.705518  7951 net.cpp:411] conv4 <- conv3
-I0906 13:30:12.705534  7951 net.cpp:369] conv4 -> conv4
-I0906 13:30:12.705549  7951 net.cpp:121] Setting up conv4
-I0906 13:30:12.789549  7951 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:30:12.789571  7951 net.cpp:134] Memory required for data: 378719600
-I0906 13:30:12.789597  7951 layer_factory.hpp:74] Creating layer relu4
-I0906 13:30:12.789631  7951 net.cpp:91] Creating Layer relu4
-I0906 13:30:12.789646  7951 net.cpp:411] relu4 <- conv4
-I0906 13:30:12.789674  7951 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:30:12.789690  7951 net.cpp:121] Setting up relu4
-I0906 13:30:12.789698  7951 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:30:12.789701  7951 net.cpp:134] Memory required for data: 391698800
-I0906 13:30:12.789706  7951 layer_factory.hpp:74] Creating layer conv5
-I0906 13:30:12.789732  7951 net.cpp:91] Creating Layer conv5
-I0906 13:30:12.789738  7951 net.cpp:411] conv5 <- conv4
-I0906 13:30:12.789754  7951 net.cpp:369] conv5 -> conv5
-I0906 13:30:12.789770  7951 net.cpp:121] Setting up conv5
-I0906 13:30:12.846217  7951 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:30:12.846233  7951 net.cpp:134] Memory required for data: 400351600
-I0906 13:30:12.846271  7951 layer_factory.hpp:74] Creating layer relu5
-I0906 13:30:12.846298  7951 net.cpp:91] Creating Layer relu5
-I0906 13:30:12.846312  7951 net.cpp:411] relu5 <- conv5
-I0906 13:30:12.846335  7951 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:30:12.846350  7951 net.cpp:121] Setting up relu5
-I0906 13:30:12.846359  7951 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:30:12.846362  7951 net.cpp:134] Memory required for data: 409004400
-I0906 13:30:12.846367  7951 layer_factory.hpp:74] Creating layer pool5
-I0906 13:30:12.846397  7951 net.cpp:91] Creating Layer pool5
-I0906 13:30:12.846402  7951 net.cpp:411] pool5 <- conv5
-I0906 13:30:12.846417  7951 net.cpp:369] pool5 -> pool5
-I0906 13:30:12.846431  7951 net.cpp:121] Setting up pool5
-I0906 13:30:12.846451  7951 net.cpp:128] Top shape: 50 256 6 6 (460800)
-I0906 13:30:12.846454  7951 net.cpp:134] Memory required for data: 410847600
-I0906 13:30:12.846459  7951 layer_factory.hpp:74] Creating layer fc6
-I0906 13:30:12.846479  7951 net.cpp:91] Creating Layer fc6
-I0906 13:30:12.846485  7951 net.cpp:411] fc6 <- pool5
-I0906 13:30:12.846499  7951 net.cpp:369] fc6 -> fc6
-I0906 13:30:12.846513  7951 net.cpp:121] Setting up fc6
-I0906 13:30:17.661206  7951 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:30:17.661231  7951 net.cpp:134] Memory required for data: 411666800
-I0906 13:30:17.661259  7951 layer_factory.hpp:74] Creating layer relu6
-I0906 13:30:17.661293  7951 net.cpp:91] Creating Layer relu6
-I0906 13:30:17.661309  7951 net.cpp:411] relu6 <- fc6
-I0906 13:30:17.661334  7951 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:30:17.661350  7951 net.cpp:121] Setting up relu6
-I0906 13:30:17.661360  7951 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:30:17.661363  7951 net.cpp:134] Memory required for data: 412486000
-I0906 13:30:17.661412  7951 layer_factory.hpp:74] Creating layer drop6
-I0906 13:30:17.661428  7951 net.cpp:91] Creating Layer drop6
-I0906 13:30:17.661434  7951 net.cpp:411] drop6 <- fc6
-I0906 13:30:17.661447  7951 net.cpp:358] drop6 -> fc6 (in-place)
-I0906 13:30:17.661456  7951 net.cpp:121] Setting up drop6
-I0906 13:30:17.661470  7951 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:30:17.661475  7951 net.cpp:134] Memory required for data: 413305200
-I0906 13:30:17.661480  7951 layer_factory.hpp:74] Creating layer fc7
-I0906 13:30:17.661501  7951 net.cpp:91] Creating Layer fc7
-I0906 13:30:17.661507  7951 net.cpp:411] fc7 <- fc6
-I0906 13:30:17.661523  7951 net.cpp:369] fc7 -> fc7
-I0906 13:30:17.661540  7951 net.cpp:121] Setting up fc7
-I0906 13:30:19.790464  7951 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:30:19.790488  7951 net.cpp:134] Memory required for data: 414124400
-I0906 13:30:19.790514  7951 layer_factory.hpp:74] Creating layer relu7
-I0906 13:30:19.790547  7951 net.cpp:91] Creating Layer relu7
-I0906 13:30:19.790563  7951 net.cpp:411] relu7 <- fc7
-I0906 13:30:19.790591  7951 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:30:19.790607  7951 net.cpp:121] Setting up relu7
-I0906 13:30:19.790616  7951 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:30:19.790621  7951 net.cpp:134] Memory required for data: 414943600
-I0906 13:30:19.790624  7951 layer_factory.hpp:74] Creating layer drop7
-I0906 13:30:19.790639  7951 net.cpp:91] Creating Layer drop7
-I0906 13:30:19.790645  7951 net.cpp:411] drop7 <- fc7
-I0906 13:30:19.790657  7951 net.cpp:358] drop7 -> fc7 (in-place)
-I0906 13:30:19.790668  7951 net.cpp:121] Setting up drop7
-I0906 13:30:19.790683  7951 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:30:19.790688  7951 net.cpp:134] Memory required for data: 415762800
-I0906 13:30:19.790691  7951 layer_factory.hpp:74] Creating layer fc8
-I0906 13:30:19.790714  7951 net.cpp:91] Creating Layer fc8
-I0906 13:30:19.790719  7951 net.cpp:411] fc8 <- fc7
-I0906 13:30:19.790735  7951 net.cpp:369] fc8 -> fc8
-I0906 13:30:19.790760  7951 net.cpp:121] Setting up fc8
-I0906 13:30:20.310474  7951 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:30:20.310497  7951 net.cpp:134] Memory required for data: 415962800
-I0906 13:30:20.310523  7951 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
-I0906 13:30:20.310555  7951 net.cpp:91] Creating Layer fc8_fc8_0_split
-I0906 13:30:20.310570  7951 net.cpp:411] fc8_fc8_0_split <- fc8
-I0906 13:30:20.310598  7951 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
-I0906 13:30:20.310621  7951 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
-I0906 13:30:20.310633  7951 net.cpp:121] Setting up fc8_fc8_0_split
-I0906 13:30:20.310650  7951 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:30:20.310657  7951 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:30:20.310660  7951 net.cpp:134] Memory required for data: 416362800
-I0906 13:30:20.310665  7951 layer_factory.hpp:74] Creating layer accuracy
-I0906 13:30:20.310698  7951 net.cpp:91] Creating Layer accuracy
-I0906 13:30:20.310704  7951 net.cpp:411] accuracy <- fc8_fc8_0_split_0
-I0906 13:30:20.310715  7951 net.cpp:411] accuracy <- label_data_1_split_0
-I0906 13:30:20.310729  7951 net.cpp:369] accuracy -> accuracy
-I0906 13:30:20.310740  7951 net.cpp:121] Setting up accuracy
-I0906 13:30:20.310756  7951 net.cpp:128] Top shape: (1)
-I0906 13:30:20.310760  7951 net.cpp:134] Memory required for data: 416362804
-I0906 13:30:20.310765  7951 layer_factory.hpp:74] Creating layer loss
-I0906 13:30:20.310777  7951 net.cpp:91] Creating Layer loss
-I0906 13:30:20.310782  7951 net.cpp:411] loss <- fc8_fc8_0_split_1
-I0906 13:30:20.310793  7951 net.cpp:411] loss <- label_data_1_split_1
-I0906 13:30:20.310804  7951 net.cpp:369] loss -> loss
-I0906 13:30:20.310816  7951 net.cpp:121] Setting up loss
-I0906 13:30:20.310825  7951 layer_factory.hpp:74] Creating layer loss
-I0906 13:30:20.311178  7951 net.cpp:128] Top shape: (1)
-I0906 13:30:20.311183  7951 net.cpp:130]     with loss weight 1
-I0906 13:30:20.311200  7951 net.cpp:134] Memory required for data: 416362808
-I0906 13:30:20.311250  7951 net.cpp:193] loss needs backward computation.
-I0906 13:30:20.311259  7951 net.cpp:195] accuracy does not need backward computation.
-I0906 13:30:20.311265  7951 net.cpp:193] fc8_fc8_0_split needs backward computation.
-I0906 13:30:20.311271  7951 net.cpp:193] fc8 needs backward computation.
-I0906 13:30:20.311277  7951 net.cpp:193] drop7 needs backward computation.
-I0906 13:30:20.311282  7951 net.cpp:193] relu7 needs backward computation.
-I0906 13:30:20.311288  7951 net.cpp:193] fc7 needs backward computation.
-I0906 13:30:20.311295  7951 net.cpp:193] drop6 needs backward computation.
-I0906 13:30:20.311300  7951 net.cpp:193] relu6 needs backward computation.
-I0906 13:30:20.311305  7951 net.cpp:193] fc6 needs backward computation.
-I0906 13:30:20.311311  7951 net.cpp:193] pool5 needs backward computation.
-I0906 13:30:20.311317  7951 net.cpp:193] relu5 needs backward computation.
-I0906 13:30:20.311322  7951 net.cpp:193] conv5 needs backward computation.
-I0906 13:30:20.311328  7951 net.cpp:193] relu4 needs backward computation.
-I0906 13:30:20.311333  7951 net.cpp:193] conv4 needs backward computation.
-I0906 13:30:20.311339  7951 net.cpp:193] relu3 needs backward computation.
-I0906 13:30:20.311345  7951 net.cpp:193] conv3 needs backward computation.
-I0906 13:30:20.311352  7951 net.cpp:193] pool2 needs backward computation.
-I0906 13:30:20.311357  7951 net.cpp:193] norm2 needs backward computation.
-I0906 13:30:20.311363  7951 net.cpp:193] relu2 needs backward computation.
-I0906 13:30:20.311368  7951 net.cpp:193] conv2 needs backward computation.
-I0906 13:30:20.311374  7951 net.cpp:193] pool1 needs backward computation.
-I0906 13:30:20.311380  7951 net.cpp:193] norm1 needs backward computation.
-I0906 13:30:20.311386  7951 net.cpp:193] relu1 needs backward computation.
-I0906 13:30:20.311391  7951 net.cpp:193] conv1 needs backward computation.
-I0906 13:30:20.311399  7951 net.cpp:195] label_data_1_split does not need backward computation.
-I0906 13:30:20.311406  7951 net.cpp:195] data does not need backward computation.
-I0906 13:30:20.311411  7951 net.cpp:236] This network produces output accuracy
-I0906 13:30:20.311419  7951 net.cpp:236] This network produces output loss
-I0906 13:30:20.311455  7951 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:30:20.311468  7951 net.cpp:248] Network initialization done.
-I0906 13:30:20.311472  7951 net.cpp:249] Memory required for data: 416362808
-I0906 13:30:20.311663  7951 solver.cpp:53] Solver scaffolding done.
-I0906 13:30:20.311787  7951 solver.cpp:270] Solving AlexNet
-I0906 13:30:20.311791  7951 solver.cpp:271] Learning Rate Policy: step
-I0906 13:30:20.313592  7951 solver.cpp:314] Iteration 0, Testing net (#0)
-I0906 13:30:20.313630  7951 net.cpp:696] Copying source layer data
-I0906 13:30:20.313635  7951 net.cpp:696] Copying source layer conv1
-I0906 13:30:20.316704  7951 net.cpp:696] Copying source layer relu1
-I0906 13:30:20.316743  7951 net.cpp:696] Copying source layer norm1
-I0906 13:30:20.316756  7951 net.cpp:696] Copying source layer pool1
-I0906 13:30:20.316766  7951 net.cpp:696] Copying source layer conv2
-I0906 13:30:20.317158  7951 net.cpp:696] Copying source layer relu2
-I0906 13:30:20.317173  7951 net.cpp:696] Copying source layer norm2
-I0906 13:30:20.317183  7951 net.cpp:696] Copying source layer pool2
-I0906 13:30:20.317193  7951 net.cpp:696] Copying source layer conv3
-I0906 13:30:20.317970  7951 net.cpp:696] Copying source layer relu3
-I0906 13:30:20.317983  7951 net.cpp:696] Copying source layer conv4
-I0906 13:30:20.318357  7951 net.cpp:696] Copying source layer relu4
-I0906 13:30:20.318372  7951 net.cpp:696] Copying source layer conv5
-I0906 13:30:20.318827  7951 net.cpp:696] Copying source layer relu5
-I0906 13:30:20.318840  7951 net.cpp:696] Copying source layer pool5
-I0906 13:30:20.318850  7951 net.cpp:696] Copying source layer fc6
-I0906 13:30:20.336436  7951 net.cpp:696] Copying source layer relu6
-I0906 13:30:20.336460  7951 net.cpp:696] Copying source layer drop6
-I0906 13:30:20.336467  7951 net.cpp:696] Copying sou
\ No newline at end of file
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300
deleted file mode 100644
index b99da3d4..00000000
--- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133358.8300
+++ /dev/null
@@ -1,1208 +0,0 @@
-Log file created at: 2015/09/06 13:33:58
-Running on machine: AMD-RESEARCH
-Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
-I0906 13:33:58.858449  8300 caffe.cpp:114] Use GPU with device ID 0
-I0906 13:33:58.896994  8300 device.cpp:230] Number of platforms found:1
-I0906 13:33:58.897037  8300 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
-I0906 13:33:58.897054  8300 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
-I0906 13:33:58.897061  8300 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
-I0906 13:33:58.897068  8300 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
-I0906 13:33:58.897075  8300 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
-I0906 13:33:58.897086  8300 device.cpp:286] Number of devices found:1
-I0906 13:33:58.897092  8300 device.cpp:288] 	DeviceID:	0x163a250
-I0906 13:33:58.897126  8300 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
-I0906 13:33:58.897137  8300 device.cpp:393] 	Is it integrated GPU?:	0
-I0906 13:33:58.897145  8300 device.cpp:393] 	Max clock frequency MHz:	930
-I0906 13:33:58.897151  8300 device.cpp:393] 	Host-Device unified mem:	0
-I0906 13:33:58.897157  8300 device.cpp:393] 	ECC support:	0
-I0906 13:33:58.897164  8300 device.cpp:393] 	Endian little:	1
-I0906 13:33:58.897171  8300 device.cpp:393] 	Max compute units:	44
-I0906 13:33:58.897177  8300 device.cpp:393] 	Max work group size:	256
-I0906 13:33:58.897186  8300 device.cpp:393] 	Max work item dimensions:	3
-I0906 13:33:58.897192  8300 device.cpp:393] 	Max work item sizes:	0x100
-I0906 13:33:58.897202  8300 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
-I0906 13:33:58.897209  8300 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
-I0906 13:33:58.897215  8300 device.cpp:393] 	Max mem alloc size:	4244635648
-I0906 13:33:58.897222  8300 device.cpp:393] 	Global mem size:	16878927872
-I0906 13:33:58.897228  8300 device.cpp:393] 	Local mem size:	32768
-I0906 13:33:58.897241  8300 device.cpp:96] Picked device type : GPU 0
-I0906 13:34:01.301823  8300 device.cpp:152] Build Program
-I0906 13:34:01.302049  8300 caffe.cpp:122] Starting Optimization
-I0906 13:34:01.302139  8300 solver.cpp:40] Initializing solver from parameters: 
-test_iter: 1
-test_interval: 1000
-base_lr: 0.01
-display: 1
-max_iter: 10
-lr_policy: "step"
-gamma: 0.1
-momentum: 0.9
-weight_decay: 0.0005
-stepsize: 100000
-snapshot: 10000
-snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
-solver_mode: GPU
-net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
-I0906 13:34:01.302249  8300 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
-I0906 13:34:01.303269  8300 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
-I0906 13:34:01.303316  8300 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
-I0906 13:34:01.303493  8300 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TRAIN
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TRAIN
-  }
-  transform_param {
-    mirror: true
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
-    batch_size: 100
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:34:01.303913  8300 net.cpp:68] Memory required for data: 0
-I0906 13:34:01.304132  8300 layer_factory.hpp:74] Creating layer data
-I0906 13:34:01.304185  8300 net.cpp:91] Creating Layer data
-I0906 13:34:01.304205  8300 net.cpp:369] data -> data
-I0906 13:34:01.304306  8300 net.cpp:369] data -> label
-I0906 13:34:01.304328  8300 net.cpp:121] Setting up data
-I0906 13:34:01.304342  8300 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:34:01.318087  8300 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
-I0906 13:34:01.318596  8300 data_layer.cpp:53] output data size: 100,3,227,227
-I0906 13:34:01.351816  8300 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:34:01.352555  8300 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:34:01.352643  8300 net.cpp:128] Top shape: 100 3 227 227 (15458700)
-I0906 13:34:01.352655  8300 net.cpp:128] Top shape: 100 (100)
-I0906 13:34:01.352660  8300 net.cpp:134] Memory required for data: 61835200
-I0906 13:34:01.352697  8300 layer_factory.hpp:74] Creating layer conv1
-I0906 13:34:01.352783  8300 net.cpp:91] Creating Layer conv1
-I0906 13:34:01.352808  8300 net.cpp:411] conv1 <- data
-I0906 13:34:01.352902  8300 net.cpp:369] conv1 -> conv1
-I0906 13:34:01.352937  8300 net.cpp:121] Setting up conv1
-I0906 13:34:01.357744  8300 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:34:01.357751  8300 net.cpp:134] Memory required for data: 177995200
-I0906 13:34:01.357791  8300 layer_factory.hpp:74] Creating layer relu1
-I0906 13:34:01.357815  8300 net.cpp:91] Creating Layer relu1
-I0906 13:34:01.357820  8300 net.cpp:411] relu1 <- conv1
-I0906 13:34:01.357833  8300 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:34:01.357843  8300 net.cpp:121] Setting up relu1
-I0906 13:34:01.357851  8300 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:34:01.357856  8300 net.cpp:134] Memory required for data: 294155200
-I0906 13:34:01.357861  8300 layer_factory.hpp:74] Creating layer norm1
-I0906 13:34:01.357890  8300 net.cpp:91] Creating Layer norm1
-I0906 13:34:01.357895  8300 net.cpp:411] norm1 <- conv1
-I0906 13:34:01.357908  8300 net.cpp:369] norm1 -> norm1
-I0906 13:34:01.357920  8300 net.cpp:121] Setting up norm1
-I0906 13:34:01.357944  8300 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:34:01.357949  8300 net.cpp:134] Memory required for data: 410315200
-I0906 13:34:01.357954  8300 layer_factory.hpp:74] Creating layer pool1
-I0906 13:34:01.357978  8300 net.cpp:91] Creating Layer pool1
-I0906 13:34:01.357985  8300 net.cpp:411] pool1 <- norm1
-I0906 13:34:01.357996  8300 net.cpp:369] pool1 -> pool1
-I0906 13:34:01.358010  8300 net.cpp:121] Setting up pool1
-I0906 13:34:01.358038  8300 net.cpp:128] Top shape: 100 96 27 27 (6998400)
-I0906 13:34:01.358042  8300 net.cpp:134] Memory required for data: 438308800
-I0906 13:34:01.358047  8300 layer_factory.hpp:74] Creating layer conv2
-I0906 13:34:01.358060  8300 net.cpp:91] Creating Layer conv2
-I0906 13:34:01.358067  8300 net.cpp:411] conv2 <- pool1
-I0906 13:34:01.358079  8300 net.cpp:369] conv2 -> conv2
-I0906 13:34:01.358091  8300 net.cpp:121] Setting up conv2
-I0906 13:34:01.397493  8300 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:34:01.397511  8300 net.cpp:134] Memory required for data: 512958400
-I0906 13:34:01.397541  8300 layer_factory.hpp:74] Creating layer relu2
-I0906 13:34:01.397567  8300 net.cpp:91] Creating Layer relu2
-I0906 13:34:01.397578  8300 net.cpp:411] relu2 <- conv2
-I0906 13:34:01.397599  8300 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:34:01.397613  8300 net.cpp:121] Setting up relu2
-I0906 13:34:01.397621  8300 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:34:01.397626  8300 net.cpp:134] Memory required for data: 587608000
-I0906 13:34:01.397631  8300 layer_factory.hpp:74] Creating layer norm2
-I0906 13:34:01.397649  8300 net.cpp:91] Creating Layer norm2
-I0906 13:34:01.397655  8300 net.cpp:411] norm2 <- conv2
-I0906 13:34:01.397667  8300 net.cpp:369] norm2 -> norm2
-I0906 13:34:01.397680  8300 net.cpp:121] Setting up norm2
-I0906 13:34:01.397699  8300 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:34:01.397704  8300 net.cpp:134] Memory required for data: 662257600
-I0906 13:34:01.397709  8300 layer_factory.hpp:74] Creating layer pool2
-I0906 13:34:01.397729  8300 net.cpp:91] Creating Layer pool2
-I0906 13:34:01.397735  8300 net.cpp:411] pool2 <- norm2
-I0906 13:34:01.397748  8300 net.cpp:369] pool2 -> pool2
-I0906 13:34:01.397758  8300 net.cpp:121] Setting up pool2
-I0906 13:34:01.397776  8300 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:34:01.397780  8300 net.cpp:134] Memory required for data: 679563200
-I0906 13:34:01.397830  8300 layer_factory.hpp:74] Creating layer conv3
-I0906 13:34:01.397851  8300 net.cpp:91] Creating Layer conv3
-I0906 13:34:01.397857  8300 net.cpp:411] conv3 <- pool2
-I0906 13:34:01.397871  8300 net.cpp:369] conv3 -> conv3
-I0906 13:34:01.397886  8300 net.cpp:121] Setting up conv3
-I0906 13:34:01.513005  8300 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:34:01.513030  8300 net.cpp:134] Memory required for data: 705521600
-I0906 13:34:01.513072  8300 layer_factory.hpp:74] Creating layer relu3
-I0906 13:34:01.513104  8300 net.cpp:91] Creating Layer relu3
-I0906 13:34:01.513120  8300 net.cpp:411] relu3 <- conv3
-I0906 13:34:01.513149  8300 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:34:01.513164  8300 net.cpp:121] Setting up relu3
-I0906 13:34:01.513173  8300 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:34:01.513177  8300 net.cpp:134] Memory required for data: 731480000
-I0906 13:34:01.513182  8300 layer_factory.hpp:74] Creating layer conv4
-I0906 13:34:01.513208  8300 net.cpp:91] Creating Layer conv4
-I0906 13:34:01.513214  8300 net.cpp:411] conv4 <- conv3
-I0906 13:34:01.513229  8300 net.cpp:369] conv4 -> conv4
-I0906 13:34:01.513244  8300 net.cpp:121] Setting up conv4
-I0906 13:34:01.539248  8304 data_layer.cpp:120] Prefetch batch: 186 ms.
-I0906 13:34:01.539295  8304 data_layer.cpp:121]      Read time: 22.695 ms.
-I0906 13:34:01.539304  8304 data_layer.cpp:122] Transform time: 161.707 ms.
-I0906 13:34:01.598980  8300 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:34:01.599004  8300 net.cpp:134] Memory required for data: 757438400
-I0906 13:34:01.599028  8300 layer_factory.hpp:74] Creating layer relu4
-I0906 13:34:01.599059  8300 net.cpp:91] Creating Layer relu4
-I0906 13:34:01.599074  8300 net.cpp:411] relu4 <- conv4
-I0906 13:34:01.599100  8300 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:34:01.599117  8300 net.cpp:121] Setting up relu4
-I0906 13:34:01.599125  8300 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:34:01.599129  8300 net.cpp:134] Memory required for data: 783396800
-I0906 13:34:01.599134  8300 layer_factory.hpp:74] Creating layer conv5
-I0906 13:34:01.599158  8300 net.cpp:91] Creating Layer conv5
-I0906 13:34:01.599164  8300 net.cpp:411] conv5 <- conv4
-I0906 13:34:01.599177  8300 net.cpp:369] conv5 -> conv5
-I0906 13:34:01.599191  8300 net.cpp:121] Setting up conv5
-I0906 13:34:01.658185  8300 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:34:01.658205  8300 net.cpp:134] Memory required for data: 800702400
-I0906 13:34:01.658242  8300 layer_factory.hpp:74] Creating layer relu5
-I0906 13:34:01.658269  8300 net.cpp:91] Creating Layer relu5
-I0906 13:34:01.658283  8300 net.cpp:411] relu5 <- conv5
-I0906 13:34:01.658308  8300 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:34:01.658321  8300 net.cpp:121] Setting up relu5
-I0906 13:34:01.658330  8300 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:34:01.658334  8300 net.cpp:134] Memory required for data: 818008000
-I0906 13:34:01.658339  8300 layer_factory.hpp:74] Creating layer pool5
-I0906 13:34:01.658357  8300 net.cpp:91] Creating Layer pool5
-I0906 13:34:01.658362  8300 net.cpp:411] pool5 <- conv5
-I0906 13:34:01.658375  8300 net.cpp:369] pool5 -> pool5
-I0906 13:34:01.658390  8300 net.cpp:121] Setting up pool5
-I0906 13:34:01.658407  8300 net.cpp:128] Top shape: 100 256 6 6 (921600)
-I0906 13:34:01.658412  8300 net.cpp:134] Memory required for data: 821694400
-I0906 13:34:01.658416  8300 layer_factory.hpp:74] Creating layer fc6
-I0906 13:34:01.658447  8300 net.cpp:91] Creating Layer fc6
-I0906 13:34:01.658453  8300 net.cpp:411] fc6 <- pool5
-I0906 13:34:01.658466  8300 net.cpp:369] fc6 -> fc6
-I0906 13:34:01.658480  8300 net.cpp:121] Setting up fc6
-I0906 13:34:06.571331  8300 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:34:06.571354  8300 net.cpp:134] Memory required for data: 823332800
-I0906 13:34:06.571382  8300 layer_factory.hpp:74] Creating layer relu6
-I0906 13:34:06.571415  8300 net.cpp:91] Creating Layer relu6
-I0906 13:34:06.571430  8300 net.cpp:411] relu6 <- fc6
-I0906 13:34:06.571456  8300 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:34:06.571521  8300 net.cpp:121] Setting up relu6
-I0906 13:34:06.571529  8300 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:34:06.571533  8300 net.cpp:134] Memory required for data: 824971200
-I0906 13:34:06.571538  8300 layer_factory.hpp:74] Creating layer fc7
-I0906 13:34:06.571558  8300 net.cpp:91] Creating Layer fc7
-I0906 13:34:06.571563  8300 net.cpp:411] fc7 <- fc6
-I0906 13:34:06.571578  8300 net.cpp:369] fc7 -> fc7
-I0906 13:34:06.571593  8300 net.cpp:121] Setting up fc7
-I0906 13:34:08.751106  8300 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:34:08.751129  8300 net.cpp:134] Memory required for data: 826609600
-I0906 13:34:08.751155  8300 layer_factory.hpp:74] Creating layer relu7
-I0906 13:34:08.751186  8300 net.cpp:91] Creating Layer relu7
-I0906 13:34:08.751202  8300 net.cpp:411] relu7 <- fc7
-I0906 13:34:08.751229  8300 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:34:08.751243  8300 net.cpp:121] Setting up relu7
-I0906 13:34:08.751251  8300 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:34:08.751255  8300 net.cpp:134] Memory required for data: 828248000
-I0906 13:34:08.751260  8300 layer_factory.hpp:74] Creating layer fc8
-I0906 13:34:08.751281  8300 net.cpp:91] Creating Layer fc8
-I0906 13:34:08.751286  8300 net.cpp:411] fc8 <- fc7
-I0906 13:34:08.751301  8300 net.cpp:369] fc8 -> fc8
-I0906 13:34:08.751315  8300 net.cpp:121] Setting up fc8
-I0906 13:34:09.287158  8300 net.cpp:128] Top shape: 100 1000 (100000)
-I0906 13:34:09.287181  8300 net.cpp:134] Memory required for data: 828648000
-I0906 13:34:09.287209  8300 layer_factory.hpp:74] Creating layer loss
-I0906 13:34:09.287257  8300 net.cpp:91] Creating Layer loss
-I0906 13:34:09.287272  8300 net.cpp:411] loss <- fc8
-I0906 13:34:09.287295  8300 net.cpp:411] loss <- label
-I0906 13:34:09.287313  8300 net.cpp:369] loss -> loss
-I0906 13:34:09.287333  8300 net.cpp:121] Setting up loss
-I0906 13:34:09.287349  8300 layer_factory.hpp:74] Creating layer loss
-I0906 13:34:09.287860  8300 net.cpp:128] Top shape: (1)
-I0906 13:34:09.287865  8300 net.cpp:130]     with loss weight 1
-I0906 13:34:09.287881  8300 net.cpp:134] Memory required for data: 828648004
-I0906 13:34:09.287890  8300 net.cpp:193] loss needs backward computation.
-I0906 13:34:09.287899  8300 net.cpp:193] fc8 needs backward computation.
-I0906 13:34:09.287904  8300 net.cpp:193] relu7 needs backward computation.
-I0906 13:34:09.287910  8300 net.cpp:193] fc7 needs backward computation.
-I0906 13:34:09.287916  8300 net.cpp:193] relu6 needs backward computation.
-I0906 13:34:09.287921  8300 net.cpp:193] fc6 needs backward computation.
-I0906 13:34:09.287935  8300 net.cpp:193] pool5 needs backward computation.
-I0906 13:34:09.287940  8300 net.cpp:193] relu5 needs backward computation.
-I0906 13:34:09.287946  8300 net.cpp:193] conv5 needs backward computation.
-I0906 13:34:09.287952  8300 net.cpp:193] relu4 needs backward computation.
-I0906 13:34:09.287958  8300 net.cpp:193] conv4 needs backward computation.
-I0906 13:34:09.287964  8300 net.cpp:193] relu3 needs backward computation.
-I0906 13:34:09.287969  8300 net.cpp:193] conv3 needs backward computation.
-I0906 13:34:09.287977  8300 net.cpp:193] pool2 needs backward computation.
-I0906 13:34:09.287983  8300 net.cpp:193] norm2 needs backward computation.
-I0906 13:34:09.287989  8300 net.cpp:193] relu2 needs backward computation.
-I0906 13:34:09.287996  8300 net.cpp:193] conv2 needs backward computation.
-I0906 13:34:09.288002  8300 net.cpp:193] pool1 needs backward computation.
-I0906 13:34:09.288007  8300 net.cpp:193] norm1 needs backward computation.
-I0906 13:34:09.288014  8300 net.cpp:193] relu1 needs backward computation.
-I0906 13:34:09.288019  8300 net.cpp:193] conv1 needs backward computation.
-I0906 13:34:09.288028  8300 net.cpp:195] data does not need backward computation.
-I0906 13:34:09.288034  8300 net.cpp:236] This network produces output loss
-I0906 13:34:09.288067  8300 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:34:09.288084  8300 net.cpp:248] Network initialization done.
-I0906 13:34:09.288087  8300 net.cpp:249] Memory required for data: 828648004
-I0906 13:34:09.289022  8300 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
-I0906 13:34:09.289130  8300 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
-I0906 13:34:09.289348  8300 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TEST
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TEST
-  }
-  transform_param {
-    mirror: false
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
-    batch_size: 50
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "accuracy"
-  type: "Accuracy"
-  bottom: "fc8"
-  bottom: "label"
-  top: "accuracy"
-  include {
-    phase: TEST
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:34:09.289656  8300 net.cpp:68] Memory required for data: 0
-I0906 13:34:09.289702  8300 layer_factory.hpp:74] Creating layer data
-I0906 13:34:09.289721  8300 net.cpp:91] Creating Layer data
-I0906 13:34:09.289731  8300 net.cpp:369] data -> data
-I0906 13:34:09.289752  8300 net.cpp:369] data -> label
-I0906 13:34:09.289764  8300 net.cpp:121] Setting up data
-I0906 13:34:09.289772  8300 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:34:09.298058  8300 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
-I0906 13:34:09.298318  8300 data_layer.cpp:53] output data size: 50,3,227,227
-I0906 13:34:09.314699  8300 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:34:09.314806  8300 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:34:09.314834  8300 net.cpp:128] Top shape: 50 3 227 227 (7729350)
-I0906 13:34:09.314843  8300 net.cpp:128] Top shape: 50 (50)
-I0906 13:34:09.314848  8300 net.cpp:134] Memory required for data: 30917600
-I0906 13:34:09.314882  8300 layer_factory.hpp:74] Creating layer label_data_1_split
-I0906 13:34:09.314973  8300 net.cpp:91] Creating Layer label_data_1_split
-I0906 13:34:09.314997  8300 net.cpp:411] label_data_1_split <- label
-I0906 13:34:09.315035  8300 net.cpp:369] label_data_1_split -> label_data_1_split_0
-I0906 13:34:09.315073  8300 net.cpp:369] label_data_1_split -> label_data_1_split_1
-I0906 13:34:09.315085  8300 net.cpp:121] Setting up label_data_1_split
-I0906 13:34:09.315116  8300 net.cpp:128] Top shape: 50 (50)
-I0906 13:34:09.315124  8300 net.cpp:128] Top shape: 50 (50)
-I0906 13:34:09.315127  8300 net.cpp:134] Memory required for data: 30918000
-I0906 13:34:09.315131  8300 layer_factory.hpp:74] Creating layer conv1
-I0906 13:34:09.315165  8300 net.cpp:91] Creating Layer conv1
-I0906 13:34:09.315171  8300 net.cpp:411] conv1 <- data
-I0906 13:34:09.315183  8300 net.cpp:369] conv1 -> conv1
-I0906 13:34:09.315198  8300 net.cpp:121] Setting up conv1
-I0906 13:34:09.319859  8300 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:34:09.319864  8300 net.cpp:134] Memory required for data: 88998000
-I0906 13:34:09.319883  8300 layer_factory.hpp:74] Creating layer relu1
-I0906 13:34:09.319895  8300 net.cpp:91] Creating Layer relu1
-I0906 13:34:09.319901  8300 net.cpp:411] relu1 <- conv1
-I0906 13:34:09.319913  8300 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:34:09.319926  8300 net.cpp:121] Setting up relu1
-I0906 13:34:09.319933  8300 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:34:09.319937  8300 net.cpp:134] Memory required for data: 147078000
-I0906 13:34:09.319942  8300 layer_factory.hpp:74] Creating layer norm1
-I0906 13:34:09.319962  8300 net.cpp:91] Creating Layer norm1
-I0906 13:34:09.319968  8300 net.cpp:411] norm1 <- conv1
-I0906 13:34:09.319980  8300 net.cpp:369] norm1 -> norm1
-I0906 13:34:09.319991  8300 net.cpp:121] Setting up norm1
-I0906 13:34:09.320009  8300 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:34:09.320053  8300 net.cpp:134] Memory required for data: 205158000
-I0906 13:34:09.320060  8300 layer_factory.hpp:74] Creating layer pool1
-I0906 13:34:09.320075  8300 net.cpp:91] Creating Layer pool1
-I0906 13:34:09.320081  8300 net.cpp:411] pool1 <- norm1
-I0906 13:34:09.320093  8300 net.cpp:369] pool1 -> pool1
-I0906 13:34:09.320103  8300 net.cpp:121] Setting up pool1
-I0906 13:34:09.320122  8300 net.cpp:128] Top shape: 50 96 27 27 (3499200)
-I0906 13:34:09.320125  8300 net.cpp:134] Memory required for data: 219154800
-I0906 13:34:09.320130  8300 layer_factory.hpp:74] Creating layer conv2
-I0906 13:34:09.320143  8300 net.cpp:91] Creating Layer conv2
-I0906 13:34:09.320149  8300 net.cpp:411] conv2 <- pool1
-I0906 13:34:09.320163  8300 net.cpp:369] conv2 -> conv2
-I0906 13:34:09.320174  8300 net.cpp:121] Setting up conv2
-I0906 13:34:09.359275  8300 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:34:09.359290  8300 net.cpp:134] Memory required for data: 256479600
-I0906 13:34:09.359316  8300 layer_factory.hpp:74] Creating layer relu2
-I0906 13:34:09.359336  8300 net.cpp:91] Creating Layer relu2
-I0906 13:34:09.359346  8300 net.cpp:411] relu2 <- conv2
-I0906 13:34:09.359365  8300 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:34:09.359395  8300 net.cpp:121] Setting up relu2
-I0906 13:34:09.359403  8300 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:34:09.359407  8300 net.cpp:134] Memory required for data: 293804400
-I0906 13:34:09.359412  8300 layer_factory.hpp:74] Creating layer norm2
-I0906 13:34:09.359433  8300 net.cpp:91] Creating Layer norm2
-I0906 13:34:09.359438  8300 net.cpp:411] norm2 <- conv2
-I0906 13:34:09.359452  8300 net.cpp:369] norm2 -> norm2
-I0906 13:34:09.359467  8300 net.cpp:121] Setting up norm2
-I0906 13:34:09.359486  8300 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:34:09.359490  8300 net.cpp:134] Memory required for data: 331129200
-I0906 13:34:09.359495  8300 layer_factory.hpp:74] Creating layer pool2
-I0906 13:34:09.359508  8300 net.cpp:91] Creating Layer pool2
-I0906 13:34:09.359514  8300 net.cpp:411] pool2 <- norm2
-I0906 13:34:09.359526  8300 net.cpp:369] pool2 -> pool2
-I0906 13:34:09.359537  8300 net.cpp:121] Setting up pool2
-I0906 13:34:09.359555  8300 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:34:09.359558  8300 net.cpp:134] Memory required for data: 339782000
-I0906 13:34:09.359563  8300 layer_factory.hpp:74] Creating layer conv3
-I0906 13:34:09.359581  8300 net.cpp:91] Creating Layer conv3
-I0906 13:34:09.359587  8300 net.cpp:411] conv3 <- pool2
-I0906 13:34:09.359601  8300 net.cpp:369] conv3 -> conv3
-I0906 13:34:09.359613  8300 net.cpp:121] Setting up conv3
-I0906 13:34:09.410833  8305 data_layer.cpp:120] Prefetch batch: 95 ms.
-I0906 13:34:09.410863  8305 data_layer.cpp:121]      Read time: 11.984 ms.
-I0906 13:34:09.410871  8305 data_layer.cpp:122] Transform time: 82.885 ms.
-I0906 13:34:09.474556  8300 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:34:09.474578  8300 net.cpp:134] Memory required for data: 352761200
-I0906 13:34:09.474618  8300 layer_factory.hpp:74] Creating layer relu3
-I0906 13:34:09.474648  8300 net.cpp:91] Creating Layer relu3
-I0906 13:34:09.474663  8300 net.cpp:411] relu3 <- conv3
-I0906 13:34:09.474689  8300 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:34:09.474704  8300 net.cpp:121] Setting up relu3
-I0906 13:34:09.474714  8300 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:34:09.474717  8300 net.cpp:134] Memory required for data: 365740400
-I0906 13:34:09.474721  8300 layer_factory.hpp:74] Creating layer conv4
-I0906 13:34:09.474745  8300 net.cpp:91] Creating Layer conv4
-I0906 13:34:09.474751  8300 net.cpp:411] conv4 <- conv3
-I0906 13:34:09.474766  8300 net.cpp:369] conv4 -> conv4
-I0906 13:34:09.474781  8300 net.cpp:121] Setting up conv4
-I0906 13:34:09.562909  8300 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:34:09.562930  8300 net.cpp:134] Memory required for data: 378719600
-I0906 13:34:09.562957  8300 layer_factory.hpp:74] Creating layer relu4
-I0906 13:34:09.562988  8300 net.cpp:91] Creating Layer relu4
-I0906 13:34:09.563051  8300 net.cpp:411] relu4 <- conv4
-I0906 13:34:09.563086  8300 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:34:09.563102  8300 net.cpp:121] Setting up relu4
-I0906 13:34:09.563112  8300 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:34:09.563117  8300 net.cpp:134] Memory required for data: 391698800
-I0906 13:34:09.563122  8300 layer_factory.hpp:74] Creating layer conv5
-I0906 13:34:09.563146  8300 net.cpp:91] Creating Layer conv5
-I0906 13:34:09.563153  8300 net.cpp:411] conv5 <- conv4
-I0906 13:34:09.563168  8300 net.cpp:369] conv5 -> conv5
-I0906 13:34:09.563182  8300 net.cpp:121] Setting up conv5
-I0906 13:34:09.619202  8300 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:34:09.619220  8300 net.cpp:134] Memory required for data: 400351600
-I0906 13:34:09.619256  8300 layer_factory.hpp:74] Creating layer relu5
-I0906 13:34:09.619284  8300 net.cpp:91] Creating Layer relu5
-I0906 13:34:09.619298  8300 net.cpp:411] relu5 <- conv5
-I0906 13:34:09.619321  8300 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:34:09.619336  8300 net.cpp:121] Setting up relu5
-I0906 13:34:09.619344  8300 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:34:09.619349  8300 net.cpp:134] Memory required for data: 409004400
-I0906 13:34:09.619354  8300 layer_factory.hpp:74] Creating layer pool5
-I0906 13:34:09.619380  8300 net.cpp:91] Creating Layer pool5
-I0906 13:34:09.619386  8300 net.cpp:411] pool5 <- conv5
-I0906 13:34:09.619398  8300 net.cpp:369] pool5 -> pool5
-I0906 13:34:09.619411  8300 net.cpp:121] Setting up pool5
-I0906 13:34:09.619431  8300 net.cpp:128] Top shape: 50 256 6 6 (460800)
-I0906 13:34:09.619434  8300 net.cpp:134] Memory required for data: 410847600
-I0906 13:34:09.619439  8300 layer_factory.hpp:74] Creating layer fc6
-I0906 13:34:09.619457  8300 net.cpp:91] Creating Layer fc6
-I0906 13:34:09.619463  8300 net.cpp:411] fc6 <- pool5
-I0906 13:34:09.619477  8300 net.cpp:369] fc6 -> fc6
-I0906 13:34:09.619488  8300 net.cpp:121] Setting up fc6
-I0906 13:34:15.320122  8300 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:34:15.320147  8300 net.cpp:134] Memory required for data: 411666800
-I0906 13:34:15.320174  8300 layer_factory.hpp:74] Creating layer relu6
-I0906 13:34:15.320206  8300 net.cpp:91] Creating Layer relu6
-I0906 13:34:15.320222  8300 net.cpp:411] relu6 <- fc6
-I0906 13:34:15.320248  8300 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:34:15.320263  8300 net.cpp:121] Setting up relu6
-I0906 13:34:15.320272  8300 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:34:15.320276  8300 net.cpp:134] Memory required for data: 412486000
-I0906 13:34:15.320281  8300 layer_factory.hpp:74] Creating layer fc7
-I0906 13:34:15.320302  8300 net.cpp:91] Creating Layer fc7
-I0906 13:34:15.320308  8300 net.cpp:411] fc7 <- fc6
-I0906 13:34:15.320322  8300 net.cpp:369] fc7 -> fc7
-I0906 13:34:15.320338  8300 net.cpp:121] Setting up fc7
-I0906 13:34:17.700968  8300 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:34:17.700994  8300 net.cpp:134] Memory required for data: 413305200
-I0906 13:34:17.701020  8300 layer_factory.hpp:74] Creating layer relu7
-I0906 13:34:17.701052  8300 net.cpp:91] Creating Layer relu7
-I0906 13:34:17.701067  8300 net.cpp:411] relu7 <- fc7
-I0906 13:34:17.701093  8300 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:34:17.701109  8300 net.cpp:121] Setting up relu7
-I0906 13:34:17.701117  8300 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:34:17.701122  8300 net.cpp:134] Memory required for data: 414124400
-I0906 13:34:17.701125  8300 layer_factory.hpp:74] Creating layer fc8
-I0906 13:34:17.701146  8300 net.cpp:91] Creating Layer fc8
-I0906 13:34:17.701153  8300 net.cpp:411] fc8 <- fc7
-I0906 13:34:17.701166  8300 net.cpp:369] fc8 -> fc8
-I0906 13:34:17.701191  8300 net.cpp:121] Setting up fc8
-I0906 13:34:18.224659  8300 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:34:18.224681  8300 net.cpp:134] Memory required for data: 414324400
-I0906 13:34:18.224707  8300 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
-I0906 13:34:18.224737  8300 net.cpp:91] Creating Layer fc8_fc8_0_split
-I0906 13:34:18.224798  8300 net.cpp:411] fc8_fc8_0_split <- fc8
-I0906 13:34:18.224828  8300 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
-I0906 13:34:18.224848  8300 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
-I0906 13:34:18.224860  8300 net.cpp:121] Setting up fc8_fc8_0_split
-I0906 13:34:18.224876  8300 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:34:18.224882  8300 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:34:18.224886  8300 net.cpp:134] Memory required for data: 414724400
-I0906 13:34:18.224891  8300 layer_factory.hpp:74] Creating layer accuracy
-I0906 13:34:18.224922  8300 net.cpp:91] Creating Layer accuracy
-I0906 13:34:18.224927  8300 net.cpp:411] accuracy <- fc8_fc8_0_split_0
-I0906 13:34:18.224938  8300 net.cpp:411] accuracy <- label_data_1_split_0
-I0906 13:34:18.224949  8300 net.cpp:369] accuracy -> accuracy
-I0906 13:34:18.224961  8300 net.cpp:121] Setting up accuracy
-I0906 13:34:18.224977  8300 net.cpp:128] Top shape: (1)
-I0906 13:34:18.224980  8300 net.cpp:134] Memory required for data: 414724404
-I0906 13:34:18.224985  8300 layer_factory.hpp:74] Creating layer loss
-I0906 13:34:18.224997  8300 net.cpp:91] Creating Layer loss
-I0906 13:34:18.225003  8300 net.cpp:411] loss <- fc8_fc8_0_split_1
-I0906 13:34:18.225013  8300 net.cpp:411] loss <- label_data_1_split_1
-I0906 13:34:18.225023  8300 net.cpp:369] loss -> loss
-I0906 13:34:18.225033  8300 net.cpp:121] Setting up loss
-I0906 13:34:18.225044  8300 layer_factory.hpp:74] Creating layer loss
-I0906 13:34:18.225343  8300 net.cpp:128] Top shape: (1)
-I0906 13:34:18.225348  8300 net.cpp:130]     with loss weight 1
-I0906 13:34:18.225364  8300 net.cpp:134] Memory required for data: 414724408
-I0906 13:34:18.225371  8300 net.cpp:193] loss needs backward computation.
-I0906 13:34:18.225378  8300 net.cpp:195] accuracy does not need backward computation.
-I0906 13:34:18.225386  8300 net.cpp:193] fc8_fc8_0_split needs backward computation.
-I0906 13:34:18.225391  8300 net.cpp:193] fc8 needs backward computation.
-I0906 13:34:18.225397  8300 net.cpp:193] relu7 needs backward computation.
-I0906 13:34:18.225404  8300 net.cpp:193] fc7 needs backward computation.
-I0906 13:34:18.225409  8300 net.cpp:193] relu6 needs backward computation.
-I0906 13:34:18.225414  8300 net.cpp:193] fc6 needs backward computation.
-I0906 13:34:18.225420  8300 net.cpp:193] pool5 needs backward computation.
-I0906 13:34:18.225426  8300 net.cpp:193] relu5 needs backward computation.
-I0906 13:34:18.225431  8300 net.cpp:193] conv5 needs backward computation.
-I0906 13:34:18.225438  8300 net.cpp:193] relu4 needs backward computation.
-I0906 13:34:18.225443  8300 net.cpp:193] conv4 needs backward computation.
-I0906 13:34:18.225450  8300 net.cpp:193] relu3 needs backward computation.
-I0906 13:34:18.225455  8300 net.cpp:193] conv3 needs backward computation.
-I0906 13:34:18.225461  8300 net.cpp:193] pool2 needs backward computation.
-I0906 13:34:18.225466  8300 net.cpp:193] norm2 needs backward computation.
-I0906 13:34:18.225472  8300 net.cpp:193] relu2 needs backward computation.
-I0906 13:34:18.225477  8300 net.cpp:193] conv2 needs backward computation.
-I0906 13:34:18.225484  8300 net.cpp:193] pool1 needs backward computation.
-I0906 13:34:18.225491  8300 net.cpp:193] norm1 needs backward computation.
-I0906 13:34:18.225496  8300 net.cpp:193] relu1 needs backward computation.
-I0906 13:34:18.225502  8300 net.cpp:193] conv1 needs backward computation.
-I0906 13:34:18.225508  8300 net.cpp:195] label_data_1_split does not need backward computation.
-I0906 13:34:18.225515  8300 net.cpp:195] data does not need backward computation.
-I0906 13:34:18.225520  8300 net.cpp:236] This network produces output accuracy
-I0906 13:34:18.225527  8300 net.cpp:236] This network produces output loss
-I0906 13:34:18.225561  8300 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:34:18.225574  8300 net.cpp:248] Network initialization done.
-I0906 13:34:18.225579  8300 net.cpp:249] Memory required for data: 414724408
-I0906 13:34:18.225764  8300 solver.cpp:53] Solver scaffolding done.
-I0906 13:34:18.225879  8300 solver.cpp:270] Solving AlexNet
-I0906 13:34:18.225898  8300 solver.cpp:271] Learning Rate Policy: step
-I0906 13:34:18.227551  8300 solver.cpp:314] Iteration 0, Testing net (#0)
-I0906 13:34:18.227571  8300 net.cpp:696] Copying source layer data
-I0906 13:34:18.227577  8300 net.cpp:696] Copying source layer conv1
-I0906 13:34:18.230358  8300 net.cpp:696] Copying source layer relu1
-I0906 13:34:18.230398  8300 net.cpp:696] Copying source layer norm1
-I0906 13:34:18.230409  8300 net.cpp:696] Copying source layer pool1
-I0906 13:34:18.230419  8300 net.cpp:696] Copying source layer conv2
-I0906 13:34:18.230605  8300 net.cpp:696] Copying source layer relu2
-I0906 13:34:18.230624  8300 net.cpp:696] Copying source layer norm2
-I0906 13:34:18.230634  8300 net.cpp:696] Copying source layer pool2
-I0906 13:34:18.230644  8300 net.cpp:696] Copying source layer conv3
-I0906 13:34:18.231482  8300 net.cpp:696] Copying source layer relu3
-I0906 13:34:18.231510  8300 net.cpp:696] Copying source layer conv4
-I0906 13:34:18.232178  8300 net.cpp:696] Copying source layer relu4
-I0906 13:34:18.232195  8300 net.cpp:696] Copying source layer conv5
-I0906 13:34:18.232681  8300 net.cpp:696] Copying source layer relu5
-I0906 13:34:18.232697  8300 net.cpp:696] Copying source layer pool5
-I0906 13:34:18.232708  8300 net.cpp:696] Copying source layer fc6
-I0906 13:34:18.250728  8300 net.cpp:696] Copying source layer relu6
-I0906 13:34:18.250753  8300 net.cpp:696] Copying source layer fc7
-I0906 13:34:18.257216  8300 net.cpp:696] Copying source layer relu7
-I0906 13:34:18.257241  8300 net.cpp:696] Copying source layer fc8
-I0906 13:34:18.258977  8300 net.cpp:696] Copying source layer loss
-I0906 13:34:18.259091  8300 base_data_layer.cpp:89] Thread joined
-I0906 13:34:18.263509  8300 base_data_layer.cpp:93] Prefetch copied
-I0906 13:34:18.263875  8300 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:34:18.362475  8306 data_layer.cpp:120] Prefetch batch: 98 ms.
-I0906 13:34:18.362507  8306 data_layer.cpp:121]      Read time: 12.694 ms.
-I0906 13:34:18.362515  8306 data_layer.cpp:122] Transform time: 84.611 ms.
-I0906 13:34:21.291707  8300 solver.cpp:363]     Test net output #0: accuracy = 0
-I0906 13:34:21.291733  8300 solver.cpp:363]     Test net output #1: loss = 6.91228 (* 1 = 6.91228 loss)
-I0906 13:34:21.291775  8300 base_data_layer.cpp:89] Thread joined
-I0906 13:34:21.300678  8300 base_data_layer.cpp:93] Prefetch copied
-I0906 13:34:21.301050  8300 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:34:21.491194  8310 data_layer.cpp:120] Prefetch batch: 189 ms.
-I0906 13:34:21.491225  8310 data_layer.cpp:121]      Read time: 24.533 ms.
-I0906 13:34:21.491231  8310 data_layer.cpp:122] Transform time: 163.65 ms.
-I0906 13:34:28.088075  8300 solver.cpp:234] Iteration 0, loss = 0
-I0906 13:34:28.088134  8300 solver.cpp:249]     Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss)
-I0906 13:34:28.088184  8300 solver.cpp:506] Iteration 0, lr = 0.01
-I0906 13:34:28.203598  8300 base_data_layer.cpp:89] Thread joined
-I0906 13:34:28.212023  8300 base_data_layer.cpp:93] Prefetch copied
-I0906 13:34:28.212162  8300 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:34:28.397155  8312 data_layer.cpp:120] Prefetch batch: 184 ms.
-I0906 13:34:28.397193  8312 data_layer.cpp:121]      Read time: 23.16 ms.
-I0906 13:34:28.397200  8312 data_layer.cpp:122] Transform time: 159.902 ms.
-I0906 13:34:30.978493  8300 solver.cpp:234] Iteration 1, loss = 0
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316
deleted file mode 100644
index 93afd4cf..00000000
--- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-133437.8316
+++ /dev/null
@@ -1,1208 +0,0 @@
-Log file created at: 2015/09/06 13:34:37
-Running on machine: AMD-RESEARCH
-Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
-I0906 13:34:37.585557  8316 caffe.cpp:114] Use GPU with device ID 0
-I0906 13:34:37.621670  8316 device.cpp:230] Number of platforms found:1
-I0906 13:34:37.621708  8316 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
-I0906 13:34:37.621721  8316 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
-I0906 13:34:37.621724  8316 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
-I0906 13:34:37.621728  8316 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
-I0906 13:34:37.621732  8316 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
-I0906 13:34:37.621739  8316 device.cpp:286] Number of devices found:1
-I0906 13:34:37.621743  8316 device.cpp:288] 	DeviceID:	0x22ed250
-I0906 13:34:37.621760  8316 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
-I0906 13:34:37.621767  8316 device.cpp:393] 	Is it integrated GPU?:	0
-I0906 13:34:37.621772  8316 device.cpp:393] 	Max clock frequency MHz:	930
-I0906 13:34:37.621775  8316 device.cpp:393] 	Host-Device unified mem:	0
-I0906 13:34:37.621779  8316 device.cpp:393] 	ECC support:	0
-I0906 13:34:37.621783  8316 device.cpp:393] 	Endian little:	1
-I0906 13:34:37.621788  8316 device.cpp:393] 	Max compute units:	44
-I0906 13:34:37.621791  8316 device.cpp:393] 	Max work group size:	256
-I0906 13:34:37.621796  8316 device.cpp:393] 	Max work item dimensions:	3
-I0906 13:34:37.621801  8316 device.cpp:393] 	Max work item sizes:	0x100
-I0906 13:34:37.621806  8316 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
-I0906 13:34:37.621811  8316 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
-I0906 13:34:37.621815  8316 device.cpp:393] 	Max mem alloc size:	4244635648
-I0906 13:34:37.621819  8316 device.cpp:393] 	Global mem size:	16878927872
-I0906 13:34:37.621822  8316 device.cpp:393] 	Local mem size:	32768
-I0906 13:34:37.621830  8316 device.cpp:96] Picked device type : GPU 0
-I0906 13:34:40.036291  8316 device.cpp:152] Build Program
-I0906 13:34:40.036520  8316 caffe.cpp:122] Starting Optimization
-I0906 13:34:40.036612  8316 solver.cpp:40] Initializing solver from parameters: 
-test_iter: 1
-test_interval: 1000
-base_lr: 0.01
-display: 1
-max_iter: 10
-lr_policy: "step"
-gamma: 0.1
-momentum: 0.9
-weight_decay: 0.0005
-stepsize: 100000
-snapshot: 10000
-snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
-solver_mode: GPU
-net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
-I0906 13:34:40.036731  8316 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
-I0906 13:34:40.037874  8316 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
-I0906 13:34:40.037925  8316 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
-I0906 13:34:40.038099  8316 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TRAIN
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TRAIN
-  }
-  transform_param {
-    mirror: true
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
-    batch_size: 100
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:34:40.038537  8316 net.cpp:68] Memory required for data: 0
-I0906 13:34:40.038749  8316 layer_factory.hpp:74] Creating layer data
-I0906 13:34:40.038802  8316 net.cpp:91] Creating Layer data
-I0906 13:34:40.038825  8316 net.cpp:369] data -> data
-I0906 13:34:40.038928  8316 net.cpp:369] data -> label
-I0906 13:34:40.038950  8316 net.cpp:121] Setting up data
-I0906 13:34:40.038962  8316 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:34:40.048738  8316 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
-I0906 13:34:40.049080  8316 data_layer.cpp:53] output data size: 100,3,227,227
-I0906 13:34:40.081225  8316 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:34:40.081426  8316 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:34:40.081490  8316 net.cpp:128] Top shape: 100 3 227 227 (15458700)
-I0906 13:34:40.081500  8316 net.cpp:128] Top shape: 100 (100)
-I0906 13:34:40.081504  8316 net.cpp:134] Memory required for data: 61835200
-I0906 13:34:40.081537  8316 layer_factory.hpp:74] Creating layer conv1
-I0906 13:34:40.081619  8316 net.cpp:91] Creating Layer conv1
-I0906 13:34:40.081641  8316 net.cpp:411] conv1 <- data
-I0906 13:34:40.081694  8316 net.cpp:369] conv1 -> conv1
-I0906 13:34:40.081758  8316 net.cpp:121] Setting up conv1
-I0906 13:34:40.088135  8316 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:34:40.088160  8316 net.cpp:134] Memory required for data: 177995200
-I0906 13:34:40.088239  8316 layer_factory.hpp:74] Creating layer relu1
-I0906 13:34:40.088297  8316 net.cpp:91] Creating Layer relu1
-I0906 13:34:40.088315  8316 net.cpp:411] relu1 <- conv1
-I0906 13:34:40.088351  8316 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:34:40.088372  8316 net.cpp:121] Setting up relu1
-I0906 13:34:40.088385  8316 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:34:40.088390  8316 net.cpp:134] Memory required for data: 294155200
-I0906 13:34:40.088397  8316 layer_factory.hpp:74] Creating layer norm1
-I0906 13:34:40.088435  8316 net.cpp:91] Creating Layer norm1
-I0906 13:34:40.088444  8316 net.cpp:411] norm1 <- conv1
-I0906 13:34:40.088466  8316 net.cpp:369] norm1 -> norm1
-I0906 13:34:40.088486  8316 net.cpp:121] Setting up norm1
-I0906 13:34:40.088531  8316 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:34:40.088537  8316 net.cpp:134] Memory required for data: 410315200
-I0906 13:34:40.088543  8316 layer_factory.hpp:74] Creating layer pool1
-I0906 13:34:40.088580  8316 net.cpp:91] Creating Layer pool1
-I0906 13:34:40.088590  8316 net.cpp:411] pool1 <- norm1
-I0906 13:34:40.088613  8316 net.cpp:369] pool1 -> pool1
-I0906 13:34:40.088637  8316 net.cpp:121] Setting up pool1
-I0906 13:34:40.088686  8316 net.cpp:128] Top shape: 100 96 27 27 (6998400)
-I0906 13:34:40.088691  8316 net.cpp:134] Memory required for data: 438308800
-I0906 13:34:40.088701  8316 layer_factory.hpp:74] Creating layer conv2
-I0906 13:34:40.088739  8316 net.cpp:91] Creating Layer conv2
-I0906 13:34:40.088750  8316 net.cpp:411] conv2 <- pool1
-I0906 13:34:40.088783  8316 net.cpp:369] conv2 -> conv2
-I0906 13:34:40.088804  8316 net.cpp:121] Setting up conv2
-I0906 13:34:40.129534  8316 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:34:40.129550  8316 net.cpp:134] Memory required for data: 512958400
-I0906 13:34:40.129585  8316 layer_factory.hpp:74] Creating layer relu2
-I0906 13:34:40.129613  8316 net.cpp:91] Creating Layer relu2
-I0906 13:34:40.129624  8316 net.cpp:411] relu2 <- conv2
-I0906 13:34:40.129647  8316 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:34:40.129662  8316 net.cpp:121] Setting up relu2
-I0906 13:34:40.129670  8316 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:34:40.129674  8316 net.cpp:134] Memory required for data: 587608000
-I0906 13:34:40.129679  8316 layer_factory.hpp:74] Creating layer norm2
-I0906 13:34:40.129698  8316 net.cpp:91] Creating Layer norm2
-I0906 13:34:40.129703  8316 net.cpp:411] norm2 <- conv2
-I0906 13:34:40.129717  8316 net.cpp:369] norm2 -> norm2
-I0906 13:34:40.129730  8316 net.cpp:121] Setting up norm2
-I0906 13:34:40.129750  8316 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:34:40.129755  8316 net.cpp:134] Memory required for data: 662257600
-I0906 13:34:40.129760  8316 layer_factory.hpp:74] Creating layer pool2
-I0906 13:34:40.129783  8316 net.cpp:91] Creating Layer pool2
-I0906 13:34:40.129789  8316 net.cpp:411] pool2 <- norm2
-I0906 13:34:40.129802  8316 net.cpp:369] pool2 -> pool2
-I0906 13:34:40.129813  8316 net.cpp:121] Setting up pool2
-I0906 13:34:40.129832  8316 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:34:40.129837  8316 net.cpp:134] Memory required for data: 679563200
-I0906 13:34:40.129887  8316 layer_factory.hpp:74] Creating layer conv3
-I0906 13:34:40.129910  8316 net.cpp:91] Creating Layer conv3
-I0906 13:34:40.129916  8316 net.cpp:411] conv3 <- pool2
-I0906 13:34:40.129933  8316 net.cpp:369] conv3 -> conv3
-I0906 13:34:40.129948  8316 net.cpp:121] Setting up conv3
-I0906 13:34:40.246141  8316 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:34:40.246165  8316 net.cpp:134] Memory required for data: 705521600
-I0906 13:34:40.246211  8316 layer_factory.hpp:74] Creating layer relu3
-I0906 13:34:40.246247  8316 net.cpp:91] Creating Layer relu3
-I0906 13:34:40.246261  8316 net.cpp:411] relu3 <- conv3
-I0906 13:34:40.246287  8316 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:34:40.246304  8316 net.cpp:121] Setting up relu3
-I0906 13:34:40.246314  8316 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:34:40.246317  8316 net.cpp:134] Memory required for data: 731480000
-I0906 13:34:40.246322  8316 layer_factory.hpp:74] Creating layer conv4
-I0906 13:34:40.246351  8316 net.cpp:91] Creating Layer conv4
-I0906 13:34:40.246356  8316 net.cpp:411] conv4 <- conv3
-I0906 13:34:40.246372  8316 net.cpp:369] conv4 -> conv4
-I0906 13:34:40.246387  8316 net.cpp:121] Setting up conv4
-I0906 13:34:40.273671  8320 data_layer.cpp:120] Prefetch batch: 191 ms.
-I0906 13:34:40.273718  8320 data_layer.cpp:121]      Read time: 24.494 ms.
-I0906 13:34:40.273727  8320 data_layer.cpp:122] Transform time: 165.29 ms.
-I0906 13:34:40.332166  8316 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:34:40.332187  8316 net.cpp:134] Memory required for data: 757438400
-I0906 13:34:40.332214  8316 layer_factory.hpp:74] Creating layer relu4
-I0906 13:34:40.332247  8316 net.cpp:91] Creating Layer relu4
-I0906 13:34:40.332262  8316 net.cpp:411] relu4 <- conv4
-I0906 13:34:40.332288  8316 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:34:40.332304  8316 net.cpp:121] Setting up relu4
-I0906 13:34:40.332314  8316 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:34:40.332317  8316 net.cpp:134] Memory required for data: 783396800
-I0906 13:34:40.332321  8316 layer_factory.hpp:74] Creating layer conv5
-I0906 13:34:40.332350  8316 net.cpp:91] Creating Layer conv5
-I0906 13:34:40.332355  8316 net.cpp:411] conv5 <- conv4
-I0906 13:34:40.332371  8316 net.cpp:369] conv5 -> conv5
-I0906 13:34:40.332386  8316 net.cpp:121] Setting up conv5
-I0906 13:34:40.388872  8316 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:34:40.388891  8316 net.cpp:134] Memory required for data: 800702400
-I0906 13:34:40.388931  8316 layer_factory.hpp:74] Creating layer relu5
-I0906 13:34:40.388959  8316 net.cpp:91] Creating Layer relu5
-I0906 13:34:40.388972  8316 net.cpp:411] relu5 <- conv5
-I0906 13:34:40.388995  8316 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:34:40.389010  8316 net.cpp:121] Setting up relu5
-I0906 13:34:40.389019  8316 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:34:40.389024  8316 net.cpp:134] Memory required for data: 818008000
-I0906 13:34:40.389029  8316 layer_factory.hpp:74] Creating layer pool5
-I0906 13:34:40.389049  8316 net.cpp:91] Creating Layer pool5
-I0906 13:34:40.389053  8316 net.cpp:411] pool5 <- conv5
-I0906 13:34:40.389067  8316 net.cpp:369] pool5 -> pool5
-I0906 13:34:40.389081  8316 net.cpp:121] Setting up pool5
-I0906 13:34:40.389102  8316 net.cpp:128] Top shape: 100 256 6 6 (921600)
-I0906 13:34:40.389107  8316 net.cpp:134] Memory required for data: 821694400
-I0906 13:34:40.389112  8316 layer_factory.hpp:74] Creating layer fc6
-I0906 13:34:40.389147  8316 net.cpp:91] Creating Layer fc6
-I0906 13:34:40.389153  8316 net.cpp:411] fc6 <- pool5
-I0906 13:34:40.389169  8316 net.cpp:369] fc6 -> fc6
-I0906 13:34:40.389183  8316 net.cpp:121] Setting up fc6
-I0906 13:34:45.208031  8316 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:34:45.208055  8316 net.cpp:134] Memory required for data: 823332800
-I0906 13:34:45.208081  8316 layer_factory.hpp:74] Creating layer relu6
-I0906 13:34:45.208112  8316 net.cpp:91] Creating Layer relu6
-I0906 13:34:45.208128  8316 net.cpp:411] relu6 <- fc6
-I0906 13:34:45.208154  8316 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:34:45.208210  8316 net.cpp:121] Setting up relu6
-I0906 13:34:45.208220  8316 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:34:45.208223  8316 net.cpp:134] Memory required for data: 824971200
-I0906 13:34:45.208228  8316 layer_factory.hpp:74] Creating layer fc7
-I0906 13:34:45.208250  8316 net.cpp:91] Creating Layer fc7
-I0906 13:34:45.208256  8316 net.cpp:411] fc7 <- fc6
-I0906 13:34:45.208273  8316 net.cpp:369] fc7 -> fc7
-I0906 13:34:45.208288  8316 net.cpp:121] Setting up fc7
-I0906 13:34:47.352208  8316 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:34:47.352234  8316 net.cpp:134] Memory required for data: 826609600
-I0906 13:34:47.352262  8316 layer_factory.hpp:74] Creating layer relu7
-I0906 13:34:47.352295  8316 net.cpp:91] Creating Layer relu7
-I0906 13:34:47.352311  8316 net.cpp:411] relu7 <- fc7
-I0906 13:34:47.352339  8316 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:34:47.352355  8316 net.cpp:121] Setting up relu7
-I0906 13:34:47.352363  8316 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:34:47.352368  8316 net.cpp:134] Memory required for data: 828248000
-I0906 13:34:47.352373  8316 layer_factory.hpp:74] Creating layer fc8
-I0906 13:34:47.352396  8316 net.cpp:91] Creating Layer fc8
-I0906 13:34:47.352402  8316 net.cpp:411] fc8 <- fc7
-I0906 13:34:47.352418  8316 net.cpp:369] fc8 -> fc8
-I0906 13:34:47.352433  8316 net.cpp:121] Setting up fc8
-I0906 13:34:47.878074  8316 net.cpp:128] Top shape: 100 1000 (100000)
-I0906 13:34:47.878098  8316 net.cpp:134] Memory required for data: 828648000
-I0906 13:34:47.878126  8316 layer_factory.hpp:74] Creating layer loss
-I0906 13:34:47.878178  8316 net.cpp:91] Creating Layer loss
-I0906 13:34:47.878195  8316 net.cpp:411] loss <- fc8
-I0906 13:34:47.878217  8316 net.cpp:411] loss <- label
-I0906 13:34:47.878237  8316 net.cpp:369] loss -> loss
-I0906 13:34:47.878255  8316 net.cpp:121] Setting up loss
-I0906 13:34:47.878273  8316 layer_factory.hpp:74] Creating layer loss
-I0906 13:34:47.878825  8316 net.cpp:128] Top shape: (1)
-I0906 13:34:47.878831  8316 net.cpp:130]     with loss weight 1
-I0906 13:34:47.878847  8316 net.cpp:134] Memory required for data: 828648004
-I0906 13:34:47.878856  8316 net.cpp:193] loss needs backward computation.
-I0906 13:34:47.878865  8316 net.cpp:193] fc8 needs backward computation.
-I0906 13:34:47.878870  8316 net.cpp:193] relu7 needs backward computation.
-I0906 13:34:47.878876  8316 net.cpp:193] fc7 needs backward computation.
-I0906 13:34:47.878882  8316 net.cpp:193] relu6 needs backward computation.
-I0906 13:34:47.878888  8316 net.cpp:193] fc6 needs backward computation.
-I0906 13:34:47.878895  8316 net.cpp:193] pool5 needs backward computation.
-I0906 13:34:47.878901  8316 net.cpp:193] relu5 needs backward computation.
-I0906 13:34:47.878906  8316 net.cpp:193] conv5 needs backward computation.
-I0906 13:34:47.878911  8316 net.cpp:193] relu4 needs backward computation.
-I0906 13:34:47.878917  8316 net.cpp:193] conv4 needs backward computation.
-I0906 13:34:47.878923  8316 net.cpp:193] relu3 needs backward computation.
-I0906 13:34:47.878928  8316 net.cpp:193] conv3 needs backward computation.
-I0906 13:34:47.878936  8316 net.cpp:193] pool2 needs backward computation.
-I0906 13:34:47.878942  8316 net.cpp:193] norm2 needs backward computation.
-I0906 13:34:47.878948  8316 net.cpp:193] relu2 needs backward computation.
-I0906 13:34:47.878953  8316 net.cpp:193] conv2 needs backward computation.
-I0906 13:34:47.878959  8316 net.cpp:193] pool1 needs backward computation.
-I0906 13:34:47.878965  8316 net.cpp:193] norm1 needs backward computation.
-I0906 13:34:47.878972  8316 net.cpp:193] relu1 needs backward computation.
-I0906 13:34:47.878978  8316 net.cpp:193] conv1 needs backward computation.
-I0906 13:34:47.878984  8316 net.cpp:195] data does not need backward computation.
-I0906 13:34:47.878993  8316 net.cpp:236] This network produces output loss
-I0906 13:34:47.879026  8316 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:34:47.879042  8316 net.cpp:248] Network initialization done.
-I0906 13:34:47.879045  8316 net.cpp:249] Memory required for data: 828648004
-I0906 13:34:47.880003  8316 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
-I0906 13:34:47.880131  8316 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
-I0906 13:34:47.880362  8316 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TEST
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TEST
-  }
-  transform_param {
-    mirror: false
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
-    batch_size: 50
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "accuracy"
-  type: "Accuracy"
-  bottom: "fc8"
-  bottom: "label"
-  top: "accuracy"
-  include {
-    phase: TEST
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:34:47.880718  8316 net.cpp:68] Memory required for data: 0
-I0906 13:34:47.880764  8316 layer_factory.hpp:74] Creating layer data
-I0906 13:34:47.880786  8316 net.cpp:91] Creating Layer data
-I0906 13:34:47.880797  8316 net.cpp:369] data -> data
-I0906 13:34:47.880820  8316 net.cpp:369] data -> label
-I0906 13:34:47.880832  8316 net.cpp:121] Setting up data
-I0906 13:34:47.880839  8316 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:34:47.890487  8316 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
-I0906 13:34:47.890738  8316 data_layer.cpp:53] output data size: 50,3,227,227
-I0906 13:34:47.907624  8316 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:34:47.907733  8316 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:34:47.907762  8316 net.cpp:128] Top shape: 50 3 227 227 (7729350)
-I0906 13:34:47.907769  8316 net.cpp:128] Top shape: 50 (50)
-I0906 13:34:47.907773  8316 net.cpp:134] Memory required for data: 30917600
-I0906 13:34:47.907805  8316 layer_factory.hpp:74] Creating layer label_data_1_split
-I0906 13:34:47.907896  8316 net.cpp:91] Creating Layer label_data_1_split
-I0906 13:34:47.907917  8316 net.cpp:411] label_data_1_split <- label
-I0906 13:34:47.907979  8316 net.cpp:369] label_data_1_split -> label_data_1_split_0
-I0906 13:34:47.908016  8316 net.cpp:369] label_data_1_split -> label_data_1_split_1
-I0906 13:34:47.908028  8316 net.cpp:121] Setting up label_data_1_split
-I0906 13:34:47.908057  8316 net.cpp:128] Top shape: 50 (50)
-I0906 13:34:47.908064  8316 net.cpp:128] Top shape: 50 (50)
-I0906 13:34:47.908068  8316 net.cpp:134] Memory required for data: 30918000
-I0906 13:34:47.908073  8316 layer_factory.hpp:74] Creating layer conv1
-I0906 13:34:47.908112  8316 net.cpp:91] Creating Layer conv1
-I0906 13:34:47.908118  8316 net.cpp:411] conv1 <- data
-I0906 13:34:47.908133  8316 net.cpp:369] conv1 -> conv1
-I0906 13:34:47.908148  8316 net.cpp:121] Setting up conv1
-I0906 13:34:47.912806  8316 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:34:47.912811  8316 net.cpp:134] Memory required for data: 88998000
-I0906 13:34:47.912832  8316 layer_factory.hpp:74] Creating layer relu1
-I0906 13:34:47.912844  8316 net.cpp:91] Creating Layer relu1
-I0906 13:34:47.912850  8316 net.cpp:411] relu1 <- conv1
-I0906 13:34:47.912863  8316 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:34:47.912873  8316 net.cpp:121] Setting up relu1
-I0906 13:34:47.912880  8316 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:34:47.912883  8316 net.cpp:134] Memory required for data: 147078000
-I0906 13:34:47.912889  8316 layer_factory.hpp:74] Creating layer norm1
-I0906 13:34:47.912907  8316 net.cpp:91] Creating Layer norm1
-I0906 13:34:47.912912  8316 net.cpp:411] norm1 <- conv1
-I0906 13:34:47.912925  8316 net.cpp:369] norm1 -> norm1
-I0906 13:34:47.912936  8316 net.cpp:121] Setting up norm1
-I0906 13:34:47.912955  8316 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:34:47.912999  8316 net.cpp:134] Memory required for data: 205158000
-I0906 13:34:47.913004  8316 layer_factory.hpp:74] Creating layer pool1
-I0906 13:34:47.913022  8316 net.cpp:91] Creating Layer pool1
-I0906 13:34:47.913027  8316 net.cpp:411] pool1 <- norm1
-I0906 13:34:47.913040  8316 net.cpp:369] pool1 -> pool1
-I0906 13:34:47.913050  8316 net.cpp:121] Setting up pool1
-I0906 13:34:47.913069  8316 net.cpp:128] Top shape: 50 96 27 27 (3499200)
-I0906 13:34:47.913074  8316 net.cpp:134] Memory required for data: 219154800
-I0906 13:34:47.913079  8316 layer_factory.hpp:74] Creating layer conv2
-I0906 13:34:47.913091  8316 net.cpp:91] Creating Layer conv2
-I0906 13:34:47.913096  8316 net.cpp:411] conv2 <- pool1
-I0906 13:34:47.913111  8316 net.cpp:369] conv2 -> conv2
-I0906 13:34:47.913123  8316 net.cpp:121] Setting up conv2
-I0906 13:34:47.952414  8316 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:34:47.952428  8316 net.cpp:134] Memory required for data: 256479600
-I0906 13:34:47.952455  8316 layer_factory.hpp:74] Creating layer relu2
-I0906 13:34:47.952477  8316 net.cpp:91] Creating Layer relu2
-I0906 13:34:47.952487  8316 net.cpp:411] relu2 <- conv2
-I0906 13:34:47.952507  8316 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:34:47.952518  8316 net.cpp:121] Setting up relu2
-I0906 13:34:47.952527  8316 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:34:47.952532  8316 net.cpp:134] Memory required for data: 293804400
-I0906 13:34:47.952536  8316 layer_factory.hpp:74] Creating layer norm2
-I0906 13:34:47.952558  8316 net.cpp:91] Creating Layer norm2
-I0906 13:34:47.952564  8316 net.cpp:411] norm2 <- conv2
-I0906 13:34:47.952577  8316 net.cpp:369] norm2 -> norm2
-I0906 13:34:47.952591  8316 net.cpp:121] Setting up norm2
-I0906 13:34:47.952610  8316 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:34:47.952615  8316 net.cpp:134] Memory required for data: 331129200
-I0906 13:34:47.952620  8316 layer_factory.hpp:74] Creating layer pool2
-I0906 13:34:47.952635  8316 net.cpp:91] Creating Layer pool2
-I0906 13:34:47.952641  8316 net.cpp:411] pool2 <- norm2
-I0906 13:34:47.952653  8316 net.cpp:369] pool2 -> pool2
-I0906 13:34:47.952663  8316 net.cpp:121] Setting up pool2
-I0906 13:34:47.952682  8316 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:34:47.952685  8316 net.cpp:134] Memory required for data: 339782000
-I0906 13:34:47.952690  8316 layer_factory.hpp:74] Creating layer conv3
-I0906 13:34:47.952713  8316 net.cpp:91] Creating Layer conv3
-I0906 13:34:47.952718  8316 net.cpp:411] conv3 <- pool2
-I0906 13:34:47.952733  8316 net.cpp:369] conv3 -> conv3
-I0906 13:34:47.952744  8316 net.cpp:121] Setting up conv3
-I0906 13:34:48.002686  8321 data_layer.cpp:120] Prefetch batch: 94 ms.
-I0906 13:34:48.002718  8321 data_layer.cpp:121]      Read time: 12.003 ms.
-I0906 13:34:48.002725  8321 data_layer.cpp:122] Transform time: 81.802 ms.
-I0906 13:34:48.066742  8316 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:34:48.066764  8316 net.cpp:134] Memory required for data: 352761200
-I0906 13:34:48.066805  8316 layer_factory.hpp:74] Creating layer relu3
-I0906 13:34:48.066839  8316 net.cpp:91] Creating Layer relu3
-I0906 13:34:48.066854  8316 net.cpp:411] relu3 <- conv3
-I0906 13:34:48.066880  8316 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:34:48.066897  8316 net.cpp:121] Setting up relu3
-I0906 13:34:48.066906  8316 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:34:48.066910  8316 net.cpp:134] Memory required for data: 365740400
-I0906 13:34:48.066915  8316 layer_factory.hpp:74] Creating layer conv4
-I0906 13:34:48.066942  8316 net.cpp:91] Creating Layer conv4
-I0906 13:34:48.066947  8316 net.cpp:411] conv4 <- conv3
-I0906 13:34:48.066964  8316 net.cpp:369] conv4 -> conv4
-I0906 13:34:48.066979  8316 net.cpp:121] Setting up conv4
-I0906 13:34:48.151291  8316 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:34:48.151312  8316 net.cpp:134] Memory required for data: 378719600
-I0906 13:34:48.151340  8316 layer_factory.hpp:74] Creating layer relu4
-I0906 13:34:48.151372  8316 net.cpp:91] Creating Layer relu4
-I0906 13:34:48.151430  8316 net.cpp:411] relu4 <- conv4
-I0906 13:34:48.151458  8316 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:34:48.151473  8316 net.cpp:121] Setting up relu4
-I0906 13:34:48.151482  8316 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:34:48.151486  8316 net.cpp:134] Memory required for data: 391698800
-I0906 13:34:48.151491  8316 layer_factory.hpp:74] Creating layer conv5
-I0906 13:34:48.151517  8316 net.cpp:91] Creating Layer conv5
-I0906 13:34:48.151523  8316 net.cpp:411] conv5 <- conv4
-I0906 13:34:48.151540  8316 net.cpp:369] conv5 -> conv5
-I0906 13:34:48.151554  8316 net.cpp:121] Setting up conv5
-I0906 13:34:48.208228  8316 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:34:48.208250  8316 net.cpp:134] Memory required for data: 400351600
-I0906 13:34:48.208292  8316 layer_factory.hpp:74] Creating layer relu5
-I0906 13:34:48.208322  8316 net.cpp:91] Creating Layer relu5
-I0906 13:34:48.208336  8316 net.cpp:411] relu5 <- conv5
-I0906 13:34:48.208360  8316 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:34:48.208376  8316 net.cpp:121] Setting up relu5
-I0906 13:34:48.208385  8316 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:34:48.208389  8316 net.cpp:134] Memory required for data: 409004400
-I0906 13:34:48.208395  8316 layer_factory.hpp:74] Creating layer pool5
-I0906 13:34:48.208425  8316 net.cpp:91] Creating Layer pool5
-I0906 13:34:48.208431  8316 net.cpp:411] pool5 <- conv5
-I0906 13:34:48.208446  8316 net.cpp:369] pool5 -> pool5
-I0906 13:34:48.208459  8316 net.cpp:121] Setting up pool5
-I0906 13:34:48.208479  8316 net.cpp:128] Top shape: 50 256 6 6 (460800)
-I0906 13:34:48.208483  8316 net.cpp:134] Memory required for data: 410847600
-I0906 13:34:48.208488  8316 layer_factory.hpp:74] Creating layer fc6
-I0906 13:34:48.208510  8316 net.cpp:91] Creating Layer fc6
-I0906 13:34:48.208516  8316 net.cpp:411] fc6 <- pool5
-I0906 13:34:48.208530  8316 net.cpp:369] fc6 -> fc6
-I0906 13:34:48.208544  8316 net.cpp:121] Setting up fc6
-I0906 13:34:52.951850  8316 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:34:52.951876  8316 net.cpp:134] Memory required for data: 411666800
-I0906 13:34:52.951903  8316 layer_factory.hpp:74] Creating layer relu6
-I0906 13:34:52.951944  8316 net.cpp:91] Creating Layer relu6
-I0906 13:34:52.951961  8316 net.cpp:411] relu6 <- fc6
-I0906 13:34:52.951987  8316 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:34:52.952003  8316 net.cpp:121] Setting up relu6
-I0906 13:34:52.952010  8316 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:34:52.952014  8316 net.cpp:134] Memory required for data: 412486000
-I0906 13:34:52.952019  8316 layer_factory.hpp:74] Creating layer fc7
-I0906 13:34:52.952044  8316 net.cpp:91] Creating Layer fc7
-I0906 13:34:52.952049  8316 net.cpp:411] fc7 <- fc6
-I0906 13:34:52.952065  8316 net.cpp:369] fc7 -> fc7
-I0906 13:34:52.952080  8316 net.cpp:121] Setting up fc7
-I0906 13:34:55.059911  8316 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:34:55.059948  8316 net.cpp:134] Memory required for data: 413305200
-I0906 13:34:55.059976  8316 layer_factory.hpp:74] Creating layer relu7
-I0906 13:34:55.060010  8316 net.cpp:91] Creating Layer relu7
-I0906 13:34:55.060025  8316 net.cpp:411] relu7 <- fc7
-I0906 13:34:55.060053  8316 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:34:55.060070  8316 net.cpp:121] Setting up relu7
-I0906 13:34:55.060078  8316 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:34:55.060082  8316 net.cpp:134] Memory required for data: 414124400
-I0906 13:34:55.060087  8316 layer_factory.hpp:74] Creating layer fc8
-I0906 13:34:55.060109  8316 net.cpp:91] Creating Layer fc8
-I0906 13:34:55.060116  8316 net.cpp:411] fc8 <- fc7
-I0906 13:34:55.060132  8316 net.cpp:369] fc8 -> fc8
-I0906 13:34:55.060156  8316 net.cpp:121] Setting up fc8
-I0906 13:34:55.576926  8316 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:34:55.576946  8316 net.cpp:134] Memory required for data: 414324400
-I0906 13:34:55.576972  8316 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
-I0906 13:34:55.577006  8316 net.cpp:91] Creating Layer fc8_fc8_0_split
-I0906 13:34:55.577097  8316 net.cpp:411] fc8_fc8_0_split <- fc8
-I0906 13:34:55.577136  8316 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
-I0906 13:34:55.577162  8316 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
-I0906 13:34:55.577173  8316 net.cpp:121] Setting up fc8_fc8_0_split
-I0906 13:34:55.577191  8316 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:34:55.577198  8316 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:34:55.577201  8316 net.cpp:134] Memory required for data: 414724400
-I0906 13:34:55.577206  8316 layer_factory.hpp:74] Creating layer accuracy
-I0906 13:34:55.577237  8316 net.cpp:91] Creating Layer accuracy
-I0906 13:34:55.577244  8316 net.cpp:411] accuracy <- fc8_fc8_0_split_0
-I0906 13:34:55.577255  8316 net.cpp:411] accuracy <- label_data_1_split_0
-I0906 13:34:55.577266  8316 net.cpp:369] accuracy -> accuracy
-I0906 13:34:55.577277  8316 net.cpp:121] Setting up accuracy
-I0906 13:34:55.577293  8316 net.cpp:128] Top shape: (1)
-I0906 13:34:55.577297  8316 net.cpp:134] Memory required for data: 414724404
-I0906 13:34:55.577302  8316 layer_factory.hpp:74] Creating layer loss
-I0906 13:34:55.577314  8316 net.cpp:91] Creating Layer loss
-I0906 13:34:55.577321  8316 net.cpp:411] loss <- fc8_fc8_0_split_1
-I0906 13:34:55.577332  8316 net.cpp:411] loss <- label_data_1_split_1
-I0906 13:34:55.577342  8316 net.cpp:369] loss -> loss
-I0906 13:34:55.577353  8316 net.cpp:121] Setting up loss
-I0906 13:34:55.577363  8316 layer_factory.hpp:74] Creating layer loss
-I0906 13:34:55.577759  8316 net.cpp:128] Top shape: (1)
-I0906 13:34:55.577764  8316 net.cpp:130]     with loss weight 1
-I0906 13:34:55.577780  8316 net.cpp:134] Memory required for data: 414724408
-I0906 13:34:55.577786  8316 net.cpp:193] loss needs backward computation.
-I0906 13:34:55.577795  8316 net.cpp:195] accuracy does not need backward computation.
-I0906 13:34:55.577801  8316 net.cpp:193] fc8_fc8_0_split needs backward computation.
-I0906 13:34:55.577807  8316 net.cpp:193] fc8 needs backward computation.
-I0906 13:34:55.577813  8316 net.cpp:193] relu7 needs backward computation.
-I0906 13:34:55.577818  8316 net.cpp:193] fc7 needs backward computation.
-I0906 13:34:55.577824  8316 net.cpp:193] relu6 needs backward computation.
-I0906 13:34:55.577831  8316 net.cpp:193] fc6 needs backward computation.
-I0906 13:34:55.577836  8316 net.cpp:193] pool5 needs backward computation.
-I0906 13:34:55.577842  8316 net.cpp:193] relu5 needs backward computation.
-I0906 13:34:55.577847  8316 net.cpp:193] conv5 needs backward computation.
-I0906 13:34:55.577853  8316 net.cpp:193] relu4 needs backward computation.
-I0906 13:34:55.577859  8316 net.cpp:193] conv4 needs backward computation.
-I0906 13:34:55.577864  8316 net.cpp:193] relu3 needs backward computation.
-I0906 13:34:55.577870  8316 net.cpp:193] conv3 needs backward computation.
-I0906 13:34:55.577877  8316 net.cpp:193] pool2 needs backward computation.
-I0906 13:34:55.577883  8316 net.cpp:193] norm2 needs backward computation.
-I0906 13:34:55.577888  8316 net.cpp:193] relu2 needs backward computation.
-I0906 13:34:55.577893  8316 net.cpp:193] conv2 needs backward computation.
-I0906 13:34:55.577899  8316 net.cpp:193] pool1 needs backward computation.
-I0906 13:34:55.577905  8316 net.cpp:193] norm1 needs backward computation.
-I0906 13:34:55.577911  8316 net.cpp:193] relu1 needs backward computation.
-I0906 13:34:55.577916  8316 net.cpp:193] conv1 needs backward computation.
-I0906 13:34:55.577924  8316 net.cpp:195] label_data_1_split does not need backward computation.
-I0906 13:34:55.577931  8316 net.cpp:195] data does not need backward computation.
-I0906 13:34:55.577936  8316 net.cpp:236] This network produces output accuracy
-I0906 13:34:55.577942  8316 net.cpp:236] This network produces output loss
-I0906 13:34:55.577977  8316 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:34:55.577991  8316 net.cpp:248] Network initialization done.
-I0906 13:34:55.577996  8316 net.cpp:249] Memory required for data: 414724408
-I0906 13:34:55.578182  8316 solver.cpp:53] Solver scaffolding done.
-I0906 13:34:55.578306  8316 solver.cpp:270] Solving AlexNet
-I0906 13:34:55.578330  8316 solver.cpp:271] Learning Rate Policy: step
-I0906 13:34:55.580096  8316 solver.cpp:314] Iteration 0, Testing net (#0)
-I0906 13:34:55.580111  8316 net.cpp:696] Copying source layer data
-I0906 13:34:55.580116  8316 net.cpp:696] Copying source layer conv1
-I0906 13:34:55.583168  8316 net.cpp:696] Copying source layer relu1
-I0906 13:34:55.583199  8316 net.cpp:696] Copying source layer norm1
-I0906 13:34:55.583204  8316 net.cpp:696] Copying source layer pool1
-I0906 13:34:55.583209  8316 net.cpp:696] Copying source layer conv2
-I0906 13:34:55.583320  8316 net.cpp:696] Copying source layer relu2
-I0906 13:34:55.583326  8316 net.cpp:696] Copying source layer norm2
-I0906 13:34:55.583331  8316 net.cpp:696] Copying source layer pool2
-I0906 13:34:55.583335  8316 net.cpp:696] Copying source layer conv3
-I0906 13:34:55.583690  8316 net.cpp:696] Copying source layer relu3
-I0906 13:34:55.583698  8316 net.cpp:696] Copying source layer conv4
-I0906 13:34:55.583895  8316 net.cpp:696] Copying source layer relu4
-I0906 13:34:55.583902  8316 net.cpp:696] Copying source layer conv5
-I0906 13:34:55.584177  8316 net.cpp:696] Copying source layer relu5
-I0906 13:34:55.584185  8316 net.cpp:696] Copying source layer pool5
-I0906 13:34:55.584189  8316 net.cpp:696] Copying source layer fc6
-I0906 13:34:55.589432  8316 net.cpp:696] Copying source layer relu6
-I0906 13:34:55.589460  8316 net.cpp:696] Copying source layer fc7
-I0906 13:34:55.592273  8316 net.cpp:696] Copying source layer relu7
-I0906 13:34:55.592288  8316 net.cpp:696] Copying source layer fc8
-I0906 13:34:55.593138  8316 net.cpp:696] Copying source layer loss
-I0906 13:34:55.593260  8316 base_data_layer.cpp:89] Thread joined
-I0906 13:34:55.597589  8316 base_data_layer.cpp:93] Prefetch copied
-I0906 13:34:55.597887  8316 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:34:55.695569  8322 data_layer.cpp:120] Prefetch batch: 97 ms.
-I0906 13:34:55.695600  8322 data_layer.cpp:121]      Read time: 13.209 ms.
-I0906 13:34:55.695606  8322 data_layer.cpp:122] Transform time: 83.025 ms.
-I0906 13:34:58.623245  8316 solver.cpp:363]     Test net output #0: accuracy = 0
-I0906 13:34:58.623273  8316 solver.cpp:363]     Test net output #1: loss = 6.91124 (* 1 = 6.91124 loss)
-I0906 13:34:58.623322  8316 base_data_layer.cpp:89] Thread joined
-I0906 13:34:58.632244  8316 base_data_layer.cpp:93] Prefetch copied
-I0906 13:34:58.632606  8316 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:34:58.819707  8323 data_layer.cpp:120] Prefetch batch: 186 ms.
-I0906 13:34:58.819741  8323 data_layer.cpp:121]      Read time: 24.148 ms.
-I0906 13:34:58.819747  8323 data_layer.cpp:122] Transform time: 161.152 ms.
-I0906 13:35:05.407784  8316 solver.cpp:234] Iteration 0, loss = 0
-I0906 13:35:05.407842  8316 solver.cpp:249]     Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss)
-I0906 13:35:05.407891  8316 solver.cpp:506] Iteration 0, lr = 0.01
-I0906 13:35:05.525874  8316 base_data_layer.cpp:89] Thread joined
-I0906 13:35:05.533869  8316 base_data_layer.cpp:93] Prefetch copied
-I0906 13:35:05.534140  8316 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:35:05.722632  8328 data_layer.cpp:120] Prefetch batch: 188 ms.
-I0906 13:35:05.722664  8328 data_layer.cpp:121]      Read time: 24.184 ms.
-I0906 13:35:05.722672  8328 data_layer.cpp:122] Transform time: 162.257 ms.
-I0906 13:35:08.300590  8316 solver.cpp:234] Iteration 1, loss = 0
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515
deleted file mode 100644
index 6ec81c82..00000000
--- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135805.16515
+++ /dev/null
@@ -1,1160 +0,0 @@
-Log file created at: 2015/09/06 13:58:05
-Running on machine: AMD-RESEARCH
-Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
-I0906 13:58:05.835170 16515 caffe.cpp:114] Use GPU with device ID 0
-I0906 13:58:05.875704 16515 device.cpp:230] Number of platforms found:1
-I0906 13:58:05.875743 16515 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
-I0906 13:58:05.875757 16515 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
-I0906 13:58:05.875763 16515 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
-I0906 13:58:05.875769 16515 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
-I0906 13:58:05.875774 16515 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
-I0906 13:58:05.875783 16515 device.cpp:286] Number of devices found:1
-I0906 13:58:05.875788 16515 device.cpp:288] 	DeviceID:	0x18ab2f0
-I0906 13:58:05.875809 16515 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
-I0906 13:58:05.875818 16515 device.cpp:393] 	Is it integrated GPU?:	0
-I0906 13:58:05.875823 16515 device.cpp:393] 	Max clock frequency MHz:	930
-I0906 13:58:05.875829 16515 device.cpp:393] 	Host-Device unified mem:	0
-I0906 13:58:05.875834 16515 device.cpp:393] 	ECC support:	0
-I0906 13:58:05.875839 16515 device.cpp:393] 	Endian little:	1
-I0906 13:58:05.875844 16515 device.cpp:393] 	Max compute units:	44
-I0906 13:58:05.875849 16515 device.cpp:393] 	Max work group size:	256
-I0906 13:58:05.875856 16515 device.cpp:393] 	Max work item dimensions:	3
-I0906 13:58:05.875862 16515 device.cpp:393] 	Max work item sizes:	0x100
-I0906 13:58:05.875869 16515 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
-I0906 13:58:05.875875 16515 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
-I0906 13:58:05.875881 16515 device.cpp:393] 	Max mem alloc size:	4244635648
-I0906 13:58:05.875886 16515 device.cpp:393] 	Global mem size:	16878927872
-I0906 13:58:05.875891 16515 device.cpp:393] 	Local mem size:	32768
-I0906 13:58:05.875902 16515 device.cpp:96] Picked device type : GPU 0
-I0906 13:58:08.267483 16515 device.cpp:152] Build Program
-I0906 13:58:08.267706 16515 caffe.cpp:122] Starting Optimization
-I0906 13:58:08.267797 16515 solver.cpp:40] Initializing solver from parameters: 
-test_iter: 1
-test_interval: 1000
-base_lr: 0.01
-display: 1
-max_iter: 10
-lr_policy: "step"
-gamma: 0.1
-momentum: 0.9
-weight_decay: 0.0005
-stepsize: 100000
-snapshot: 10000
-snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
-solver_mode: GPU
-net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
-I0906 13:58:08.267910 16515 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
-I0906 13:58:08.269042 16515 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
-I0906 13:58:08.269093 16515 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
-I0906 13:58:08.269273 16515 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TRAIN
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TRAIN
-  }
-  transform_param {
-    mirror: true
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
-    batch_size: 100
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:58:08.269708 16515 net.cpp:68] Memory required for data: 0
-I0906 13:58:08.269917 16515 layer_factory.hpp:74] Creating layer data
-I0906 13:58:08.269971 16515 net.cpp:91] Creating Layer data
-I0906 13:58:08.269992 16515 net.cpp:369] data -> data
-I0906 13:58:08.270097 16515 net.cpp:369] data -> label
-I0906 13:58:08.270122 16515 net.cpp:121] Setting up data
-I0906 13:58:08.270134 16515 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:58:08.279337 16515 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
-I0906 13:58:08.279680 16515 data_layer.cpp:53] output data size: 100,3,227,227
-I0906 13:58:08.311036 16515 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:58:08.311240 16515 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:58:08.311303 16515 net.cpp:128] Top shape: 100 3 227 227 (15458700)
-I0906 13:58:08.311313 16515 net.cpp:128] Top shape: 100 (100)
-I0906 13:58:08.311318 16515 net.cpp:134] Memory required for data: 61835200
-I0906 13:58:08.311352 16515 layer_factory.hpp:74] Creating layer conv1
-I0906 13:58:08.311431 16515 net.cpp:91] Creating Layer conv1
-I0906 13:58:08.311453 16515 net.cpp:411] conv1 <- data
-I0906 13:58:08.311504 16515 net.cpp:369] conv1 -> conv1
-I0906 13:58:08.311569 16515 net.cpp:121] Setting up conv1
-I0906 13:58:08.316509 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:58:08.316515 16515 net.cpp:134] Memory required for data: 177995200
-I0906 13:58:08.316555 16515 layer_factory.hpp:74] Creating layer relu1
-I0906 13:58:08.316577 16515 net.cpp:91] Creating Layer relu1
-I0906 13:58:08.316583 16515 net.cpp:411] relu1 <- conv1
-I0906 13:58:08.316597 16515 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:58:08.316606 16515 net.cpp:121] Setting up relu1
-I0906 13:58:08.316615 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:58:08.316619 16515 net.cpp:134] Memory required for data: 294155200
-I0906 13:58:08.316623 16515 layer_factory.hpp:74] Creating layer norm1
-I0906 13:58:08.316653 16515 net.cpp:91] Creating Layer norm1
-I0906 13:58:08.316659 16515 net.cpp:411] norm1 <- conv1
-I0906 13:58:08.316673 16515 net.cpp:369] norm1 -> norm1
-I0906 13:58:08.316686 16515 net.cpp:121] Setting up norm1
-I0906 13:58:08.316710 16515 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:58:08.316715 16515 net.cpp:134] Memory required for data: 410315200
-I0906 13:58:08.316720 16515 layer_factory.hpp:74] Creating layer pool1
-I0906 13:58:08.316745 16515 net.cpp:91] Creating Layer pool1
-I0906 13:58:08.316750 16515 net.cpp:411] pool1 <- norm1
-I0906 13:58:08.316763 16515 net.cpp:369] pool1 -> pool1
-I0906 13:58:08.316776 16515 net.cpp:121] Setting up pool1
-I0906 13:58:08.316805 16515 net.cpp:128] Top shape: 100 96 27 27 (6998400)
-I0906 13:58:08.316809 16515 net.cpp:134] Memory required for data: 438308800
-I0906 13:58:08.316814 16515 layer_factory.hpp:74] Creating layer conv2
-I0906 13:58:08.316829 16515 net.cpp:91] Creating Layer conv2
-I0906 13:58:08.316834 16515 net.cpp:411] conv2 <- pool1
-I0906 13:58:08.316850 16515 net.cpp:369] conv2 -> conv2
-I0906 13:58:08.316862 16515 net.cpp:121] Setting up conv2
-I0906 13:58:08.356899 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:58:08.356914 16515 net.cpp:134] Memory required for data: 512958400
-I0906 13:58:08.356945 16515 layer_factory.hpp:74] Creating layer relu2
-I0906 13:58:08.356967 16515 net.cpp:91] Creating Layer relu2
-I0906 13:58:08.356978 16515 net.cpp:411] relu2 <- conv2
-I0906 13:58:08.356998 16515 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:58:08.357012 16515 net.cpp:121] Setting up relu2
-I0906 13:58:08.357022 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:58:08.357025 16515 net.cpp:134] Memory required for data: 587608000
-I0906 13:58:08.357030 16515 layer_factory.hpp:74] Creating layer norm2
-I0906 13:58:08.357046 16515 net.cpp:91] Creating Layer norm2
-I0906 13:58:08.357053 16515 net.cpp:411] norm2 <- conv2
-I0906 13:58:08.357066 16515 net.cpp:369] norm2 -> norm2
-I0906 13:58:08.357079 16515 net.cpp:121] Setting up norm2
-I0906 13:58:08.357108 16515 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:58:08.357113 16515 net.cpp:134] Memory required for data: 662257600
-I0906 13:58:08.357118 16515 layer_factory.hpp:74] Creating layer pool2
-I0906 13:58:08.357146 16515 net.cpp:91] Creating Layer pool2
-I0906 13:58:08.357152 16515 net.cpp:411] pool2 <- norm2
-I0906 13:58:08.357166 16515 net.cpp:369] pool2 -> pool2
-I0906 13:58:08.357177 16515 net.cpp:121] Setting up pool2
-I0906 13:58:08.357200 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:58:08.357204 16515 net.cpp:134] Memory required for data: 679563200
-I0906 13:58:08.357259 16515 layer_factory.hpp:74] Creating layer conv3
-I0906 13:58:08.357281 16515 net.cpp:91] Creating Layer conv3
-I0906 13:58:08.357287 16515 net.cpp:411] conv3 <- pool2
-I0906 13:58:08.357303 16515 net.cpp:369] conv3 -> conv3
-I0906 13:58:08.357318 16515 net.cpp:121] Setting up conv3
-I0906 13:58:08.475977 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:58:08.475999 16515 net.cpp:134] Memory required for data: 705521600
-I0906 13:58:08.476043 16515 layer_factory.hpp:74] Creating layer relu3
-I0906 13:58:08.476078 16515 net.cpp:91] Creating Layer relu3
-I0906 13:58:08.476093 16515 net.cpp:411] relu3 <- conv3
-I0906 13:58:08.476120 16515 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:58:08.476137 16515 net.cpp:121] Setting up relu3
-I0906 13:58:08.476147 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:58:08.476151 16515 net.cpp:134] Memory required for data: 731480000
-I0906 13:58:08.476156 16515 layer_factory.hpp:74] Creating layer conv4
-I0906 13:58:08.476184 16515 net.cpp:91] Creating Layer conv4
-I0906 13:58:08.476191 16515 net.cpp:411] conv4 <- conv3
-I0906 13:58:08.476207 16515 net.cpp:369] conv4 -> conv4
-I0906 13:58:08.476222 16515 net.cpp:121] Setting up conv4
-I0906 13:58:08.500998 16519 data_layer.cpp:120] Prefetch batch: 189 ms.
-I0906 13:58:08.501045 16519 data_layer.cpp:121]      Read time: 23.893 ms.
-I0906 13:58:08.501054 16519 data_layer.cpp:122] Transform time: 163.51 ms.
-I0906 13:58:08.563753 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:58:08.563774 16515 net.cpp:134] Memory required for data: 757438400
-I0906 13:58:08.563802 16515 layer_factory.hpp:74] Creating layer relu4
-I0906 13:58:08.563835 16515 net.cpp:91] Creating Layer relu4
-I0906 13:58:08.563849 16515 net.cpp:411] relu4 <- conv4
-I0906 13:58:08.563876 16515 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:58:08.563892 16515 net.cpp:121] Setting up relu4
-I0906 13:58:08.563902 16515 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:58:08.563906 16515 net.cpp:134] Memory required for data: 783396800
-I0906 13:58:08.563911 16515 layer_factory.hpp:74] Creating layer conv5
-I0906 13:58:08.563946 16515 net.cpp:91] Creating Layer conv5
-I0906 13:58:08.563951 16515 net.cpp:411] conv5 <- conv4
-I0906 13:58:08.563968 16515 net.cpp:369] conv5 -> conv5
-I0906 13:58:08.563982 16515 net.cpp:121] Setting up conv5
-I0906 13:58:08.621495 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:58:08.621512 16515 net.cpp:134] Memory required for data: 800702400
-I0906 13:58:08.621553 16515 layer_factory.hpp:74] Creating layer relu5
-I0906 13:58:08.621584 16515 net.cpp:91] Creating Layer relu5
-I0906 13:58:08.621598 16515 net.cpp:411] relu5 <- conv5
-I0906 13:58:08.621623 16515 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:58:08.621639 16515 net.cpp:121] Setting up relu5
-I0906 13:58:08.621649 16515 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:58:08.621652 16515 net.cpp:134] Memory required for data: 818008000
-I0906 13:58:08.621657 16515 layer_factory.hpp:74] Creating layer pool5
-I0906 13:58:08.621677 16515 net.cpp:91] Creating Layer pool5
-I0906 13:58:08.621683 16515 net.cpp:411] pool5 <- conv5
-I0906 13:58:08.621697 16515 net.cpp:369] pool5 -> pool5
-I0906 13:58:08.621711 16515 net.cpp:121] Setting up pool5
-I0906 13:58:08.621732 16515 net.cpp:128] Top shape: 100 256 6 6 (921600)
-I0906 13:58:08.621737 16515 net.cpp:134] Memory required for data: 821694400
-I0906 13:58:08.621742 16515 layer_factory.hpp:74] Creating layer fc6
-I0906 13:58:08.621778 16515 net.cpp:91] Creating Layer fc6
-I0906 13:58:08.621783 16515 net.cpp:411] fc6 <- pool5
-I0906 13:58:08.621798 16515 net.cpp:369] fc6 -> fc6
-I0906 13:58:08.621812 16515 net.cpp:121] Setting up fc6
-I0906 13:58:13.492439 16515 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:58:13.492465 16515 net.cpp:134] Memory required for data: 823332800
-I0906 13:58:13.492493 16515 layer_factory.hpp:74] Creating layer relu6
-I0906 13:58:13.492527 16515 net.cpp:91] Creating Layer relu6
-I0906 13:58:13.492542 16515 net.cpp:411] relu6 <- fc6
-I0906 13:58:13.492568 16515 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:58:13.492630 16515 net.cpp:121] Setting up relu6
-I0906 13:58:13.492640 16515 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:58:13.492643 16515 net.cpp:134] Memory required for data: 824971200
-I0906 13:58:13.492648 16515 layer_factory.hpp:74] Creating layer fc7
-I0906 13:58:13.492671 16515 net.cpp:91] Creating Layer fc7
-I0906 13:58:13.492677 16515 net.cpp:411] fc7 <- fc6
-I0906 13:58:13.492693 16515 net.cpp:369] fc7 -> fc7
-I0906 13:58:13.492708 16515 net.cpp:121] Setting up fc7
-I0906 13:58:15.661120 16515 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:58:15.661144 16515 net.cpp:134] Memory required for data: 826609600
-I0906 13:58:15.661171 16515 layer_factory.hpp:74] Creating layer relu7
-I0906 13:58:15.661205 16515 net.cpp:91] Creating Layer relu7
-I0906 13:58:15.661221 16515 net.cpp:411] relu7 <- fc7
-I0906 13:58:15.661247 16515 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:58:15.661263 16515 net.cpp:121] Setting up relu7
-I0906 13:58:15.661273 16515 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:58:15.661276 16515 net.cpp:134] Memory required for data: 828248000
-I0906 13:58:15.661281 16515 layer_factory.hpp:74] Creating layer fc8
-I0906 13:58:15.661304 16515 net.cpp:91] Creating Layer fc8
-I0906 13:58:15.661310 16515 net.cpp:411] fc8 <- fc7
-I0906 13:58:15.661325 16515 net.cpp:369] fc8 -> fc8
-I0906 13:58:15.661340 16515 net.cpp:121] Setting up fc8
-I0906 13:58:16.190832 16515 net.cpp:128] Top shape: 100 1000 (100000)
-I0906 13:58:16.190855 16515 net.cpp:134] Memory required for data: 828648000
-I0906 13:58:16.190881 16515 layer_factory.hpp:74] Creating layer loss
-I0906 13:58:16.190932 16515 net.cpp:91] Creating Layer loss
-I0906 13:58:16.190946 16515 net.cpp:411] loss <- fc8
-I0906 13:58:16.190969 16515 net.cpp:411] loss <- label
-I0906 13:58:16.190989 16515 net.cpp:369] loss -> loss
-I0906 13:58:16.191009 16515 net.cpp:121] Setting up loss
-I0906 13:58:16.191030 16515 layer_factory.hpp:74] Creating layer loss
-I0906 13:58:16.191588 16515 net.cpp:128] Top shape: (1)
-I0906 13:58:16.191593 16515 net.cpp:130]     with loss weight 1
-I0906 13:58:16.191611 16515 net.cpp:134] Memory required for data: 828648004
-I0906 13:58:16.191619 16515 net.cpp:193] loss needs backward computation.
-I0906 13:58:16.191627 16515 net.cpp:193] fc8 needs backward computation.
-I0906 13:58:16.191633 16515 net.cpp:193] relu7 needs backward computation.
-I0906 13:58:16.191639 16515 net.cpp:193] fc7 needs backward computation.
-I0906 13:58:16.191644 16515 net.cpp:193] relu6 needs backward computation.
-I0906 13:58:16.191650 16515 net.cpp:193] fc6 needs backward computation.
-I0906 13:58:16.191655 16515 net.cpp:193] pool5 needs backward computation.
-I0906 13:58:16.191661 16515 net.cpp:193] relu5 needs backward computation.
-I0906 13:58:16.191666 16515 net.cpp:193] conv5 needs backward computation.
-I0906 13:58:16.191673 16515 net.cpp:193] relu4 needs backward computation.
-I0906 13:58:16.191678 16515 net.cpp:193] conv4 needs backward computation.
-I0906 13:58:16.191684 16515 net.cpp:193] relu3 needs backward computation.
-I0906 13:58:16.191689 16515 net.cpp:193] conv3 needs backward computation.
-I0906 13:58:16.191696 16515 net.cpp:193] pool2 needs backward computation.
-I0906 13:58:16.191702 16515 net.cpp:193] norm2 needs backward computation.
-I0906 13:58:16.191709 16515 net.cpp:193] relu2 needs backward computation.
-I0906 13:58:16.191714 16515 net.cpp:193] conv2 needs backward computation.
-I0906 13:58:16.191720 16515 net.cpp:193] pool1 needs backward computation.
-I0906 13:58:16.191725 16515 net.cpp:193] norm1 needs backward computation.
-I0906 13:58:16.191731 16515 net.cpp:193] relu1 needs backward computation.
-I0906 13:58:16.191737 16515 net.cpp:193] conv1 needs backward computation.
-I0906 13:58:16.191745 16515 net.cpp:195] data does not need backward computation.
-I0906 13:58:16.191753 16515 net.cpp:236] This network produces output loss
-I0906 13:58:16.191787 16515 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:58:16.191803 16515 net.cpp:248] Network initialization done.
-I0906 13:58:16.191807 16515 net.cpp:249] Memory required for data: 828648004
-I0906 13:58:16.192769 16515 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
-I0906 13:58:16.192881 16515 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
-I0906 13:58:16.193114 16515 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TEST
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TEST
-  }
-  transform_param {
-    mirror: false
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
-    batch_size: 50
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "accuracy"
-  type: "Accuracy"
-  bottom: "fc8"
-  bottom: "label"
-  top: "accuracy"
-  include {
-    phase: TEST
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:58:16.193480 16515 net.cpp:68] Memory required for data: 0
-I0906 13:58:16.193527 16515 layer_factory.hpp:74] Creating layer data
-I0906 13:58:16.193549 16515 net.cpp:91] Creating Layer data
-I0906 13:58:16.193559 16515 net.cpp:369] data -> data
-I0906 13:58:16.193583 16515 net.cpp:369] data -> label
-I0906 13:58:16.193595 16515 net.cpp:121] Setting up data
-I0906 13:58:16.193603 16515 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:58:16.202100 16515 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
-I0906 13:58:16.202343 16515 data_layer.cpp:53] output data size: 50,3,227,227
-I0906 13:58:16.219017 16515 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:58:16.219137 16515 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:58:16.219171 16515 net.cpp:128] Top shape: 50 3 227 227 (7729350)
-I0906 13:58:16.219179 16515 net.cpp:128] Top shape: 50 (50)
-I0906 13:58:16.219183 16515 net.cpp:134] Memory required for data: 30917600
-I0906 13:58:16.219214 16515 layer_factory.hpp:74] Creating layer label_data_1_split
-I0906 13:58:16.219279 16515 net.cpp:91] Creating Layer label_data_1_split
-I0906 13:58:16.219293 16515 net.cpp:411] label_data_1_split <- label
-I0906 13:58:16.219367 16515 net.cpp:369] label_data_1_split -> label_data_1_split_0
-I0906 13:58:16.219409 16515 net.cpp:369] label_data_1_split -> label_data_1_split_1
-I0906 13:58:16.219420 16515 net.cpp:121] Setting up label_data_1_split
-I0906 13:58:16.219455 16515 net.cpp:128] Top shape: 50 (50)
-I0906 13:58:16.219462 16515 net.cpp:128] Top shape: 50 (50)
-I0906 13:58:16.219466 16515 net.cpp:134] Memory required for data: 30918000
-I0906 13:58:16.219471 16515 layer_factory.hpp:74] Creating layer conv1
-I0906 13:58:16.219508 16515 net.cpp:91] Creating Layer conv1
-I0906 13:58:16.219513 16515 net.cpp:411] conv1 <- data
-I0906 13:58:16.219530 16515 net.cpp:369] conv1 -> conv1
-I0906 13:58:16.219545 16515 net.cpp:121] Setting up conv1
-I0906 13:58:16.224315 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:58:16.224321 16515 net.cpp:134] Memory required for data: 88998000
-I0906 13:58:16.224341 16515 layer_factory.hpp:74] Creating layer relu1
-I0906 13:58:16.224354 16515 net.cpp:91] Creating Layer relu1
-I0906 13:58:16.224360 16515 net.cpp:411] relu1 <- conv1
-I0906 13:58:16.224372 16515 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:58:16.224382 16515 net.cpp:121] Setting up relu1
-I0906 13:58:16.224390 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:58:16.224393 16515 net.cpp:134] Memory required for data: 147078000
-I0906 13:58:16.224398 16515 layer_factory.hpp:74] Creating layer norm1
-I0906 13:58:16.224417 16515 net.cpp:91] Creating Layer norm1
-I0906 13:58:16.224423 16515 net.cpp:411] norm1 <- conv1
-I0906 13:58:16.224436 16515 net.cpp:369] norm1 -> norm1
-I0906 13:58:16.224447 16515 net.cpp:121] Setting up norm1
-I0906 13:58:16.224465 16515 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:58:16.224508 16515 net.cpp:134] Memory required for data: 205158000
-I0906 13:58:16.224514 16515 layer_factory.hpp:74] Creating layer pool1
-I0906 13:58:16.224529 16515 net.cpp:91] Creating Layer pool1
-I0906 13:58:16.224534 16515 net.cpp:411] pool1 <- norm1
-I0906 13:58:16.224547 16515 net.cpp:369] pool1 -> pool1
-I0906 13:58:16.224558 16515 net.cpp:121] Setting up pool1
-I0906 13:58:16.224576 16515 net.cpp:128] Top shape: 50 96 27 27 (3499200)
-I0906 13:58:16.224581 16515 net.cpp:134] Memory required for data: 219154800
-I0906 13:58:16.224586 16515 layer_factory.hpp:74] Creating layer conv2
-I0906 13:58:16.224601 16515 net.cpp:91] Creating Layer conv2
-I0906 13:58:16.224606 16515 net.cpp:411] conv2 <- pool1
-I0906 13:58:16.224620 16515 net.cpp:369] conv2 -> conv2
-I0906 13:58:16.224632 16515 net.cpp:121] Setting up conv2
-I0906 13:58:16.264878 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:58:16.264889 16515 net.cpp:134] Memory required for data: 256479600
-I0906 13:58:16.264916 16515 layer_factory.hpp:74] Creating layer relu2
-I0906 13:58:16.264937 16515 net.cpp:91] Creating Layer relu2
-I0906 13:58:16.264946 16515 net.cpp:411] relu2 <- conv2
-I0906 13:58:16.264966 16515 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:58:16.264978 16515 net.cpp:121] Setting up relu2
-I0906 13:58:16.264987 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:58:16.264991 16515 net.cpp:134] Memory required for data: 293804400
-I0906 13:58:16.264997 16515 layer_factory.hpp:74] Creating layer norm2
-I0906 13:58:16.265015 16515 net.cpp:91] Creating Layer norm2
-I0906 13:58:16.265022 16515 net.cpp:411] norm2 <- conv2
-I0906 13:58:16.265035 16515 net.cpp:369] norm2 -> norm2
-I0906 13:58:16.265050 16515 net.cpp:121] Setting up norm2
-I0906 13:58:16.265072 16515 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:58:16.265077 16515 net.cpp:134] Memory required for data: 331129200
-I0906 13:58:16.265082 16515 layer_factory.hpp:74] Creating layer pool2
-I0906 13:58:16.265097 16515 net.cpp:91] Creating Layer pool2
-I0906 13:58:16.265103 16515 net.cpp:411] pool2 <- norm2
-I0906 13:58:16.265116 16515 net.cpp:369] pool2 -> pool2
-I0906 13:58:16.265127 16515 net.cpp:121] Setting up pool2
-I0906 13:58:16.265149 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:58:16.265153 16515 net.cpp:134] Memory required for data: 339782000
-I0906 13:58:16.265158 16515 layer_factory.hpp:74] Creating layer conv3
-I0906 13:58:16.265179 16515 net.cpp:91] Creating Layer conv3
-I0906 13:58:16.265184 16515 net.cpp:411] conv3 <- pool2
-I0906 13:58:16.265200 16515 net.cpp:369] conv3 -> conv3
-I0906 13:58:16.265213 16515 net.cpp:121] Setting up conv3
-I0906 13:58:16.312928 16520 data_layer.cpp:120] Prefetch batch: 93 ms.
-I0906 13:58:16.312959 16520 data_layer.cpp:121]      Read time: 12.075 ms.
-I0906 13:58:16.312966 16520 data_layer.cpp:122] Transform time: 80.513 ms.
-I0906 13:58:16.381564 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:58:16.381587 16515 net.cpp:134] Memory required for data: 352761200
-I0906 13:58:16.381628 16515 layer_factory.hpp:74] Creating layer relu3
-I0906 13:58:16.381660 16515 net.cpp:91] Creating Layer relu3
-I0906 13:58:16.381675 16515 net.cpp:411] relu3 <- conv3
-I0906 13:58:16.381700 16515 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:58:16.381717 16515 net.cpp:121] Setting up relu3
-I0906 13:58:16.381726 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:58:16.381731 16515 net.cpp:134] Memory required for data: 365740400
-I0906 13:58:16.381734 16515 layer_factory.hpp:74] Creating layer conv4
-I0906 13:58:16.381762 16515 net.cpp:91] Creating Layer conv4
-I0906 13:58:16.381767 16515 net.cpp:411] conv4 <- conv3
-I0906 13:58:16.381783 16515 net.cpp:369] conv4 -> conv4
-I0906 13:58:16.381798 16515 net.cpp:121] Setting up conv4
-I0906 13:58:16.468471 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:58:16.468492 16515 net.cpp:134] Memory required for data: 378719600
-I0906 13:58:16.468518 16515 layer_factory.hpp:74] Creating layer relu4
-I0906 13:58:16.468550 16515 net.cpp:91] Creating Layer relu4
-I0906 13:58:16.468605 16515 net.cpp:411] relu4 <- conv4
-I0906 13:58:16.468633 16515 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:58:16.468649 16515 net.cpp:121] Setting up relu4
-I0906 13:58:16.468658 16515 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:58:16.468662 16515 net.cpp:134] Memory required for data: 391698800
-I0906 13:58:16.468667 16515 layer_factory.hpp:74] Creating layer conv5
-I0906 13:58:16.468694 16515 net.cpp:91] Creating Layer conv5
-I0906 13:58:16.468700 16515 net.cpp:411] conv5 <- conv4
-I0906 13:58:16.468716 16515 net.cpp:369] conv5 -> conv5
-I0906 13:58:16.468731 16515 net.cpp:121] Setting up conv5
-I0906 13:58:16.526487 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:58:16.526507 16515 net.cpp:134] Memory required for data: 400351600
-I0906 13:58:16.526547 16515 layer_factory.hpp:74] Creating layer relu5
-I0906 13:58:16.526577 16515 net.cpp:91] Creating Layer relu5
-I0906 13:58:16.526590 16515 net.cpp:411] relu5 <- conv5
-I0906 13:58:16.526614 16515 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:58:16.526630 16515 net.cpp:121] Setting up relu5
-I0906 13:58:16.526639 16515 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:58:16.526643 16515 net.cpp:134] Memory required for data: 409004400
-I0906 13:58:16.526648 16515 layer_factory.hpp:74] Creating layer pool5
-I0906 13:58:16.526676 16515 net.cpp:91] Creating Layer pool5
-I0906 13:58:16.526682 16515 net.cpp:411] pool5 <- conv5
-I0906 13:58:16.526696 16515 net.cpp:369] pool5 -> pool5
-I0906 13:58:16.526710 16515 net.cpp:121] Setting up pool5
-I0906 13:58:16.526731 16515 net.cpp:128] Top shape: 50 256 6 6 (460800)
-I0906 13:58:16.526734 16515 net.cpp:134] Memory required for data: 410847600
-I0906 13:58:16.526739 16515 layer_factory.hpp:74] Creating layer fc6
-I0906 13:58:16.526762 16515 net.cpp:91] Creating Layer fc6
-I0906 13:58:16.526767 16515 net.cpp:411] fc6 <- pool5
-I0906 13:58:16.526782 16515 net.cpp:369] fc6 -> fc6
-I0906 13:58:16.526794 16515 net.cpp:121] Setting up fc6
-I0906 13:58:21.365124 16515 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:58:21.365149 16515 net.cpp:134] Memory required for data: 411666800
-I0906 13:58:21.365176 16515 layer_factory.hpp:74] Creating layer relu6
-I0906 13:58:21.365211 16515 net.cpp:91] Creating Layer relu6
-I0906 13:58:21.365226 16515 net.cpp:411] relu6 <- fc6
-I0906 13:58:21.365250 16515 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:58:21.365267 16515 net.cpp:121] Setting up relu6
-I0906 13:58:21.365277 16515 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:58:21.365280 16515 net.cpp:134] Memory required for data: 412486000
-I0906 13:58:21.365285 16515 layer_factory.hpp:74] Creating layer fc7
-I0906 13:58:21.365309 16515 net.cpp:91] Creating Layer fc7
-I0906 13:58:21.365314 16515 net.cpp:411] fc7 <- fc6
-I0906 13:58:21.365330 16515 net.cpp:369] fc7 -> fc7
-I0906 13:58:21.365345 16515 net.cpp:121] Setting up fc7
-I0906 13:58:23.510701 16515 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:58:23.510725 16515 net.cpp:134] Memory required for data: 413305200
-I0906 13:58:23.510752 16515 layer_factory.hpp:74] Creating layer relu7
-I0906 13:58:23.510785 16515 net.cpp:91] Creating Layer relu7
-I0906 13:58:23.510800 16515 net.cpp:411] relu7 <- fc7
-I0906 13:58:23.510828 16515 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:58:23.510844 16515 net.cpp:121] Setting up relu7
-I0906 13:58:23.510854 16515 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:58:23.510857 16515 net.cpp:134] Memory required for data: 414124400
-I0906 13:58:23.510862 16515 layer_factory.hpp:74] Creating layer fc8
-I0906 13:58:23.510885 16515 net.cpp:91] Creating Layer fc8
-I0906 13:58:23.510890 16515 net.cpp:411] fc8 <- fc7
-I0906 13:58:23.510906 16515 net.cpp:369] fc8 -> fc8
-I0906 13:58:23.510932 16515 net.cpp:121] Setting up fc8
-I0906 13:58:24.034812 16515 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:58:24.034833 16515 net.cpp:134] Memory required for data: 414324400
-I0906 13:58:24.034860 16515 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
-I0906 13:58:24.034893 16515 net.cpp:91] Creating Layer fc8_fc8_0_split
-I0906 13:58:24.034958 16515 net.cpp:411] fc8_fc8_0_split <- fc8
-I0906 13:58:24.034988 16515 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
-I0906 13:58:24.035012 16515 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
-I0906 13:58:24.035023 16515 net.cpp:121] Setting up fc8_fc8_0_split
-I0906 13:58:24.035040 16515 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:58:24.035046 16515 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:58:24.035050 16515 net.cpp:134] Memory required for data: 414724400
-I0906 13:58:24.035055 16515 layer_factory.hpp:74] Creating layer accuracy
-I0906 13:58:24.035086 16515 net.cpp:91] Creating Layer accuracy
-I0906 13:58:24.035092 16515 net.cpp:411] accuracy <- fc8_fc8_0_split_0
-I0906 13:58:24.035104 16515 net.cpp:411] accuracy <- label_data_1_split_0
-I0906 13:58:24.035115 16515 net.cpp:369] accuracy -> accuracy
-I0906 13:58:24.035126 16515 net.cpp:121] Setting up accuracy
-I0906 13:58:24.035143 16515 net.cpp:128] Top shape: (1)
-I0906 13:58:24.035147 16515 net.cpp:134] Memory required for data: 414724404
-I0906 13:58:24.035152 16515 layer_factory.hpp:74] Creating layer loss
-I0906 13:58:24.035163 16515 net.cpp:91] Creating Layer loss
-I0906 13:58:24.035168 16515 net.cpp:411] loss <- fc8_fc8_0_split_1
-I0906 13:58:24.035179 16515 net.cpp:411] loss <- label_data_1_split_1
-I0906 13:58:24.035190 16515 net.cpp:369] loss -> loss
-I0906 13:58:24.035202 16515 net.cpp:121] Setting up loss
-I0906 13:58:24.035212 16515 layer_factory.hpp:74] Creating layer loss
-I0906 13:58:24.035562 16515 net.cpp:128] Top shape: (1)
-I0906 13:58:24.035567 16515 net.cpp:130]     with loss weight 1
-I0906 13:58:24.035583 16515 net.cpp:134] Memory required for data: 414724408
-I0906 13:58:24.035591 16515 net.cpp:193] loss needs backward computation.
-I0906 13:58:24.035598 16515 net.cpp:195] accuracy does not need backward computation.
-I0906 13:58:24.035605 16515 net.cpp:193] fc8_fc8_0_split needs backward computation.
-I0906 13:58:24.035610 16515 net.cpp:193] fc8 needs backward computation.
-I0906 13:58:24.035616 16515 net.cpp:193] relu7 needs backward computation.
-I0906 13:58:24.035621 16515 net.cpp:193] fc7 needs backward computation.
-I0906 13:58:24.035627 16515 net.cpp:193] relu6 needs backward computation.
-I0906 13:58:24.035634 16515 net.cpp:193] fc6 needs backward computation.
-I0906 13:58:24.035640 16515 net.cpp:193] pool5 needs backward computation.
-I0906 13:58:24.035645 16515 net.cpp:193] relu5 needs backward computation.
-I0906 13:58:24.035651 16515 net.cpp:193] conv5 needs backward computation.
-I0906 13:58:24.035656 16515 net.cpp:193] relu4 needs backward computation.
-I0906 13:58:24.035662 16515 net.cpp:193] conv4 needs backward computation.
-I0906 13:58:24.035668 16515 net.cpp:193] relu3 needs backward computation.
-I0906 13:58:24.035673 16515 net.cpp:193] conv3 needs backward computation.
-I0906 13:58:24.035679 16515 net.cpp:193] pool2 needs backward computation.
-I0906 13:58:24.035686 16515 net.cpp:193] norm2 needs backward computation.
-I0906 13:58:24.035692 16515 net.cpp:193] relu2 needs backward computation.
-I0906 13:58:24.035697 16515 net.cpp:193] conv2 needs backward computation.
-I0906 13:58:24.035703 16515 net.cpp:193] pool1 needs backward computation.
-I0906 13:58:24.035709 16515 net.cpp:193] norm1 needs backward computation.
-I0906 13:58:24.035715 16515 net.cpp:193] relu1 needs backward computation.
-I0906 13:58:24.035720 16515 net.cpp:193] conv1 needs backward computation.
-I0906 13:58:24.035727 16515 net.cpp:195] label_data_1_split does not need backward computation.
-I0906 13:58:24.035734 16515 net.cpp:195] data does not need backward computation.
-I0906 13:58:24.035739 16515 net.cpp:236] This network produces output accuracy
-I0906 13:58:24.035745 16515 net.cpp:236] This network produces output loss
-I0906 13:58:24.035781 16515 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:58:24.035796 16515 net.cpp:248] Network initialization done.
-I0906 13:58:24.035799 16515 net.cpp:249] Memory required for data: 414724408
-I0906 13:58:24.036000 16515 solver.cpp:53] Solver scaffolding done.
-I0906 13:58:24.036130 16515 solver.cpp
\ No newline at end of file
diff --git a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537 b/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537
deleted file mode 100644
index d142f7c0..00000000
--- a/log/caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537
+++ /dev/null
@@ -1,1208 +0,0 @@
-Log file created at: 2015/09/06 13:58:55
-Running on machine: AMD-RESEARCH
-Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
-I0906 13:58:55.707435 16537 caffe.cpp:114] Use GPU with device ID 0
-I0906 13:58:55.745967 16537 device.cpp:230] Number of platforms found:1
-I0906 13:58:55.746011 16537 device.cpp:262] 	CL_PLATFORM_NAME	AMD Accelerated Parallel Processing
-I0906 13:58:55.746028 16537 device.cpp:262] 	CL_PLATFORM_PROFILE	FULL_PROFILE
-I0906 13:58:55.746036 16537 device.cpp:262] 	CL_PLATFORM_VERSION	OpenCL 2.0 AMD-APP.internal (1644.0)
-I0906 13:58:55.746042 16537 device.cpp:262] 	CL_PLATFORM_VENDOR	Advanced Micro Devices, Inc.
-I0906 13:58:55.746048 16537 device.cpp:262] 	CL_PLATFORM_EXTENSIONS	cl_khr_icd cl_amd_object_metadata cl_amd_event_callback cl_amd_offline_devices 
-I0906 13:58:55.746059 16537 device.cpp:286] Number of devices found:1
-I0906 13:58:55.746064 16537 device.cpp:288] 	DeviceID:	0x18262f0
-I0906 13:58:55.746088 16537 device.cpp:366] 	 Device Type:	CL_DEVICE_TYPE_GPU
-I0906 13:58:55.746098 16537 device.cpp:393] 	Is it integrated GPU?:	0
-I0906 13:58:55.746105 16537 device.cpp:393] 	Max clock frequency MHz:	930
-I0906 13:58:55.746111 16537 device.cpp:393] 	Host-Device unified mem:	0
-I0906 13:58:55.746117 16537 device.cpp:393] 	ECC support:	0
-I0906 13:58:55.746124 16537 device.cpp:393] 	Endian little:	1
-I0906 13:58:55.746130 16537 device.cpp:393] 	Max compute units:	44
-I0906 13:58:55.746136 16537 device.cpp:393] 	Max work group size:	256
-I0906 13:58:55.746145 16537 device.cpp:393] 	Max work item dimensions:	3
-I0906 13:58:55.746151 16537 device.cpp:393] 	Max work item sizes:	0x100
-I0906 13:58:55.746160 16537 device.cpp:389] 	 CL_DEVICE_QUEUE_PROPERTIES:	CL_QUEUE_PROFILING_ENABLE
-I0906 13:58:55.746167 16537 device.cpp:378] 	 CL_DEVICE_EXECUTION_CAPABILITIES:	CL_EXEC_KERNEL
-I0906 13:58:55.746173 16537 device.cpp:393] 	Max mem alloc size:	4244635648
-I0906 13:58:55.746179 16537 device.cpp:393] 	Global mem size:	16878927872
-I0906 13:58:55.746186 16537 device.cpp:393] 	Local mem size:	32768
-I0906 13:58:55.746198 16537 device.cpp:96] Picked device type : GPU 0
-I0906 13:58:58.131669 16537 device.cpp:152] Build Program
-I0906 13:58:58.131891 16537 caffe.cpp:122] Starting Optimization
-I0906 13:58:58.132027 16537 solver.cpp:40] Initializing solver from parameters: 
-test_iter: 1
-test_interval: 1000
-base_lr: 0.01
-display: 1
-max_iter: 10
-lr_policy: "step"
-gamma: 0.1
-momentum: 0.9
-weight_decay: 0.0005
-stepsize: 100000
-snapshot: 10000
-snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train"
-solver_mode: GPU
-net: "models/bvlc_alexnet/train_val_without_dropout.prototxt"
-I0906 13:58:58.132150 16537 solver.cpp:81] Creating training net from net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
-I0906 13:58:58.133236 16537 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer data
-I0906 13:58:58.133285 16537 net.cpp:288] The NetState phase (0) differed from the phase (1) specified by a rule in layer accuracy
-I0906 13:58:58.133460 16537 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TRAIN
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TRAIN
-  }
-  transform_param {
-    mirror: true
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb"
-    batch_size: 100
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:58:58.133894 16537 net.cpp:68] Memory required for data: 0
-I0906 13:58:58.134050 16537 layer_factory.hpp:74] Creating layer data
-I0906 13:58:58.134104 16537 net.cpp:91] Creating Layer data
-I0906 13:58:58.134125 16537 net.cpp:369] data -> data
-I0906 13:58:58.134229 16537 net.cpp:369] data -> label
-I0906 13:58:58.134253 16537 net.cpp:121] Setting up data
-I0906 13:58:58.134266 16537 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:58:58.143668 16537 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb
-I0906 13:58:58.144057 16537 data_layer.cpp:53] output data size: 100,3,227,227
-I0906 13:58:58.175259 16537 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:58:58.175475 16537 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:58:58.175534 16537 net.cpp:128] Top shape: 100 3 227 227 (15458700)
-I0906 13:58:58.175544 16537 net.cpp:128] Top shape: 100 (100)
-I0906 13:58:58.175547 16537 net.cpp:134] Memory required for data: 61835200
-I0906 13:58:58.175582 16537 layer_factory.hpp:74] Creating layer conv1
-I0906 13:58:58.175659 16537 net.cpp:91] Creating Layer conv1
-I0906 13:58:58.175683 16537 net.cpp:411] conv1 <- data
-I0906 13:58:58.175760 16537 net.cpp:369] conv1 -> conv1
-I0906 13:58:58.175793 16537 net.cpp:121] Setting up conv1
-I0906 13:58:58.180706 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:58:58.180712 16537 net.cpp:134] Memory required for data: 177995200
-I0906 13:58:58.180752 16537 layer_factory.hpp:74] Creating layer relu1
-I0906 13:58:58.180774 16537 net.cpp:91] Creating Layer relu1
-I0906 13:58:58.180780 16537 net.cpp:411] relu1 <- conv1
-I0906 13:58:58.180794 16537 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:58:58.180804 16537 net.cpp:121] Setting up relu1
-I0906 13:58:58.180811 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:58:58.180815 16537 net.cpp:134] Memory required for data: 294155200
-I0906 13:58:58.180821 16537 layer_factory.hpp:74] Creating layer norm1
-I0906 13:58:58.180848 16537 net.cpp:91] Creating Layer norm1
-I0906 13:58:58.180855 16537 net.cpp:411] norm1 <- conv1
-I0906 13:58:58.180867 16537 net.cpp:369] norm1 -> norm1
-I0906 13:58:58.180881 16537 net.cpp:121] Setting up norm1
-I0906 13:58:58.180905 16537 net.cpp:128] Top shape: 100 96 55 55 (29040000)
-I0906 13:58:58.180909 16537 net.cpp:134] Memory required for data: 410315200
-I0906 13:58:58.180915 16537 layer_factory.hpp:74] Creating layer pool1
-I0906 13:58:58.180938 16537 net.cpp:91] Creating Layer pool1
-I0906 13:58:58.180944 16537 net.cpp:411] pool1 <- norm1
-I0906 13:58:58.180958 16537 net.cpp:369] pool1 -> pool1
-I0906 13:58:58.180970 16537 net.cpp:121] Setting up pool1
-I0906 13:58:58.180999 16537 net.cpp:128] Top shape: 100 96 27 27 (6998400)
-I0906 13:58:58.181004 16537 net.cpp:134] Memory required for data: 438308800
-I0906 13:58:58.181008 16537 layer_factory.hpp:74] Creating layer conv2
-I0906 13:58:58.181023 16537 net.cpp:91] Creating Layer conv2
-I0906 13:58:58.181030 16537 net.cpp:411] conv2 <- pool1
-I0906 13:58:58.181044 16537 net.cpp:369] conv2 -> conv2
-I0906 13:58:58.181056 16537 net.cpp:121] Setting up conv2
-I0906 13:58:58.221200 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:58:58.221215 16537 net.cpp:134] Memory required for data: 512958400
-I0906 13:58:58.221245 16537 layer_factory.hpp:74] Creating layer relu2
-I0906 13:58:58.221267 16537 net.cpp:91] Creating Layer relu2
-I0906 13:58:58.221277 16537 net.cpp:411] relu2 <- conv2
-I0906 13:58:58.221297 16537 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:58:58.221312 16537 net.cpp:121] Setting up relu2
-I0906 13:58:58.221320 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:58:58.221324 16537 net.cpp:134] Memory required for data: 587608000
-I0906 13:58:58.221329 16537 layer_factory.hpp:74] Creating layer norm2
-I0906 13:58:58.221346 16537 net.cpp:91] Creating Layer norm2
-I0906 13:58:58.221352 16537 net.cpp:411] norm2 <- conv2
-I0906 13:58:58.221366 16537 net.cpp:369] norm2 -> norm2
-I0906 13:58:58.221379 16537 net.cpp:121] Setting up norm2
-I0906 13:58:58.221397 16537 net.cpp:128] Top shape: 100 256 27 27 (18662400)
-I0906 13:58:58.221402 16537 net.cpp:134] Memory required for data: 662257600
-I0906 13:58:58.221407 16537 layer_factory.hpp:74] Creating layer pool2
-I0906 13:58:58.221429 16537 net.cpp:91] Creating Layer pool2
-I0906 13:58:58.221436 16537 net.cpp:411] pool2 <- norm2
-I0906 13:58:58.221448 16537 net.cpp:369] pool2 -> pool2
-I0906 13:58:58.221460 16537 net.cpp:121] Setting up pool2
-I0906 13:58:58.221480 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:58:58.221484 16537 net.cpp:134] Memory required for data: 679563200
-I0906 13:58:58.221534 16537 layer_factory.hpp:74] Creating layer conv3
-I0906 13:58:58.221555 16537 net.cpp:91] Creating Layer conv3
-I0906 13:58:58.221561 16537 net.cpp:411] conv3 <- pool2
-I0906 13:58:58.221576 16537 net.cpp:369] conv3 -> conv3
-I0906 13:58:58.221592 16537 net.cpp:121] Setting up conv3
-I0906 13:58:58.338774 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:58:58.338798 16537 net.cpp:134] Memory required for data: 705521600
-I0906 13:58:58.338841 16537 layer_factory.hpp:74] Creating layer relu3
-I0906 13:58:58.338876 16537 net.cpp:91] Creating Layer relu3
-I0906 13:58:58.338891 16537 net.cpp:411] relu3 <- conv3
-I0906 13:58:58.338918 16537 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:58:58.338935 16537 net.cpp:121] Setting up relu3
-I0906 13:58:58.338944 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:58:58.338948 16537 net.cpp:134] Memory required for data: 731480000
-I0906 13:58:58.338953 16537 layer_factory.hpp:74] Creating layer conv4
-I0906 13:58:58.338979 16537 net.cpp:91] Creating Layer conv4
-I0906 13:58:58.338985 16537 net.cpp:411] conv4 <- conv3
-I0906 13:58:58.339002 16537 net.cpp:369] conv4 -> conv4
-I0906 13:58:58.339017 16537 net.cpp:121] Setting up conv4
-I0906 13:58:58.369153 16541 data_layer.cpp:120] Prefetch batch: 193 ms.
-I0906 13:58:58.369201 16541 data_layer.cpp:121]      Read time: 23.991 ms.
-I0906 13:58:58.369210 16541 data_layer.cpp:122] Transform time: 167.322 ms.
-I0906 13:58:58.426654 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:58:58.426676 16537 net.cpp:134] Memory required for data: 757438400
-I0906 13:58:58.426703 16537 layer_factory.hpp:74] Creating layer relu4
-I0906 13:58:58.426735 16537 net.cpp:91] Creating Layer relu4
-I0906 13:58:58.426749 16537 net.cpp:411] relu4 <- conv4
-I0906 13:58:58.426776 16537 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:58:58.426794 16537 net.cpp:121] Setting up relu4
-I0906 13:58:58.426802 16537 net.cpp:128] Top shape: 100 384 13 13 (6489600)
-I0906 13:58:58.426806 16537 net.cpp:134] Memory required for data: 783396800
-I0906 13:58:58.426811 16537 layer_factory.hpp:74] Creating layer conv5
-I0906 13:58:58.426838 16537 net.cpp:91] Creating Layer conv5
-I0906 13:58:58.426843 16537 net.cpp:411] conv5 <- conv4
-I0906 13:58:58.426858 16537 net.cpp:369] conv5 -> conv5
-I0906 13:58:58.426873 16537 net.cpp:121] Setting up conv5
-I0906 13:58:58.484124 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:58:58.484143 16537 net.cpp:134] Memory required for data: 800702400
-I0906 13:58:58.484182 16537 layer_factory.hpp:74] Creating layer relu5
-I0906 13:58:58.484212 16537 net.cpp:91] Creating Layer relu5
-I0906 13:58:58.484225 16537 net.cpp:411] relu5 <- conv5
-I0906 13:58:58.484251 16537 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:58:58.484266 16537 net.cpp:121] Setting up relu5
-I0906 13:58:58.484274 16537 net.cpp:128] Top shape: 100 256 13 13 (4326400)
-I0906 13:58:58.484278 16537 net.cpp:134] Memory required for data: 818008000
-I0906 13:58:58.484282 16537 layer_factory.hpp:74] Creating layer pool5
-I0906 13:58:58.484302 16537 net.cpp:91] Creating Layer pool5
-I0906 13:58:58.484308 16537 net.cpp:411] pool5 <- conv5
-I0906 13:58:58.484321 16537 net.cpp:369] pool5 -> pool5
-I0906 13:58:58.484335 16537 net.cpp:121] Setting up pool5
-I0906 13:58:58.484355 16537 net.cpp:128] Top shape: 100 256 6 6 (921600)
-I0906 13:58:58.484359 16537 net.cpp:134] Memory required for data: 821694400
-I0906 13:58:58.484364 16537 layer_factory.hpp:74] Creating layer fc6
-I0906 13:58:58.484400 16537 net.cpp:91] Creating Layer fc6
-I0906 13:58:58.484405 16537 net.cpp:411] fc6 <- pool5
-I0906 13:58:58.484421 16537 net.cpp:369] fc6 -> fc6
-I0906 13:58:58.484434 16537 net.cpp:121] Setting up fc6
-I0906 13:59:03.394265 16537 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:59:03.394289 16537 net.cpp:134] Memory required for data: 823332800
-I0906 13:59:03.394316 16537 layer_factory.hpp:74] Creating layer relu6
-I0906 13:59:03.394362 16537 net.cpp:91] Creating Layer relu6
-I0906 13:59:03.394378 16537 net.cpp:411] relu6 <- fc6
-I0906 13:59:03.394405 16537 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:59:03.394472 16537 net.cpp:121] Setting up relu6
-I0906 13:59:03.394482 16537 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:59:03.394486 16537 net.cpp:134] Memory required for data: 824971200
-I0906 13:59:03.394492 16537 layer_factory.hpp:74] Creating layer fc7
-I0906 13:59:03.394515 16537 net.cpp:91] Creating Layer fc7
-I0906 13:59:03.394521 16537 net.cpp:411] fc7 <- fc6
-I0906 13:59:03.394537 16537 net.cpp:369] fc7 -> fc7
-I0906 13:59:03.394558 16537 net.cpp:121] Setting up fc7
-I0906 13:59:05.554731 16537 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:59:05.554755 16537 net.cpp:134] Memory required for data: 826609600
-I0906 13:59:05.554782 16537 layer_factory.hpp:74] Creating layer relu7
-I0906 13:59:05.554815 16537 net.cpp:91] Creating Layer relu7
-I0906 13:59:05.554829 16537 net.cpp:411] relu7 <- fc7
-I0906 13:59:05.554855 16537 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:59:05.554870 16537 net.cpp:121] Setting up relu7
-I0906 13:59:05.554879 16537 net.cpp:128] Top shape: 100 4096 (409600)
-I0906 13:59:05.554883 16537 net.cpp:134] Memory required for data: 828248000
-I0906 13:59:05.554888 16537 layer_factory.hpp:74] Creating layer fc8
-I0906 13:59:05.554911 16537 net.cpp:91] Creating Layer fc8
-I0906 13:59:05.554916 16537 net.cpp:411] fc8 <- fc7
-I0906 13:59:05.554932 16537 net.cpp:369] fc8 -> fc8
-I0906 13:59:05.554946 16537 net.cpp:121] Setting up fc8
-I0906 13:59:06.080322 16537 net.cpp:128] Top shape: 100 1000 (100000)
-I0906 13:59:06.080343 16537 net.cpp:134] Memory required for data: 828648000
-I0906 13:59:06.080370 16537 layer_factory.hpp:74] Creating layer loss
-I0906 13:59:06.080420 16537 net.cpp:91] Creating Layer loss
-I0906 13:59:06.080435 16537 net.cpp:411] loss <- fc8
-I0906 13:59:06.080457 16537 net.cpp:411] loss <- label
-I0906 13:59:06.080476 16537 net.cpp:369] loss -> loss
-I0906 13:59:06.080497 16537 net.cpp:121] Setting up loss
-I0906 13:59:06.080515 16537 layer_factory.hpp:74] Creating layer loss
-I0906 13:59:06.081025 16537 net.cpp:128] Top shape: (1)
-I0906 13:59:06.081030 16537 net.cpp:130]     with loss weight 1
-I0906 13:59:06.081048 16537 net.cpp:134] Memory required for data: 828648004
-I0906 13:59:06.081055 16537 net.cpp:193] loss needs backward computation.
-I0906 13:59:06.081063 16537 net.cpp:193] fc8 needs backward computation.
-I0906 13:59:06.081069 16537 net.cpp:193] relu7 needs backward computation.
-I0906 13:59:06.081074 16537 net.cpp:193] fc7 needs backward computation.
-I0906 13:59:06.081080 16537 net.cpp:193] relu6 needs backward computation.
-I0906 13:59:06.081086 16537 net.cpp:193] fc6 needs backward computation.
-I0906 13:59:06.081091 16537 net.cpp:193] pool5 needs backward computation.
-I0906 13:59:06.081097 16537 net.cpp:193] relu5 needs backward computation.
-I0906 13:59:06.081102 16537 net.cpp:193] conv5 needs backward computation.
-I0906 13:59:06.081109 16537 net.cpp:193] relu4 needs backward computation.
-I0906 13:59:06.081114 16537 net.cpp:193] conv4 needs backward computation.
-I0906 13:59:06.081120 16537 net.cpp:193] relu3 needs backward computation.
-I0906 13:59:06.081125 16537 net.cpp:193] conv3 needs backward computation.
-I0906 13:59:06.081132 16537 net.cpp:193] pool2 needs backward computation.
-I0906 13:59:06.081138 16537 net.cpp:193] norm2 needs backward computation.
-I0906 13:59:06.081145 16537 net.cpp:193] relu2 needs backward computation.
-I0906 13:59:06.081149 16537 net.cpp:193] conv2 needs backward computation.
-I0906 13:59:06.081156 16537 net.cpp:193] pool1 needs backward computation.
-I0906 13:59:06.081161 16537 net.cpp:193] norm1 needs backward computation.
-I0906 13:59:06.081167 16537 net.cpp:193] relu1 needs backward computation.
-I0906 13:59:06.081173 16537 net.cpp:193] conv1 needs backward computation.
-I0906 13:59:06.081181 16537 net.cpp:195] data does not need backward computation.
-I0906 13:59:06.081187 16537 net.cpp:236] This network produces output loss
-I0906 13:59:06.081223 16537 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:59:06.081238 16537 net.cpp:248] Network initialization done.
-I0906 13:59:06.081241 16537 net.cpp:249] Memory required for data: 828648004
-I0906 13:59:06.082168 16537 solver.cpp:165] Creating test net (#0) specified by net file: models/bvlc_alexnet/train_val_without_dropout.prototxt
-I0906 13:59:06.082299 16537 net.cpp:288] The NetState phase (1) differed from the phase (0) specified by a rule in layer data
-I0906 13:59:06.082527 16537 net.cpp:43] Initializing net from parameters: 
-name: "AlexNet"
-state {
-  phase: TEST
-}
-layer {
-  name: "data"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TEST
-  }
-  transform_param {
-    mirror: false
-    crop_size: 227
-    mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto"
-  }
-  data_param {
-    source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb"
-    batch_size: 50
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 96
-    kernel_size: 11
-    stride: 4
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "norm1"
-  type: "LRN"
-  bottom: "conv1"
-  top: "norm1"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "norm1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 2
-    kernel_size: 5
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu2"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "norm2"
-  type: "LRN"
-  bottom: "conv2"
-  top: "norm2"
-  lrn_param {
-    local_size: 5
-    alpha: 0.0001
-    beta: 0.75
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "norm2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "pool2"
-  top: "conv3"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "relu3"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 384
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu4"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 2
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu5"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "pool5"
-  type: "Pooling"
-  bottom: "conv5"
-  top: "pool5"
-  pooling_param {
-    pool: MAX
-    kernel_size: 3
-    stride: 2
-  }
-}
-layer {
-  name: "fc6"
-  type: "InnerProduct"
-  bottom: "pool5"
-  top: "fc6"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu6"
-  type: "ReLU"
-  bottom: "fc6"
-  top: "fc6"
-}
-layer {
-  name: "fc7"
-  type: "InnerProduct"
-  bottom: "fc6"
-  top: "fc7"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 4096
-    weight_filler {
-      type: "gaussian"
-      std: 0.005
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.1
-    }
-  }
-}
-layer {
-  name: "relu7"
-  type: "ReLU"
-  bottom: "fc7"
-  top: "fc7"
-}
-layer {
-  name: "fc8"
-  type: "InnerProduct"
-  bottom: "fc7"
-  top: "fc8"
-  param {
-    lr_mult: 1
-    decay_mult: 1
-  }
-  param {
-    lr_mult: 2
-    decay_mult: 0
-  }
-  inner_product_param {
-    num_output: 1000
-    weight_filler {
-      type: "gaussian"
-      std: 0.01
-    }
-    bias_filler {
-      type: "constant"
-      value: 0
-    }
-  }
-}
-layer {
-  name: "accuracy"
-  type: "Accuracy"
-  bottom: "fc8"
-  bottom: "label"
-  top: "accuracy"
-  include {
-    phase: TEST
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "fc8"
-  bottom: "label"
-  top: "loss"
-}
-I0906 13:59:06.082866 16537 net.cpp:68] Memory required for data: 0
-I0906 13:59:06.082913 16537 layer_factory.hpp:74] Creating layer data
-I0906 13:59:06.082934 16537 net.cpp:91] Creating Layer data
-I0906 13:59:06.082944 16537 net.cpp:369] data -> data
-I0906 13:59:06.082967 16537 net.cpp:369] data -> label
-I0906 13:59:06.082981 16537 net.cpp:121] Setting up data
-I0906 13:59:06.082988 16537 data_transformer.cpp:22] Loading mean file from: /home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto
-I0906 13:59:06.091397 16537 db_lmdb.cpp:22] Opened lmdb /home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb
-I0906 13:59:06.091647 16537 data_layer.cpp:53] output data size: 50,3,227,227
-I0906 13:59:06.107939 16537 base_data_layer.cpp:43] Initializing prefetch
-I0906 13:59:06.108054 16537 base_data_layer.cpp:45] Prefetch initialized.
-I0906 13:59:06.108088 16537 net.cpp:128] Top shape: 50 3 227 227 (7729350)
-I0906 13:59:06.108098 16537 net.cpp:128] Top shape: 50 (50)
-I0906 13:59:06.108101 16537 net.cpp:134] Memory required for data: 30917600
-I0906 13:59:06.108135 16537 layer_factory.hpp:74] Creating layer label_data_1_split
-I0906 13:59:06.108201 16537 net.cpp:91] Creating Layer label_data_1_split
-I0906 13:59:06.108216 16537 net.cpp:411] label_data_1_split <- label
-I0906 13:59:06.108259 16537 net.cpp:369] label_data_1_split -> label_data_1_split_0
-I0906 13:59:06.108306 16537 net.cpp:369] label_data_1_split -> label_data_1_split_1
-I0906 13:59:06.108319 16537 net.cpp:121] Setting up label_data_1_split
-I0906 13:59:06.108353 16537 net.cpp:128] Top shape: 50 (50)
-I0906 13:59:06.108361 16537 net.cpp:128] Top shape: 50 (50)
-I0906 13:59:06.108364 16537 net.cpp:134] Memory required for data: 30918000
-I0906 13:59:06.108369 16537 layer_factory.hpp:74] Creating layer conv1
-I0906 13:59:06.108403 16537 net.cpp:91] Creating Layer conv1
-I0906 13:59:06.108409 16537 net.cpp:411] conv1 <- data
-I0906 13:59:06.108425 16537 net.cpp:369] conv1 -> conv1
-I0906 13:59:06.108440 16537 net.cpp:121] Setting up conv1
-I0906 13:59:06.113059 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:59:06.113065 16537 net.cpp:134] Memory required for data: 88998000
-I0906 13:59:06.113085 16537 layer_factory.hpp:74] Creating layer relu1
-I0906 13:59:06.113097 16537 net.cpp:91] Creating Layer relu1
-I0906 13:59:06.113103 16537 net.cpp:411] relu1 <- conv1
-I0906 13:59:06.113116 16537 net.cpp:358] relu1 -> conv1 (in-place)
-I0906 13:59:06.113126 16537 net.cpp:121] Setting up relu1
-I0906 13:59:06.113134 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:59:06.113138 16537 net.cpp:134] Memory required for data: 147078000
-I0906 13:59:06.113143 16537 layer_factory.hpp:74] Creating layer norm1
-I0906 13:59:06.113163 16537 net.cpp:91] Creating Layer norm1
-I0906 13:59:06.113169 16537 net.cpp:411] norm1 <- conv1
-I0906 13:59:06.113183 16537 net.cpp:369] norm1 -> norm1
-I0906 13:59:06.113193 16537 net.cpp:121] Setting up norm1
-I0906 13:59:06.113212 16537 net.cpp:128] Top shape: 50 96 55 55 (14520000)
-I0906 13:59:06.113255 16537 net.cpp:134] Memory required for data: 205158000
-I0906 13:59:06.113260 16537 layer_factory.hpp:74] Creating layer pool1
-I0906 13:59:06.113277 16537 net.cpp:91] Creating Layer pool1
-I0906 13:59:06.113282 16537 net.cpp:411] pool1 <- norm1
-I0906 13:59:06.113296 16537 net.cpp:369] pool1 -> pool1
-I0906 13:59:06.113306 16537 net.cpp:121] Setting up pool1
-I0906 13:59:06.113325 16537 net.cpp:128] Top shape: 50 96 27 27 (3499200)
-I0906 13:59:06.113329 16537 net.cpp:134] Memory required for data: 219154800
-I0906 13:59:06.113334 16537 layer_factory.hpp:74] Creating layer conv2
-I0906 13:59:06.113348 16537 net.cpp:91] Creating Layer conv2
-I0906 13:59:06.113354 16537 net.cpp:411] conv2 <- pool1
-I0906 13:59:06.113369 16537 net.cpp:369] conv2 -> conv2
-I0906 13:59:06.113381 16537 net.cpp:121] Setting up conv2
-I0906 13:59:06.154265 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:59:06.154281 16537 net.cpp:134] Memory required for data: 256479600
-I0906 13:59:06.154316 16537 layer_factory.hpp:74] Creating layer relu2
-I0906 13:59:06.154345 16537 net.cpp:91] Creating Layer relu2
-I0906 13:59:06.154355 16537 net.cpp:411] relu2 <- conv2
-I0906 13:59:06.154374 16537 net.cpp:358] relu2 -> conv2 (in-place)
-I0906 13:59:06.154387 16537 net.cpp:121] Setting up relu2
-I0906 13:59:06.154397 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:59:06.154400 16537 net.cpp:134] Memory required for data: 293804400
-I0906 13:59:06.154405 16537 layer_factory.hpp:74] Creating layer norm2
-I0906 13:59:06.154427 16537 net.cpp:91] Creating Layer norm2
-I0906 13:59:06.154433 16537 net.cpp:411] norm2 <- conv2
-I0906 13:59:06.154446 16537 net.cpp:369] norm2 -> norm2
-I0906 13:59:06.154463 16537 net.cpp:121] Setting up norm2
-I0906 13:59:06.154484 16537 net.cpp:128] Top shape: 50 256 27 27 (9331200)
-I0906 13:59:06.154503 16537 net.cpp:134] Memory required for data: 331129200
-I0906 13:59:06.154508 16537 layer_factory.hpp:74] Creating layer pool2
-I0906 13:59:06.154525 16537 net.cpp:91] Creating Layer pool2
-I0906 13:59:06.154531 16537 net.cpp:411] pool2 <- norm2
-I0906 13:59:06.154544 16537 net.cpp:369] pool2 -> pool2
-I0906 13:59:06.154556 16537 net.cpp:121] Setting up pool2
-I0906 13:59:06.154573 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:59:06.154578 16537 net.cpp:134] Memory required for data: 339782000
-I0906 13:59:06.154583 16537 layer_factory.hpp:74] Creating layer conv3
-I0906 13:59:06.154604 16537 net.cpp:91] Creating Layer conv3
-I0906 13:59:06.154610 16537 net.cpp:411] conv3 <- pool2
-I0906 13:59:06.154625 16537 net.cpp:369] conv3 -> conv3
-I0906 13:59:06.154638 16537 net.cpp:121] Setting up conv3
-I0906 13:59:06.204232 16545 data_layer.cpp:120] Prefetch batch: 96 ms.
-I0906 13:59:06.204263 16545 data_layer.cpp:121]      Read time: 12.163 ms.
-I0906 13:59:06.204272 16545 data_layer.cpp:122] Transform time: 82.876 ms.
-I0906 13:59:06.270438 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:59:06.270459 16537 net.cpp:134] Memory required for data: 352761200
-I0906 13:59:06.270499 16537 layer_factory.hpp:74] Creating layer relu3
-I0906 13:59:06.270532 16537 net.cpp:91] Creating Layer relu3
-I0906 13:59:06.270546 16537 net.cpp:411] relu3 <- conv3
-I0906 13:59:06.270571 16537 net.cpp:358] relu3 -> conv3 (in-place)
-I0906 13:59:06.270587 16537 net.cpp:121] Setting up relu3
-I0906 13:59:06.270596 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:59:06.270601 16537 net.cpp:134] Memory required for data: 365740400
-I0906 13:59:06.270606 16537 layer_factory.hpp:74] Creating layer conv4
-I0906 13:59:06.270630 16537 net.cpp:91] Creating Layer conv4
-I0906 13:59:06.270637 16537 net.cpp:411] conv4 <- conv3
-I0906 13:59:06.270651 16537 net.cpp:369] conv4 -> conv4
-I0906 13:59:06.270666 16537 net.cpp:121] Setting up conv4
-I0906 13:59:06.357051 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:59:06.357074 16537 net.cpp:134] Memory required for data: 378719600
-I0906 13:59:06.357100 16537 layer_factory.hpp:74] Creating layer relu4
-I0906 13:59:06.357132 16537 net.cpp:91] Creating Layer relu4
-I0906 13:59:06.357184 16537 net.cpp:411] relu4 <- conv4
-I0906 13:59:06.357210 16537 net.cpp:358] relu4 -> conv4 (in-place)
-I0906 13:59:06.357226 16537 net.cpp:121] Setting up relu4
-I0906 13:59:06.357235 16537 net.cpp:128] Top shape: 50 384 13 13 (3244800)
-I0906 13:59:06.357239 16537 net.cpp:134] Memory required for data: 391698800
-I0906 13:59:06.357244 16537 layer_factory.hpp:74] Creating layer conv5
-I0906 13:59:06.357270 16537 net.cpp:91] Creating Layer conv5
-I0906 13:59:06.357276 16537 net.cpp:411] conv5 <- conv4
-I0906 13:59:06.357292 16537 net.cpp:369] conv5 -> conv5
-I0906 13:59:06.357308 16537 net.cpp:121] Setting up conv5
-I0906 13:59:06.414666 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:59:06.414685 16537 net.cpp:134] Memory required for data: 400351600
-I0906 13:59:06.414727 16537 layer_factory.hpp:74] Creating layer relu5
-I0906 13:59:06.414757 16537 net.cpp:91] Creating Layer relu5
-I0906 13:59:06.414770 16537 net.cpp:411] relu5 <- conv5
-I0906 13:59:06.414794 16537 net.cpp:358] relu5 -> conv5 (in-place)
-I0906 13:59:06.414808 16537 net.cpp:121] Setting up relu5
-I0906 13:59:06.414818 16537 net.cpp:128] Top shape: 50 256 13 13 (2163200)
-I0906 13:59:06.414820 16537 net.cpp:134] Memory required for data: 409004400
-I0906 13:59:06.414825 16537 layer_factory.hpp:74] Creating layer pool5
-I0906 13:59:06.414855 16537 net.cpp:91] Creating Layer pool5
-I0906 13:59:06.414860 16537 net.cpp:411] pool5 <- conv5
-I0906 13:59:06.414875 16537 net.cpp:369] pool5 -> pool5
-I0906 13:59:06.414888 16537 net.cpp:121] Setting up pool5
-I0906 13:59:06.414908 16537 net.cpp:128] Top shape: 50 256 6 6 (460800)
-I0906 13:59:06.414912 16537 net.cpp:134] Memory required for data: 410847600
-I0906 13:59:06.414917 16537 layer_factory.hpp:74] Creating layer fc6
-I0906 13:59:06.414938 16537 net.cpp:91] Creating Layer fc6
-I0906 13:59:06.414944 16537 net.cpp:411] fc6 <- pool5
-I0906 13:59:06.414959 16537 net.cpp:369] fc6 -> fc6
-I0906 13:59:06.414971 16537 net.cpp:121] Setting up fc6
-I0906 13:59:11.292778 16537 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:59:11.292801 16537 net.cpp:134] Memory required for data: 411666800
-I0906 13:59:11.292829 16537 layer_factory.hpp:74] Creating layer relu6
-I0906 13:59:11.292860 16537 net.cpp:91] Creating Layer relu6
-I0906 13:59:11.292876 16537 net.cpp:411] relu6 <- fc6
-I0906 13:59:11.292902 16537 net.cpp:358] relu6 -> fc6 (in-place)
-I0906 13:59:11.292918 16537 net.cpp:121] Setting up relu6
-I0906 13:59:11.292927 16537 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:59:11.292932 16537 net.cpp:134] Memory required for data: 412486000
-I0906 13:59:11.292937 16537 layer_factory.hpp:74] Creating layer fc7
-I0906 13:59:11.292958 16537 net.cpp:91] Creating Layer fc7
-I0906 13:59:11.292964 16537 net.cpp:411] fc7 <- fc6
-I0906 13:59:11.292980 16537 net.cpp:369] fc7 -> fc7
-I0906 13:59:11.292995 16537 net.cpp:121] Setting up fc7
-I0906 13:59:13.449043 16537 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:59:13.449066 16537 net.cpp:134] Memory required for data: 413305200
-I0906 13:59:13.449095 16537 layer_factory.hpp:74] Creating layer relu7
-I0906 13:59:13.449126 16537 net.cpp:91] Creating Layer relu7
-I0906 13:59:13.449141 16537 net.cpp:411] relu7 <- fc7
-I0906 13:59:13.449167 16537 net.cpp:358] relu7 -> fc7 (in-place)
-I0906 13:59:13.449182 16537 net.cpp:121] Setting up relu7
-I0906 13:59:13.449192 16537 net.cpp:128] Top shape: 50 4096 (204800)
-I0906 13:59:13.449195 16537 net.cpp:134] Memory required for data: 414124400
-I0906 13:59:13.449200 16537 layer_factory.hpp:74] Creating layer fc8
-I0906 13:59:13.449223 16537 net.cpp:91] Creating Layer fc8
-I0906 13:59:13.449229 16537 net.cpp:411] fc8 <- fc7
-I0906 13:59:13.449244 16537 net.cpp:369] fc8 -> fc8
-I0906 13:59:13.449270 16537 net.cpp:121] Setting up fc8
-I0906 13:59:13.974771 16537 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:59:13.974793 16537 net.cpp:134] Memory required for data: 414324400
-I0906 13:59:13.974820 16537 layer_factory.hpp:74] Creating layer fc8_fc8_0_split
-I0906 13:59:13.974851 16537 net.cpp:91] Creating Layer fc8_fc8_0_split
-I0906 13:59:13.974911 16537 net.cpp:411] fc8_fc8_0_split <- fc8
-I0906 13:59:13.974939 16537 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_0
-I0906 13:59:13.974962 16537 net.cpp:369] fc8_fc8_0_split -> fc8_fc8_0_split_1
-I0906 13:59:13.974974 16537 net.cpp:121] Setting up fc8_fc8_0_split
-I0906 13:59:13.974992 16537 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:59:13.974998 16537 net.cpp:128] Top shape: 50 1000 (50000)
-I0906 13:59:13.975003 16537 net.cpp:134] Memory required for data: 414724400
-I0906 13:59:13.975006 16537 layer_factory.hpp:74] Creating layer accuracy
-I0906 13:59:13.975038 16537 net.cpp:91] Creating Layer accuracy
-I0906 13:59:13.975044 16537 net.cpp:411] accuracy <- fc8_fc8_0_split_0
-I0906 13:59:13.975054 16537 net.cpp:411] accuracy <- label_data_1_split_0
-I0906 13:59:13.975065 16537 net.cpp:369] accuracy -> accuracy
-I0906 13:59:13.975076 16537 net.cpp:121] Setting up accuracy
-I0906 13:59:13.975092 16537 net.cpp:128] Top shape: (1)
-I0906 13:59:13.975096 16537 net.cpp:134] Memory required for data: 414724404
-I0906 13:59:13.975101 16537 layer_factory.hpp:74] Creating layer loss
-I0906 13:59:13.975112 16537 net.cpp:91] Creating Layer loss
-I0906 13:59:13.975117 16537 net.cpp:411] loss <- fc8_fc8_0_split_1
-I0906 13:59:13.975128 16537 net.cpp:411] loss <- label_data_1_split_1
-I0906 13:59:13.975139 16537 net.cpp:369] loss -> loss
-I0906 13:59:13.975150 16537 net.cpp:121] Setting up loss
-I0906 13:59:13.975160 16537 layer_factory.hpp:74] Creating layer loss
-I0906 13:59:13.975487 16537 net.cpp:128] Top shape: (1)
-I0906 13:59:13.975492 16537 net.cpp:130]     with loss weight 1
-I0906 13:59:13.975507 16537 net.cpp:134] Memory required for data: 414724408
-I0906 13:59:13.975513 16537 net.cpp:193] loss needs backward computation.
-I0906 13:59:13.975520 16537 net.cpp:195] accuracy does not need backward computation.
-I0906 13:59:13.975528 16537 net.cpp:193] fc8_fc8_0_split needs backward computation.
-I0906 13:59:13.975533 16537 net.cpp:193] fc8 needs backward computation.
-I0906 13:59:13.975538 16537 net.cpp:193] relu7 needs backward computation.
-I0906 13:59:13.975544 16537 net.cpp:193] fc7 needs backward computation.
-I0906 13:59:13.975549 16537 net.cpp:193] relu6 needs backward computation.
-I0906 13:59:13.975555 16537 net.cpp:193] fc6 needs backward computation.
-I0906 13:59:13.975560 16537 net.cpp:193] pool5 needs backward computation.
-I0906 13:59:13.975566 16537 net.cpp:193] relu5 needs backward computation.
-I0906 13:59:13.975572 16537 net.cpp:193] conv5 needs backward computation.
-I0906 13:59:13.975577 16537 net.cpp:193] relu4 needs backward computation.
-I0906 13:59:13.975582 16537 net.cpp:193] conv4 needs backward computation.
-I0906 13:59:13.975589 16537 net.cpp:193] relu3 needs backward computation.
-I0906 13:59:13.975594 16537 net.cpp:193] conv3 needs backward computation.
-I0906 13:59:13.975600 16537 net.cpp:193] pool2 needs backward computation.
-I0906 13:59:13.975605 16537 net.cpp:193] norm2 needs backward computation.
-I0906 13:59:13.975611 16537 net.cpp:193] relu2 needs backward computation.
-I0906 13:59:13.975616 16537 net.cpp:193] conv2 needs backward computation.
-I0906 13:59:13.975622 16537 net.cpp:193] pool1 needs backward computation.
-I0906 13:59:13.975628 16537 net.cpp:193] norm1 needs backward computation.
-I0906 13:59:13.975635 16537 net.cpp:193] relu1 needs backward computation.
-I0906 13:59:13.975639 16537 net.cpp:193] conv1 needs backward computation.
-I0906 13:59:13.975646 16537 net.cpp:195] label_data_1_split does not need backward computation.
-I0906 13:59:13.975654 16537 net.cpp:195] data does not need backward computation.
-I0906 13:59:13.975658 16537 net.cpp:236] This network produces output accuracy
-I0906 13:59:13.975664 16537 net.cpp:236] This network produces output loss
-I0906 13:59:13.975702 16537 net.cpp:483] Collecting Learning Rate and Weight Decay.
-I0906 13:59:13.975714 16537 net.cpp:248] Network initialization done.
-I0906 13:59:13.975718 16537 net.cpp:249] Memory required for data: 414724408
-I0906 13:59:13.975903 16537 solver.cpp:53] Solver scaffolding done.
-I0906 13:59:13.976030 16537 solver.cpp:270] Solving AlexNet
-I0906 13:59:13.976050 16537 solver.cpp:271] Learning Rate Policy: step
-I0906 13:59:13.977635 16537 solver.cpp:314] Iteration 0, Testing net (#0)
-I0906 13:59:13.977653 16537 net.cpp:696] Copying source layer data
-I0906 13:59:13.977660 16537 net.cpp:696] Copying source layer conv1
-I0906 13:59:13.980556 16537 net.cpp:696] Copying source layer relu1
-I0906 13:59:13.980595 16537 net.cpp:696] Copying source layer norm1
-I0906 13:59:13.980607 16537 net.cpp:696] Copying source layer pool1
-I0906 13:59:13.980617 16537 net.cpp:696] Copying source layer conv2
-I0906 13:59:13.980785 16537 net.cpp:696] Copying source layer relu2
-I0906 13:59:13.980798 16537 net.cpp:696] Copying source layer norm2
-I0906 13:59:13.980808 16537 net.cpp:696] Copying source layer pool2
-I0906 13:59:13.980818 16537 net.cpp:696] Copying source layer conv3
-I0906 13:59:13.981422 16537 net.cpp:696] Copying source layer relu3
-I0906 13:59:13.981437 16537 net.cpp:696] Copying source layer conv4
-I0906 13:59:13.982098 16537 net.cpp:696] Copying source layer relu4
-I0906 13:59:13.982115 16537 net.cpp:696] Copying source layer conv5
-I0906 13:59:13.982612 16537 net.cpp:696] Copying source layer relu5
-I0906 13:59:13.982626 16537 net.cpp:696] Copying source layer pool5
-I0906 13:59:13.982636 16537 net.cpp:696] Copying source layer fc6
-I0906 13:59:13.993058 16537 net.cpp:696] Copying source layer relu6
-I0906 13:59:13.993091 16537 net.cpp:696] Copying source layer fc7
-I0906 13:59:13.997967 16537 net.cpp:696] Copying source layer relu7
-I0906 13:59:13.997984 16537 net.cpp:696] Copying source layer fc8
-I0906 13:59:13.998755 16537 net.cpp:696] Copying source layer loss
-I0906 13:59:13.998867 16537 base_data_layer.cpp:89] Thread joined
-I0906 13:59:14.003283 16537 base_data_layer.cpp:93] Prefetch copied
-I0906 13:59:14.003650 16537 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:59:14.096194 16546 data_layer.cpp:120] Prefetch batch: 92 ms.
-I0906 13:59:14.096225 16546 data_layer.cpp:121]      Read time: 12.131 ms.
-I0906 13:59:14.096233 16546 data_layer.cpp:122] Transform time: 79.106 ms.
-I0906 13:59:17.032117 16537 solver.cpp:363]     Test net output #0: accuracy = 0
-I0906 13:59:17.032146 16537 solver.cpp:363]     Test net output #1: loss = 6.91124 (* 1 = 6.91124 loss)
-I0906 13:59:17.032196 16537 base_data_layer.cpp:89] Thread joined
-I0906 13:59:17.041095 16537 base_data_layer.cpp:93] Prefetch copied
-I0906 13:59:17.041471 16537 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:59:17.232076 16547 data_layer.cpp:120] Prefetch batch: 190 ms.
-I0906 13:59:17.232108 16547 data_layer.cpp:121]      Read time: 24.399 ms.
-I0906 13:59:17.232116 16547 data_layer.cpp:122] Transform time: 164.272 ms.
-I0906 13:59:23.802855 16537 solver.cpp:234] Iteration 0, loss = 0
-I0906 13:59:23.802914 16537 solver.cpp:249]     Train net output #0: loss = 6.89773 (* 1 = 6.89773 loss)
-I0906 13:59:23.802963 16537 solver.cpp:506] Iteration 0, lr = 0.01
-I0906 13:59:23.918314 16537 base_data_layer.cpp:89] Thread joined
-I0906 13:59:23.926301 16537 base_data_layer.cpp:93] Prefetch copied
-I0906 13:59:23.926447 16537 base_data_layer.cpp:104] CreatePrefetchThread
-I0906 13:59:24.110566 16549 data_layer.cpp:120] Prefetch batch: 183 ms.
-I0906 13:59:24.110599 16549 data_layer.cpp:121]      Read time: 23.839 ms.
-I0906 13:59:24.110605 16549 data_layer.cpp:122] Transform time: 158.415 ms.
-I0906 13:59:26.694295 16537 solver.cpp:234] Iteration 1, loss = 0
diff --git a/log/caffe.INFO b/log/caffe.INFO
deleted file mode 120000
index 65520a80..00000000
--- a/log/caffe.INFO
+++ /dev/null
@@ -1 +0,0 @@
-caffe.AMD-RESEARCH.yugao.log.INFO.20150906-135855.16537
\ No newline at end of file

From 5698e3c53d3f9e9010a9375174d1fe69c5b583cc Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 6 Sep 2015 16:59:36 +0800
Subject: [PATCH 059/124] Ported hdf5_data hdf5_output log and mvn layer

---
 include/caffe/util/ocl_wrapper.hpp     |  21 ++++
 src/caffe/layers/hdf5_data_layer.cpp   |  27 +++++
 src/caffe/layers/hdf5_output_layer.cpp |  16 +++
 src/caffe/layers/log_layer.cpp         |  35 +++++++
 src/caffe/layers/mvn_layer.cpp         | 104 +++++++++++++++++++
 src/caffe/layers/relu_layer.cl         |  22 ----
 src/caffe/ocl/util.cl                  |  77 ++++++++++++++
 src/caffe/util/math_functions.cpp      |  74 +++++++++-----
 src/caffe/util/ocl_wrapper.cpp         | 133 +++++++++++++++++++++++++
 9 files changed, 464 insertions(+), 45 deletions(-)
 delete mode 100644 src/caffe/layers/relu_layer.cl

diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index de188e11..90d22752 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -146,6 +146,27 @@ void kernel_channel_subtract(const int count,
     const int num, const int channels,
     const int spatial_dim, const Dtype* channel_max, Dtype* data);
 
+template <typename Dtype>
+void kernel_powx(const int count, const Dtype* data, const Dtype alpha, Dtype* out);
+
+template <typename Dtype>
+void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_log(const int count, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_add_scalar(const int count, const Dtype data, Dtype* out);
+
 template <typename Dtype>
 void kernel_exp(const int count, const Dtype* data, Dtype* out);
 
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 649dc020..dda29aee 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -160,6 +160,33 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
+  for (int i = 0; i < batch_size; ++i, ++current_row_) {
+    if (current_row_ == hdf_blobs_[0]->shape(0)) {
+      if (num_files_ > 1) {
+        current_file_ += 1;
+        if (current_file_ == num_files_) {
+          current_file_ = 0;
+          if (this->layer_param_.hdf5_data_param().shuffle()) {
+            std::random_shuffle(file_permutation_.begin(),
+                                file_permutation_.end());
+          }
+          DLOG(INFO) << "Looping around to first file.";
+        }
+        LoadHDF5FileData(
+            hdf_filenames_[file_permutation_[current_file_]].c_str());
+      }
+      current_row_ = 0;
+      if (this->layer_param_.hdf5_data_param().shuffle())
+        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+    }
+    for (int j = 0; j < this->layer_param_.top_size(); ++j) {
+      int data_dim = top[j]->count() / top[j]->shape(0);
+      caffe_copy(data_dim,
+          &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
+            * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]);
+    }
+  }
 }
 
 
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index 7d1ca097..bd608e86 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -70,6 +70,22 @@ void HDF5OutputLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  CHECK_GE(bottom.size(), 2);
+  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
+  data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+                     bottom[0]->height(), bottom[0]->width());
+  label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
+                     bottom[1]->height(), bottom[1]->width());
+  const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
+  const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
+
+  for (int i = 0; i < bottom[0]->num(); ++i) {
+    caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim],
+        &data_blob_.mutable_cpu_data()[i * data_datum_dim]);
+    caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim],
+        &label_blob_.mutable_cpu_data()[i * label_datum_dim]);
+  }
+  SaveBlobs();
 }
 
 template <typename Dtype>
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index 9d3977a7..461fd9bf 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -80,11 +80,46 @@ void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const int count = bottom[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
+    caffe_gpu_log(count, bottom_data, top_data);
+  } else {
+    caffe_copy(count, bottom_data, top_data);
+    if (input_scale_ != Dtype(1)) {
+      caffe_gpu_scal(count, input_scale_, top_data);
+    }
+    if (input_shift_ != Dtype(0)) {
+      caffe_gpu_add_scalar(count, input_shift_, top_data);
+    }
+    caffe_gpu_log(count, top_data, top_data);
+  }
+  if (base_scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, base_scale_, top_data);
+  }
 }
 
 template <typename Dtype>
 void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  if (!propagate_down[0]) { return; }
+    const int count = bottom[0]->count();
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    caffe_copy(count, bottom_data, bottom_diff);
+    if (input_scale_ != Dtype(1)) {
+      caffe_gpu_scal(count, input_scale_, bottom_diff);
+    }
+    if (input_shift_ != Dtype(0)) {
+      caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
+    }
+    caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
+    if (backward_num_scale_ != Dtype(1)) {
+      caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
+    }
+    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
 
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index 84701831..cbeeb150 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -137,11 +137,115 @@ void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  int num;
+  if (this->layer_param_.mvn_param().across_channels())
+    num = bottom[0]->num();
+  else
+    num = bottom[0]->num() * bottom[0]->channels();
+
+  int dim = bottom[0]->count() / num;
+
+  if (this->layer_param_.mvn_param().normalize_variance()) {
+    // put the squares of bottom into temp_
+    caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
+        temp_.mutable_gpu_data());
+
+    // computes variance using var(X) = E(X^2) - (EX)^2
+    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
+        sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
+    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
+        sum_multiplier_.gpu_data(), 0.,
+        variance_.mutable_gpu_data());  // E(X^2)
+    caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
+        temp_.mutable_gpu_data());  // (EX)^2
+    caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
+        variance_.mutable_gpu_data());  // variance
+
+    // do mean and variance normalization
+    // subtract mean
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+            temp_.mutable_gpu_data());
+
+    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
+
+    // normalize variance
+    caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
+          variance_.mutable_gpu_data());
+
+    caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
+
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+          variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+          temp_.mutable_gpu_data());
+
+    caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
+  } else {
+    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
+            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
+
+    // subtract mean
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+            temp_.mutable_gpu_data());
+
+    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
+  }
 }
 
 template <typename Dtype>
 void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+
+  int num;
+  if (this->layer_param_.mvn_param().across_channels())
+    num = bottom[0]->num();
+  else
+    num = bottom[0]->num() * bottom[0]->channels();
+
+  int dim = bottom[0]->count() / num;
+
+  if (this->layer_param_.mvn_param().normalize_variance()) {
+    caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
+    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., bottom_diff,
+          sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+          mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+          bottom_diff);
+    caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
+
+    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., top_diff,
+            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+            mean_.gpu_data(), sum_multiplier_.gpu_data(), 1.,
+            bottom_diff);
+
+    caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
+        bottom_diff);
+
+    // put the squares of bottom into temp_
+    caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2),
+        temp_.mutable_gpu_data());
+
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+        variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+        temp_.mutable_gpu_data());
+
+    caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
+  } else {
+    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, top_diff,
+            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+            temp_.mutable_gpu_data());
+    caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff);
+  }
 }
 
 
diff --git a/src/caffe/layers/relu_layer.cl b/src/caffe/layers/relu_layer.cl
deleted file mode 100644
index cebe24cd..00000000
--- a/src/caffe/layers/relu_layer.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-template <class T>
-__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
-        int index = get_global_id(0);
-        if(index < count)
-                out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
-}
-
-//template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float4* in, __global float4* out, float negative_slope);
-template __attribute__ ((mangled_name(ReLUForwardfloat))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
-template __attribute__ ((mangled_name(ReLUForwarddouble))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
-
-template <class T>
-__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
-        int index = get_global_id(0);
-        if(index < count)
-                out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope;
-}
-
-template __attribute__ ((mangled_name(ReLUBackwardfloat))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
-template __attribute__ ((mangled_name(ReLUBackwarddouble))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope);
-
-
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index cda05652..7c907058 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -90,6 +90,62 @@ __kernel void exp (const int num, __global T* data, __global T* out){
 template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out);
 template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out);
 
+template <class T>
+__kernel void kernel_sub(const int count, __global const T* a, __global const T* b, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+    out[index] = a[index] - b[index];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_sub_float))) __kernel void kernel_sub(const int count, __global const float* a, __global const float* b, __global float* out);
+template __attribute__ ((mangled_name(kernel_sub_double))) __kernel void kernel_sub(const int count, __global const double* a, __global const double* b, __global double* out);
+
+template <class T>
+__kernel void kernel_add(const int count, __global const T* a, __global const T* b, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+    out[index] = a[index] + b[index];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_add_float))) __kernel void kernel_add(const int count, __global const float* a, __global const float* b, __global float* out);
+template __attribute__ ((mangled_name(kernel_add_double))) __kernel void kernel_add(const int count, __global const double* a, __global const double* b, __global double* out);
+
+template <class T>
+__kernel void kernel_div(const int count, __global const T* a, __global const T* b, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+    out[index] = a[index] / b[index];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_div_float))) __kernel void kernel_div(const int count, __global const float* a, __global const float* b, __global float* out);
+template __attribute__ ((mangled_name(kernel_div_double))) __kernel void kernel_div(const int count, __global const double* a, __global const double* b, __global double* out);
+
+template <class T>
+__kernel void kernel_mul(const int count, __global const T* a, __global const T* b, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+    out[index] = a[index] * b[index];
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_mul_float))) __kernel void kernel_mul(const int count, __global const float* a, __global const float* b, __global float* out);
+template __attribute__ ((mangled_name(kernel_mul_double))) __kernel void kernel_mul(const int count, __global const double* a, __global const double* b, __global double* out);
+
+
+template <class T>
+__kernel void kernel_powx(const int count, __global const T* data, const T alpha, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+    out[index] = pow(data[index], alpha);
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_powx_float))) __kernel void kernel_powx(const int count, __global const float* data, const float alpha, __global float* out);
+template __attribute__ ((mangled_name(kernel_powx_double))) __kernel void kernel_powx(const int count, __global const double* data, const double alpha, __global double* out);
+
 
 template <class T>
 __kernel void kernel_exp(const int count, __global const T* data, __global T* out) {
@@ -102,6 +158,27 @@ __kernel void kernel_exp(const int count, __global const T* data, __global T* ou
 template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out);
 template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out);
 
+template <class T>
+__kernel void kernel_add_scalar(const int count, const T data, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+     out[index] = out[index] + data;
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out);
+template __attribute__ ((mangled_name(kernel_add_scalar__double))) __kernel void kernel_add_scalar(const int count, const double data, __global double* out);
+
+template <class T>
+__kernel void kernel_log(const int count, __global const T* data, __global T* out) {
+ int index = get_global_id(0);
+   if(index < count) {
+     out[index] = log(data[index]);
+  }
+}
+
+template __attribute__ ((mangled_name(kernel_log_float))) __kernel void kernel_log(const int count, __global const float* data, __global float* out);
+template __attribute__ ((mangled_name(kernel_log_double))) __kernel void kernel_log(const int count, __global const double* data, __global double* out);
 
 template <class T>
 __kernel void diff (const int num, const int dim, __global T* data, __global T* label){
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 6b2276ca..34442442 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -653,31 +653,24 @@ void caffe_gpu_set(const int N, const double alpha, double* Y) {
   }
 }
 
-template <typename Dtype>
-void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) {
-}
-
 template <>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
+  kernel_add_scalar(N, alpha, Y);
 }
 
 template <>
 void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
-}
-
-template <typename Dtype>
-void mul_kernel(const int n, const Dtype* a,
-    const Dtype* b, Dtype* y) {
+  kernel_add_scalar(N, alpha, Y);
 }
 
 template <>
 void caffe_gpu_exp<float>(const int N, const float* a, float* y) {
-    kernel_exp(N, a, y);
+  kernel_exp(N, a, y);
 }
 
 template <>
 void caffe_gpu_exp<double>(const int N, const double* a, double* y) {
-    kernel_exp(N, a, y);
+  kernel_exp(N, a, y);
 }
 
 template<>
@@ -690,9 +683,24 @@ void caffe_gpu_sign<double>(const int N, const double *X, double *Y){
    caffe_gpu_sign_ocl(N, X, Y);
 }
 
+template <>
+void caffe_gpu_sub<float>(const int N, const float* a, const float* b,
+    float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_sub(N, a, b, y);
+}
+
+template <>
+void caffe_gpu_sub<double>(const int N, const double* a, const double* b,
+    double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_sub(N, a, b, y);
+}
+
 template <>
 void caffe_gpu_mul<float>(const int N, const float* a,
     const float* b, float* y) {
+  kernel_mul(N, a, b, y);
 }
 
 template <>
@@ -700,26 +708,31 @@ void caffe_gpu_mul<double>(const int N, const double* a,
     const double* b, double* y) {
 }
 
-template <typename Dtype>
-void div_kernel(const int n, const Dtype* a,
-    const Dtype* b, Dtype* y) {
-}
-
 template <>
 void caffe_gpu_div<float>(const int N, const float* a,
     const float* b, float* y) {
+  kernel_div(N, a, b, y);
 }
 
 template <>
 void caffe_gpu_div<double>(const int N, const double* a,
     const double* b, double* y) {
+  kernel_div(N, a, b, y);
 }
 
-template <typename Dtype>
-void powx_kernel(const int n, const Dtype* a,
-    const Dtype alpha, Dtype* y) {
+template <>
+void caffe_gpu_powx<float>(const int N, const float* a,
+    const float alpha, float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_powx(N, a, alpha, y);
 }
 
+template <>
+void caffe_gpu_powx<double>(const int N, const double* a,
+    const double alpha, double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_powx(N, a, alpha, y);
+}
 
 void popc_kernel(const int n, const float* a,
     const float* b, uint8_t* y) {
@@ -763,6 +776,23 @@ void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
                             double* r) {
 }
 
+template <>
+void caffe_gpu_log<float>(const int N, const float* a, float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_log(N, a, y);
+}
+
+template <>
+void caffe_gpu_log<double>(const int N, const double* a, double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_log(N, a, y);
+}
+
+
+
+
+
+
 template <>
 void caffe_log<float>(const int n, const float* a, float* y) {
   vsLn(n, a, y);
@@ -809,16 +839,14 @@ template <>
 void caffe_gpu_add<float>(const int N, const float* a, const float* b,
     float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
- // add_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-   //   N, a, b, y);
+  kernel_add(N, a, b, y);
 }
 
 template <>
 void caffe_gpu_add<double>(const int N, const double* a, const double* b,
     double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
- // add_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-   //   N, a, b, y);
+  kernel_add(N, a, b, y);
 }
 
 template <>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index f4b43acf..14caf874 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -205,6 +205,139 @@ template void kernel_channel_subtract<double>( const int count,
     const int num, const int channels,
     const int spatial_dim, const double* channel_max, double* data);
 
+template <typename Dtype>
+void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out)
+{
+    std::string kernel_name = "kernel_mul" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {(size_t)count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_mul<float>(const int count, const float* a, const float* b, float* out);
+template void kernel_mul<double>(const int count, const double* a, const double* b, double* out);
+
+template <typename Dtype>
+void kernel_add_scalar(const int count, const Dtype data, Dtype* out)
+{
+    std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {(size_t)count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_add_scalar<float>(const int count, const float data, float* out);
+template void kernel_add_scalar<double>(const int count, const double data, double* out);
+
+
+template <typename Dtype>
+void kernel_powx(const int count, const Dtype* data, const Dtype alpha, Dtype* out)
+{
+    std::string kernel_name = "kernel_powx" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {(size_t)count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_powx<float>(const int count, const float* data, const float alpha, float* out);
+template void kernel_powx<double>(const int count, const double* data, const double alpha, double* out);
+
+template <typename Dtype>
+void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out)
+{
+    std::string kernel_name = "kernel_div" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {(size_t)count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_div<float>(const int count, const float* a, const float* b, float* out);
+template void kernel_div<double>(const int count, const double* a, const double* b, double* out);
+
+template <typename Dtype>
+void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out)
+{
+    std::string kernel_name = "kernel_add" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {(size_t)count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_add<float>(const int count, const float* a, const float* b, float* out);
+template void kernel_add<double>(const int count, const double* a, const double* b, double* out);
+
+template <typename Dtype>
+void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out)
+{
+    std::string kernel_name = "kernel_sub" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) );
+    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {(size_t)count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_sub<float>(const int count, const float* a, const float* b, float* out);
+template void kernel_sub<double>(const int count, const double* a, const double* b, double* out);
+
+template <typename Dtype>
+void kernel_log(const int count, const Dtype* data, Dtype* out)
+{
+    std::string kernel_name = "kernel_log" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
+    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
+    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
+
+    size_t Global_Work_Size[1] = {(size_t)count};
+    size_t Local_Work_Size[1] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+}
+
+template void kernel_log<float>(const int count, const float* data, float* out);
+template void kernel_log<double>(const int count, const double* data, double* out);
+
+
 template <typename Dtype>
 void kernel_exp(const int count, const Dtype* data, Dtype* out)
 {

From 0ccf6587350ad1084730643a3b322040f92698cd Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 6 Sep 2015 19:54:30 +0800
Subject: [PATCH 060/124] Port absval_layer bnll_layer concat_layer
 contrastive_loss_layer deconv_layer eltwise_layer euclidean_loss_layer
 exp_layer & filter_layer

---
 include/caffe/common.hpp                    |   2 +-
 include/caffe/util/ocl_wrapper.hpp          |  25 ++++
 src/caffe/layers/absval_layer.cpp           |  11 ++
 src/caffe/layers/bnll_layer.cpp             |  18 ++-
 src/caffe/layers/concat_layer.cpp           |  32 +++-
 src/caffe/layers/contrastive_loss_layer.cpp |  62 +++++++-
 src/caffe/layers/deconv_layer.cpp           |  46 +++++-
 src/caffe/layers/eltwise_layer.cpp          |  78 +++++++++-
 src/caffe/layers/euclidean_loss_layer.cpp   |  28 +++-
 src/caffe/layers/filter_layer.cpp           |  54 ++++++-
 src/caffe/ocl/bnll_layer.cl                 |  26 ++++
 src/caffe/ocl/concat_layer.cl               |  28 ++++
 src/caffe/ocl/contrastive_loss_layer.cl     |  38 +++++
 src/caffe/ocl/eltwise_layer.cl              |  47 ++++++
 src/caffe/util/ocl_wrapper.cpp              | 155 ++++++++++++++++++++
 15 files changed, 629 insertions(+), 21 deletions(-)
 create mode 100644 src/caffe/ocl/bnll_layer.cl
 create mode 100644 src/caffe/ocl/concat_layer.cl
 create mode 100644 src/caffe/ocl/contrastive_loss_layer.cl
 create mode 100644 src/caffe/ocl/eltwise_layer.cl

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 8c738ca3..c5bf909d 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -81,7 +81,7 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-#define use_packing_scheme 0 
+#define use_packing_scheme 1 
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
 #define global_packing_N 16
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 90d22752..d644d16a 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -223,6 +223,31 @@ void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y);
 
 template <typename Dtype>
 void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void  BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
+
+template <typename Dtype>
+void  BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff);
+
+template <typename Dtype>
+void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int  concat_size,
+        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data);
+
+template <typename Dtype>
+void CLLBackward(const int count, const int channels,
+    const Dtype margin, const bool legacy_version, const Dtype alpha,
+    const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
+    Dtype *bottom_diff);
+
+template <typename Dtype>
+void MaxForward(const int nthreads, const Dtype* bottom_data_a,
+    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+    int* mask);
+
+template <typename Dtype>
+void MaxBackward(const int nthreads, const Dtype* top_diff,
+    const int blob_idx, const int* mask, Dtype* bottom_diff);
 }
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
   // namespace caffe
diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index 30422737..12776eb8 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -38,11 +38,22 @@ void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const int count = top[0]->count();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
 }
 
 template <typename Dtype>
 void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  const int count = top[0]->count();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    caffe_gpu_sign(count, bottom_data, bottom_diff);
+    caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index 09e2bc89..3fe6f42e 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -40,12 +40,26 @@ void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  BNLLForward(count, bottom_data, top_data);
 }
 
 template <typename Dtype>
 void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    BNLLBackward(count, top_diff, bottom_data, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 6af287a9..d1d0e927 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -90,14 +90,42 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+      const vector<Blob<Dtype>*>& top) {
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  int offset_concat_axis = 0;
+  const int top_concat_axis = top[0]->shape(concat_axis_);
+  const bool kForward = true;
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
+    const int nthreads = bottom_concat_size * num_concats_;
+    Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_,
+        top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
+    offset_concat_axis += bottom_concat_axis;
+  }
 }
 
 template <typename Dtype>
 void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  int offset_concat_axis = 0;
+  const int top_concat_axis = top[0]->shape(concat_axis_);
+  const bool kForward = false;
+  for (int i = 0; i < bottom.size(); ++i) {
+    if (!propagate_down[i]) { continue; }
+    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
+    const int nthreads = bottom_concat_size * num_concats_;
+    Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_,
+        top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
+    offset_concat_axis += bottom_concat_axis;
+  }
 }
 
+
 #ifdef CPU_ONLY
 STUB_GPU(ConcatLayer);
 #endif
diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index aad4cab3..4b47eb42 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -112,13 +112,69 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 template <typename Dtype>
-void ContrastiveLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+void ContrastiveLossLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const int count = bottom[0]->count();
+  caffe_gpu_sub(
+      count,
+      bottom[0]->gpu_data(),  // a
+      bottom[1]->gpu_data(),  // b
+      diff_.mutable_gpu_data());  // a_i-b_i
+  caffe_gpu_powx(
+      count,
+      diff_.mutable_gpu_data(),  // a_i-b_i
+      Dtype(2),
+      diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
+  caffe_gpu_gemv(
+      CblasNoTrans,
+      bottom[0]->num(),
+      bottom[0]->channels(),
+      Dtype(1.0),
+      diff_sq_.gpu_data(),  // (a_i-b_i)^2
+      summer_vec_.gpu_data(),
+      Dtype(0.0),
+      dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
+  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
+  Dtype loss(0.0);
+  for (int i = 0; i < bottom[0]->num(); ++i) {
+    if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
+      loss += dist_sq_.cpu_data()[i];
+    } else {  // dissimilar pairs
+      if (legacy_version) {
+        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+      } else {
+        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
+        loss += dist*dist;
+      }
+    }
+  }
+  loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
 }
 
 template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < 2; ++i) {
+    if (propagate_down[i]) {
+      const int count = bottom[0]->count();
+      const int channels = bottom[0]->channels();
+      Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+      const bool legacy_version =
+          this->layer_param_.contrastive_loss_param().legacy_version();
+      const Dtype sign = (i == 0) ? 1 : -1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+          static_cast<Dtype>(bottom[0]->num());
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      CLLBackward(count, channels, margin, legacy_version, alpha,
+          bottom[2]->gpu_data(),  // pair similarity 0 or 1
+          diff_.gpu_data(),  // the cached eltwise difference between a and b
+          dist_sq_.gpu_data(),  // the cached square distance between a and b
+          bottom[i]->mutable_gpu_diff());
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index e8937238..ad9a690e 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -71,16 +71,56 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+      const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    Dtype* top_data = top[i]->mutable_gpu_data();
+    for (int n = 0; n < this->num_; ++n) {
+      this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
+          top_data + top[i]->offset(n));
+      if (this->bias_term_) {
+        const Dtype* bias = this->blobs_[1]->gpu_data();
+        this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
+      }
+    }
+  }
 }
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+      for (int n = 0; n < this->num_; ++n) {
+        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      for (int n = 0; n < this->num_; ++n) {
+        // gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          this->weight_gpu_gemm(top_diff + top[i]->offset(n),
+              bottom_data + bottom[i]->offset(n), weight_diff);
+        }
+        // gradient w.r.t. bottom data, if necessary.
+        if (propagate_down[i]) {
+          this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
+              bottom_diff + bottom[i]->offset(n));
+        }
+      }
+    }
+  }
 }
 
 
-
 #ifdef CPU_ONLY
 STUB_GPU(DeconvolutionLayer);
 #endif
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index cffc743d..61417f8c 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -153,16 +153,88 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+    const vector<Blob<Dtype>*>& top) {
+  int* mask = NULL;
+  const int count = top[0]->count();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  switch (op_) {
+  case EltwiseParameter_EltwiseOp_PROD:
+    caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
+        top_data);
+    for (int i = 2; i < bottom.size(); ++i) {
+      caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data);
+    }
+    break;
+  case EltwiseParameter_EltwiseOp_SUM:
+    caffe_gpu_set(count, Dtype(0.), top_data);
+    // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
+    for (int i = 0; i < bottom.size(); ++i) {
+      caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data);
+    }
+    break;
+  case EltwiseParameter_EltwiseOp_MAX:
+    mask = max_idx_.mutable_gpu_data();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
+    for (int i = 2; i < bottom.size(); ++i) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      MaxForward(count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask);
+    }
+    break;
+  default:
+    LOG(FATAL) << "Unknown elementwise operation.";
+  }
 }
 
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const int* mask = NULL;
+  const int count = top[0]->count();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  for (int i = 0; i < bottom.size(); ++i) {
+    if (propagate_down[i]) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      switch (op_) {
+      case EltwiseParameter_EltwiseOp_PROD:
+        if (stable_prod_grad_) {
+          bool initialized = false;
+          for (int j = 0; j < bottom.size(); ++j) {
+            if (i == j) { continue; }
+            if (!initialized) {
+              caffe_copy(count, bottom[j]->gpu_data(), bottom_diff);
+              initialized = true;
+            } else {
+              caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
+                            bottom_diff);
+            }
+          }
+        } else {
+          caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
+        }
+        caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
+        break;
+      case EltwiseParameter_EltwiseOp_SUM:
+        if (coeffs_[i] == Dtype(1.)) {
+          caffe_copy(count, top_diff, bottom_diff);
+        } else {
+          caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
+        }
+        break;
+      case EltwiseParameter_EltwiseOp_MAX:
+        mask = max_idx_.gpu_data();
+        MaxBackward(count, top_diff, i, mask, bottom_diff);
+        break;
+      default:
+        LOG(FATAL) << "Unknown elementwise operation.";
+      }
+    }
+  }
 }
 
 
-
 #ifdef CPU_ONLY
 STUB_GPU(EltwiseLayer);
 #endif
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index 9c37c18b..d1efe5bb 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -49,16 +49,36 @@ void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+    const vector<Blob<Dtype>*>& top) {
+  int count = bottom[0]->count();
+  caffe_gpu_sub(
+      count,
+      bottom[0]->gpu_data(),
+      bottom[1]->gpu_data(),
+      diff_.mutable_gpu_data());
+  Dtype dot;
+  caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
+  Dtype loss = dot / bottom[0]->num() / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
 }
 
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < 2; ++i) {
+    if (propagate_down[i]) {
+      const Dtype sign = (i == 0) ? 1 : -1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
+      caffe_gpu_axpby(
+          bottom[i]->count(),              // count
+          alpha,                              // alpha
+          diff_.gpu_data(),                   // a
+          Dtype(0),                           // beta
+          bottom[i]->mutable_gpu_diff());  // b
+    }
+  }
 }
 
-
-
 #ifdef CPU_ONLY
 STUB_GPU(EuclideanLossLayer);
 #endif
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index 4d004ad4..c5f5e4dd 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -119,15 +119,63 @@ void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+      const vector<Blob<Dtype>*>& top) {
+  int new_tops_num = indices_to_forward_.size();
+  // forward all filtered items for all bottoms but the Selector (bottom[last])
+  for (int t = 0; t < top.size(); ++t) {
+    const Dtype* bottom_data = bottom[t]->gpu_data();
+    Dtype* top_data = top[t]->mutable_gpu_data();
+    int dim = bottom[t]->count() / bottom[t]->shape(0);
+    for (int n = 0; n < new_tops_num; ++n) {
+      int data_offset_top = n * dim;
+      int data_offset_bottom = indices_to_forward_[n] * dim;
+      caffe_copy(dim, bottom_data + data_offset_bottom,
+          top_data + data_offset_top);
+    }
+  }
 }
 
 template <typename Dtype>
 void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[bottom.size() - 1]) {
+    LOG(FATAL) << this->type()
+               << "Layer cannot backpropagate to filter index inputs";
+  }
+  for (int i = 0; i < top.size(); ++i) {
+    // bottom[last] is the selector and never needs backpropagation
+    // so we can iterate over top vector because top.size() == bottom.size() -1
+    if (propagate_down[i]) {
+      const int dim = top[i]->count() / top[i]->shape(0);
+      int next_to_backward_offset = 0;
+      int batch_offset = 0;
+      int data_offset_bottom = 0;
+      int data_offset_top = 0;
+      for (int n = 0; n < bottom[i]->shape(0); ++n) {
+        if (next_to_backward_offset >= indices_to_forward_.size()) {
+          // we already visited all items that were been forwarded, so
+          // just set to zero remaining ones
+          data_offset_bottom = n * dim;
+          caffe_gpu_set(dim, Dtype(0),
+              bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+        } else {
+          batch_offset = indices_to_forward_[next_to_backward_offset];
+          data_offset_bottom = n * dim;
+          if (n != batch_offset) {  // this data was not been forwarded
+            caffe_gpu_set(dim, Dtype(0),
+                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+          } else {  // this data was been forwarded
+            data_offset_top = next_to_backward_offset * dim;
+            ++next_to_backward_offset;  // point to next forwarded item index
+            caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top,
+                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+          }
+        }
+      }
+    }
+  }
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU(FilterLayer);
 #endif
diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl
new file mode 100644
index 00000000..c297db75
--- /dev/null
+++ b/src/caffe/ocl/bnll_layer.cl
@@ -0,0 +1,26 @@
+#define kBNLL_THRESHOLD  50.0
+
+template <class T>
+__kernel void BNLLForward(const int n, __global const T* in, __global T* out) {
+  int index = get_global_id(0);
+  if (index < n) {
+    out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index]));
+  }
+}
+template __attribute__((mangled_name(BNLLForward_float))) __kernel void BNLLForward(const int n, __global const float* in, __global float* out);
+template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLForward(const int n, __global const double* in, __global double* out);
+
+template <class T>
+__kernel void BNLLBackward(const int n, __global const T* in_diff,
+    __global const T* in_data, __global T* out_diff) {
+    int index = get_global_id(0);
+    if (index < n) {
+      T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD)));
+      out_diff[index] = in_diff[index] * expval / (expval + 1.);
+  }
+}
+
+template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff,
+    __global const float* in_data, __global float* out_diff);
+template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff,
+    __global const double* in_data, __global double* out_diff);
diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl
new file mode 100644
index 00000000..dfcbfbc5
--- /dev/null
+++ b/src/caffe/ocl/concat_layer.cl
@@ -0,0 +1,28 @@
+template <class T>
+__kernel void Concat(const int nthreads, __global const T* in_data,
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global T* out_data) {
+    int index = get_global_id(0);
+    if(index < nthreads) {
+        const int total_concat_size = concat_size * bottom_concat_axis;
+        const int concat_num = index / total_concat_size;
+        const int concat_index = index % total_concat_size;
+        const int top_index = concat_index +
+            (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
+        if (forward) {
+            out_data[top_index] = in_data[index];
+        } else {
+            out_data[index] = in_data[top_index];
+        }
+    }
+}
+
+template __attribute__((mangled_name(Concat_float))) __kernel void  Concat(const int nthreads, __global const float* in_data,
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global float* out_data);
+template __attribute__((mangled_name(Concat_double))) __kernel void  Concat(const int nthreads, __global const double* in_data,
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global double* out_data);
diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl
new file mode 100644
index 00000000..5a67e399
--- /dev/null
+++ b/src/caffe/ocl/contrastive_loss_layer.cl
@@ -0,0 +1,38 @@
+template <class Dtype>
+__kernel void CLLBackward(const int count, const int channels,
+    const Dtype margin, const bool legacy_version, const Dtype alpha,
+    __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq,
+    __global Dtype *bottom_diff) {
+    int i = get_global_id(0);
+    if(i < count) {
+        int n = i / channels;  // the num index, to access y and dist_sq
+        if (static_cast<int>(y[n])) {  // similar pairs
+            bottom_diff[i] = alpha * diff[i];
+        } else {  // dissimilar pairs
+            Dtype mdist(0.0);
+            Dtype beta(0.0);
+            if (legacy_version) {
+                mdist = (margin - dist_sq[n]);
+                beta = -alpha;
+            } else {
+                Dtype dist = sqrt(dist_sq[n]);
+                mdist = (margin - dist);
+                beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
+            }
+            if (mdist > 0.0) {
+                bottom_diff[i] = beta;
+            } else {
+                bottom_diff[i] = 0;
+            }
+       }
+   }
+}
+
+template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels,
+    const float margin, const bool legacy_version, const float alpha,
+    __global const float* y, __global const float* diff, __global const float* dist_sq,
+    __global float *bottom_diff);
+template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels,
+    const double margin, const bool legacy_version, const double alpha,
+    __global const double* y, __global const double* diff, __global const double* dist_sq,
+    __global double *bottom_diff);
diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl
new file mode 100644
index 00000000..1be9cb43
--- /dev/null
+++ b/src/caffe/ocl/eltwise_layer.cl
@@ -0,0 +1,47 @@
+template <class Dtype>
+__kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a,
+    __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data,
+    __global int* mask) {
+    int index = get_global_id(0);
+    if(index < nthreads) {
+    Dtype maxval = -FLT_MAX;
+    int maxidx = -1;
+    if (bottom_data_a[index] > bottom_data_b[index]) {
+      // only update for very first bottom_data blob (blob_idx == 0)
+      if (blob_idx == 0) {
+        maxval = bottom_data_a[index];
+        top_data[index] = maxval;
+        maxidx = blob_idx;
+        mask[index] = maxidx;
+      }
+    } else {
+      maxval = bottom_data_b[index];
+      top_data[index] = maxval;
+      maxidx = blob_idx + 1;
+      mask[index] = maxidx;
+    }
+  }
+}
+template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a,
+    __global const float* bottom_data_b, const int blob_idx, __global float* top_data,
+    __global int* mask);
+template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a,
+    __global const double* bottom_data_b, const int blob_idx, __global double* top_data,
+    __global int* mask);
+
+template <class Dtype>
+__kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff,
+    const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) {
+    int index = get_global_id(0);
+    if(index < nthreads) {
+        Dtype gradient = 0;
+        if (mask[index] == blob_idx) {
+            gradient += top_diff[index];
+        }
+        bottom_diff[index] = gradient;
+    }
+}
+template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff,
+    const int blob_idx, __global const int* mask, __global float* bottom_diff);
+template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff,
+    const int blob_idx, __global const int* mask, __global double* bottom_diff);
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 14caf874..f2897538 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -1209,6 +1209,160 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
 template void DropoutBackward<float>(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
 template void DropoutBackward<double>(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
 
+
+template <typename Dtype>
+void  BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data)
+{
+    std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
+    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&bottom_data);
+    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void  BNLLForward<float>(const int count, const float* bottom_data, float *top_data);
+template void  BNLLForward<double>(const int count, const double* bottom_data, double *top_data);
+
+template <typename Dtype>
+void  BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff)
+{
+    std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
+    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
+    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&bottom_data);
+    ret |= clSetKernelArg(kernel,3,sizeof(cl_mem),  (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void  BNLLBackward<float>(const int count, const float* top_diff, const float* bottom_data, float *bottom_diff);
+template void  BNLLBackward<double>(const int count, const double* top_diff, const double* bottom_data, double *bottom_diff);
+
+
+template <typename Dtype>
+void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int  concat_size,
+        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data)
+{
+    std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&nthreads);
+    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void*)&in_data);
+    ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool),  (void*)&forward);
+    ret |= clSetKernelArg(kernel, 3, sizeof(cl_int),  (void*)&num_concats);
+    ret |= clSetKernelArg(kernel, 4, sizeof(cl_int),  (void*)&concat_size);
+    ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem),  (void*)&top_concat_axis);
+    ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem),  (void*)&bottom_concat_axis); 
+    ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem),  (void*)&offset_concat_axis);
+    ret |= clSetKernelArg(kernel, 8, sizeof(cl_int),  (void*)&out_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)nthreads};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void  Concat<float>(const int nthreads, const float* in_data, const bool forward, const int num_concats, const int  concat_size,
+        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, float *out_data);
+template void  Concat<double>(const int nthreads, const double* in_data, const bool forward, const int num_concats, const int  concat_size,
+        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, double *out_data);
+
+template <typename Dtype>
+void CLLBackward(const int count, const int channels,
+    const Dtype margin, const bool legacy_version, const Dtype alpha,
+    const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
+    Dtype *bottom_diff)
+{
+    std::string kernel_name = "CLLBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&count);
+    ret |= clSetKernelArg(kernel, 1, sizeof(cl_int),  (void*)&channels);
+    ret |= clSetKernelArg(kernel, 2, sizeof(Dtype),   (void*)&margin);
+    ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool),  (void*)&legacy_version);
+    ret |= clSetKernelArg(kernel, 4, sizeof(Dtype),   (void*)&alpha);
+    ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem),  (void*)&y);
+    ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem),  (void*)&diff);
+    ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem),  (void*)&dist_sq);
+    ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem),  (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void CLLBackward<float>(const int count, const int channels,
+    const float margin, const bool legacy_version, const float alpha,
+    const float* y, const float* diff, const float* dist_sq,
+    float *bottom_diff);
+template void CLLBackward<double>(const int count, const int channels,
+    const double margin, const bool legacy_version, const double alpha,
+    const double* y, const double* diff, const double* dist_sq,
+    double *bottom_diff);
+
+template <typename Dtype>
+void MaxForward(const int nthreads, const Dtype* bottom_data_a,
+    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+    int* mask)
+{
+    std::string kernel_name = "MaxForward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&nthreads);
+    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void*)&bottom_data_a);
+    ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem),  (void*)&bottom_data_b);
+    ret |= clSetKernelArg(kernel, 3, sizeof(cl_int),  (void*)&blob_idx);
+    ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem),  (void*)&top_data);
+    ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem),  (void*)&mask);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)nthreads};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void MaxForward<float>(const int nthreads, const float* bottom_data_a,
+    const float* bottom_data_b, const int blob_idx, float* top_data,
+    int* mask);
+template void MaxForward<double>(const int nthreads, const double* bottom_data_a,
+    const double* bottom_data_b, const int blob_idx, double* top_data,
+    int* mask);
+
+template <typename Dtype>
+void MaxBackward(const int nthreads, const Dtype* top_diff,
+    const int blob_idx, const int* mask, Dtype* bottom_diff)
+{
+    std::string kernel_name = "MaxBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&nthreads);
+    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void*)&top_diff);
+    ret |= clSetKernelArg(kernel, 2, sizeof(cl_int),  (void*)&blob_idx);
+    ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem),  (void*)&mask);
+    ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem),  (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)nthreads};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void MaxBackward<float>(const int nthreads, const float* top_diff, const int blob_idx, const int* mask, float* bottom_diff);
+template void MaxBackward<double>(const int nthreads, const double* top_diff, const int blob_idx, const int* mask, double* bottom_diff);
+
+
 template <typename Dtype>
 void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz)
 {
@@ -1216,4 +1370,5 @@ void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias,
 template void ocl_conv<float>(float* bottom_data, float* top_data, float* weights, float* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz);
 template void ocl_conv<double>(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz);
 
+
 }  // namespace caffe

From dfc6cb133c929a173d5d34a1c367bb3e4af136bf Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 6 Sep 2015 22:31:19 +0800
Subject: [PATCH 061/124] Fix some bugs in layers' porting

---
 src/caffe/common.cpp                          |  2 ++
 src/caffe/layers/deconv_layer.cpp             | 11 +++++---
 src/caffe/layers/im2col_layer.cpp             | 20 ++++++++++++--
 src/caffe/layers/softmax_loss_layer.cpp       |  2 +-
 src/caffe/ocl/bnll_layer.cl                   | 26 +++++++++++++++++++
 src/caffe/ocl/concat_layer.cl                 | 26 +++++++++++++++++++
 src/caffe/ocl/contrastive_loss_layer.cl       | 26 +++++++++++++++++++
 src/caffe/ocl/eltwise_layer.cl                | 26 +++++++++++++++++++
 src/caffe/ocl/util.cl                         |  2 +-
 .../test/test_data/generate_sample_data.py    |  0
 src/caffe/util/math_functions.cpp             | 18 +++++++++++++
 src/caffe/util/ocl_wrapper.cpp                | 24 ++++++++---------
 12 files changed, 164 insertions(+), 19 deletions(-)
 mode change 100644 => 100755 src/caffe/test/test_data/generate_sample_data.py

diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index e12c48c9..c1d26ab8 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -104,6 +104,8 @@ Caffe::~Caffe() {
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
+  // RNG seed
+  Get().random_generator_.reset(new RNG(seed));
 }
 
 void Caffe::SetDevice(const int device_id) {
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index ad9a690e..4b952c73 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -77,11 +77,12 @@ void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();
     for (int n = 0; n < this->num_; ++n) {
-      this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
+      this->bottom_offset_ = bottom[i]->offset(n);
+      this->top_offset_ = top[i]->offset(n);
+      this->backward_gpu_gemm(bottom_data, weight, top_data);
       if (this->bias_term_) {
         const Dtype* bias = this->blobs_[1]->gpu_data();
-        this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
+        this->forward_gpu_bias(top_data, bias);
       }
     }
   }
@@ -100,11 +101,15 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
       for (int n = 0; n < this->num_; ++n) {
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
         this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
       }
     }
     if (this->param_propagate_down_[0] || propagate_down[i]) {
       for (int n = 0; n < this->num_; ++n) {
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
           this->weight_gpu_gemm(top_diff + top[i]->offset(n),
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index ddf6c989..7b667172 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -87,14 +87,30 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+      const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  for (int n = 0; n < bottom[0]->num(); ++n) {
+    im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_,
+        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, top_data, top[0]->offset(n));
+  }
 }
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  for (int n = 0; n < top[0]->num(); ++n) {
+    col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_,
+        kernel_h_, kernel_w_, pad_h_, pad_w_,
+        stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n));
+  }
 }
 
+
+
 #ifdef CPU_ONLY
 STUB_GPU(Im2colLayer);
 #endif
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 22456302..d8db1797 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -152,7 +152,6 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
        outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
   Dtype loss;
   caffe_gpu_asum(nthreads, loss_data, &loss);
-  printf("loss = %f\n", loss);
   if (normalize_) {
     Dtype count;
     caffe_gpu_asum(nthreads, counts, &count);
@@ -160,6 +159,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
   } else {
     loss /= outer_num_;
   }
+  printf("loss = %f\n", loss);
   top[0]->mutable_cpu_data()[0] = loss;
   if (top.size() == 2) {
     top[1]->ShareData(prob_);
diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl
index c297db75..03ddba8a 100644
--- a/src/caffe/ocl/bnll_layer.cl
+++ b/src/caffe/ocl/bnll_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #define kBNLL_THRESHOLD  50.0
 
 template <class T>
diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl
index dfcbfbc5..71eb8c77 100644
--- a/src/caffe/ocl/concat_layer.cl
+++ b/src/caffe/ocl/concat_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class T>
 __kernel void Concat(const int nthreads, __global const T* in_data,
     const bool forward, const int num_concats, const int concat_size,
diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl
index 5a67e399..8ed18ce4 100644
--- a/src/caffe/ocl/contrastive_loss_layer.cl
+++ b/src/caffe/ocl/contrastive_loss_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class Dtype>
 __kernel void CLLBackward(const int count, const int channels,
     const Dtype margin, const bool legacy_version, const Dtype alpha,
diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl
index 1be9cb43..d843884a 100644
--- a/src/caffe/ocl/eltwise_layer.cl
+++ b/src/caffe/ocl/eltwise_layer.cl
@@ -1,3 +1,29 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 template <class Dtype>
 __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a,
     __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data,
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index 7c907058..d15f168c 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -167,7 +167,7 @@ __kernel void kernel_add_scalar(const int count, const T data, __global T* out)
 }
 
 template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out);
-template __attribute__ ((mangled_name(kernel_add_scalar__double))) __kernel void kernel_add_scalar(const int count, const double data, __global double* out);
+template __attribute__ ((mangled_name(kernel_add_scalar_double))) __kernel void kernel_add_scalar(const int count, const double data, __global double* out);
 
 template <class T>
 __kernel void kernel_log(const int count, __global const T* data, __global T* out) {
diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py
old mode 100644
new mode 100755
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 34442442..787f2b16 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -544,6 +544,12 @@ double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
 template <>
 void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
     float* out) {
+    cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(float)), NULL, NULL);
+    cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(float)), NULL, NULL);
+    clblasDdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
+    clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), out,0, NULL, NULL);
+    clReleaseMemObject(scratchBuff);
+    clReleaseMemObject(d_out);
 }
 
 template <>
@@ -551,6 +557,12 @@ void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
     double * out) {
   //need to pass in scratchBuff
   //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+    cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(double)), NULL, NULL);
+    cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(double)), NULL, NULL);
+    clblasDdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
+    clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), out,0, NULL, NULL);
+    clReleaseMemObject(scratchBuff);
+    clReleaseMemObject(d_out);
 }
 
 template <>
@@ -597,6 +609,12 @@ void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
 
 template <>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
+    cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(cl_double)), NULL, NULL);
+    cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(cl_double)), NULL, NULL);
+    clblasDasum(n,d_y,0,(cl_mem)x,0,1,scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
+    clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), y,0, NULL, NULL);
+    clReleaseMemObject(scratchBuff);
+    clReleaseMemObject(d_y);
 }
 
 //DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index f2897538..b479ddff 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -966,8 +966,8 @@ void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in
   ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height);
   ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width);
   ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size);
-  ret|=clSetKernelArg(LFSkernel,7,sizeof(cl_float),(void*)&alpha_over_size);
-  ret|=clSetKernelArg(LFSkernel,8,sizeof(cl_float),(void*)&k);
+  ret|=clSetKernelArg(LFSkernel,7,sizeof(Dtype),(void*)&alpha_over_size);
+  ret|=clSetKernelArg(LFSkernel,8,sizeof(Dtype),(void*)&k);
   ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale);
   OCL_CHECK(ret);
   size_t uiGlobal_Work_Size[]={(size_t)nthreads};
@@ -990,7 +990,7 @@ void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in,
   ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads);
   ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in);
   ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale);
-  ret|=clSetKernelArg(LCOkernel,3,sizeof(cl_float),(void*)&negative_beta);
+  ret|=clSetKernelArg(LCOkernel,3,sizeof(Dtype),(void*)&negative_beta);
   ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out);
   OCL_CHECK(ret);
   size_t uiGlobal_Work_Size2[]={(size_t)nthreads};
@@ -1020,8 +1020,8 @@ void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
   ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height);
   ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width);
   ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size);
-  ret|=clSetKernelArg(LCDkernel,10,sizeof(cl_float),(void*)&negative_beta);
-  ret|=clSetKernelArg(LCDkernel,11,sizeof(cl_float),(void*)&cache_ratio);
+  ret|=clSetKernelArg(LCDkernel,10,sizeof(Dtype),(void*)&negative_beta);
+  ret|=clSetKernelArg(LCDkernel,11,sizeof(Dtype),(void*)&cache_ratio);
   ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff);
   OCL_CHECK(ret);
   size_t uiGlobal_Work_Size[]={(size_t)nthreads};
@@ -1117,7 +1117,7 @@ void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_float), (void*)&alpha);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha);
     ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
     OCL_CHECK(ret);
     size_t Global_Work_Size[] = {(size_t)n};
@@ -1175,7 +1175,7 @@ void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMe
     ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
     ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data);
     ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem);
-    ret|=clSetKernelArg(kernel,3,sizeof(cl_float),(void*)&scale_);
+    ret|=clSetKernelArg(kernel,3,sizeof(Dtype),(void*)&scale_);
     ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data);
     OCL_CHECK(ret);
 
@@ -1198,7 +1198,7 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
     ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
     ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&MaskMem);
     ret |= clSetKernelArg(kernel,3,sizeof(cl_int),  (void*)&threshold_);
-    ret |= clSetKernelArg(kernel,4,sizeof(cl_float),(void*)&scale_);
+    ret |= clSetKernelArg(kernel,4,sizeof(Dtype),(void*)&scale_);
     ret |= clSetKernelArg(kernel,5,sizeof(cl_mem),  (void*)&bottom_diff);
     OCL_CHECK(ret);
 
@@ -1263,10 +1263,10 @@ void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const
     ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool),  (void*)&forward);
     ret |= clSetKernelArg(kernel, 3, sizeof(cl_int),  (void*)&num_concats);
     ret |= clSetKernelArg(kernel, 4, sizeof(cl_int),  (void*)&concat_size);
-    ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem),  (void*)&top_concat_axis);
-    ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem),  (void*)&bottom_concat_axis); 
-    ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem),  (void*)&offset_concat_axis);
-    ret |= clSetKernelArg(kernel, 8, sizeof(cl_int),  (void*)&out_data);
+    ret |= clSetKernelArg(kernel, 5, sizeof(cl_int),  (void*)&top_concat_axis);
+    ret |= clSetKernelArg(kernel, 6, sizeof(cl_int),  (void*)&bottom_concat_axis); 
+    ret |= clSetKernelArg(kernel, 7, sizeof(cl_int),  (void*)&offset_concat_axis);
+    ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem),  (void*)&out_data);
     OCL_CHECK(ret);
 
     size_t Global_Work_Size[] = {(size_t)nthreads};

From 84d80c2e5a2244d2dcef0583fdb52e76f772bdb4 Mon Sep 17 00:00:00 2001
From: Noplz <yuan.gao@noplz.name>
Date: Mon, 7 Sep 2015 17:57:28 +0800
Subject: [PATCH 062/124] Ignore log dir

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 28f2aca8..434c7112 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,6 @@ LOCK
 LOG*
 CURRENT
 MANIFEST-*
+
+#log files
+log

From 097f69cfb98bb8bada7a52014b706bf4ad5f606e Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 8 Sep 2015 14:37:15 +0800
Subject: [PATCH 063/124] ported new layers

---
 include/caffe/util/ocl_wrapper.hpp            | 14 +++
 src/caffe/layers/reduction_layer.cpp          | 71 +++++++++++++++
 src/caffe/layers/relu_layer.cpp               |  4 +-
 .../sigmoid_cross_entropy_loss_layer.cpp      | 17 ++++
 src/caffe/layers/sigmoid_layer.cpp            | 14 +++
 src/caffe/layers/silence_layer.cpp            |  6 ++
 src/caffe/layers/tanh_layer.cpp               | 14 +++
 src/caffe/layers/threshold_layer.cpp          |  6 ++
 src/caffe/ocl/sigmoid_layer.cl                | 46 ++++++++++
 src/caffe/ocl/tanh_layer.cl                   | 46 ++++++++++
 src/caffe/ocl/threshold_layer.cl              | 36 ++++++++
 src/caffe/util/math_functions.cpp             |  9 +-
 src/caffe/util/ocl_wrapper.cpp                | 90 +++++++++++++++++++
 13 files changed, 365 insertions(+), 8 deletions(-)
 create mode 100644 src/caffe/ocl/sigmoid_layer.cl
 create mode 100644 src/caffe/ocl/tanh_layer.cl
 create mode 100644 src/caffe/ocl/threshold_layer.cl

diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index d644d16a..0d5f4b2e 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -84,6 +84,20 @@ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int
 
 template <typename Dtype>
  void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff);
+template <typename Dtype>
+void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data);
+
+template <typename Dtype>
+void SigmoidBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff);
+
+template <typename Dtype>
+void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data);
+
+template <typename Dtype>
+void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff);
+
+template <typename Dtype>
+void ThresholdForward(const int count, const Dtype threshold, const Dtype* bottom_data, Dtype* top_data);
 
 template <typename Dtype>
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data);
diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp
index c4a8b4e0..4003ddd1 100644
--- a/src/caffe/layers/reduction_layer.cpp
+++ b/src/caffe/layers/reduction_layer.cpp
@@ -125,11 +125,82 @@ void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* mult_data = NULL;
+  if (sum_multiplier_.count() > 0) {
+    mult_data = sum_multiplier_.gpu_data();
+  }
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int i = 0; i < num_; ++i) {
+    switch (op_) {
+    case ReductionParameter_ReductionOp_SUM:
+    case ReductionParameter_ReductionOp_MEAN:
+      caffe_gpu_dot(dim_, mult_data, bottom_data, top_data);
+      break;
+    case ReductionParameter_ReductionOp_ASUM:
+      caffe_gpu_asum(dim_, bottom_data, top_data);
+      break;
+    case ReductionParameter_ReductionOp_SUMSQ:
+      caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data);
+      break;
+    default:
+      LOG(FATAL) << "Unknown reduction op: "
+          << ReductionParameter_ReductionOp_Name(op_);
+    }
+    bottom_data += dim_;
+    ++top_data;
+  }
+  if (coeff_ != Dtype(1)) {
+    // Reset the top_data pointer.
+    top_data = top[0]->mutable_gpu_data();
+    caffe_gpu_scal(num_, coeff_, top_data);
+  }
 }
 
 template <typename Dtype>
 void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  if (!propagate_down[0]) { return; }
+  // Get bottom_data, if needed.
+  const Dtype* bottom_data = NULL;
+  switch (op_) {
+  // Operations that don't need bottom_data
+  case ReductionParameter_ReductionOp_SUM:
+  case ReductionParameter_ReductionOp_MEAN:
+    break;
+  // Operations that need bottom_data
+  case ReductionParameter_ReductionOp_ASUM:
+  case ReductionParameter_ReductionOp_SUMSQ:
+    bottom_data = bottom[0]->gpu_data();
+    break;
+  default:
+    LOG(FATAL) << "Unknown reduction op: "
+        << ReductionParameter_ReductionOp_Name(op_);
+  }
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  for (int i = 0; i < num_; ++i) {
+    const Dtype bottom_coeff = (*top_diff) * coeff_;
+    switch (op_) {
+    case ReductionParameter_ReductionOp_SUM:
+    case ReductionParameter_ReductionOp_MEAN:
+      caffe_gpu_set(dim_, bottom_coeff, bottom_diff);
+      break;
+    case ReductionParameter_ReductionOp_ASUM:
+      caffe_gpu_sign(dim_, bottom_data, bottom_diff);
+      caffe_gpu_scal(dim_, bottom_coeff, bottom_diff);
+      break;
+    case ReductionParameter_ReductionOp_SUMSQ:
+      caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
+      break;
+    default:
+      LOG(FATAL) << "Unknown reduction op: "
+          << ReductionParameter_ReductionOp_Name(op_);
+    }
+    bottom_data += dim_;
+    bottom_diff += dim_;
+    ++top_diff;
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index 784d2c91..c29d5baa 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -43,7 +43,7 @@ void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = top[0]->mutable_gpu_data();
   const int count = bottom[0]->count();
   Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
- ReLUForward(count,bottom_data,top_data,negative_slope);
+  ReLUForward(count,bottom_data,top_data,negative_slope);
 }
 
 
@@ -57,7 +57,7 @@ void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
     const int count = bottom[0]->count();
     Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-   ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope);
+    ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope);
   }
 }
 
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
index 1a4329da..1c22fe19 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -73,6 +73,23 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+               << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    // First, compute the diff
+    const int count = bottom[0]->count();
+    const int num = bottom[0]->num();
+    const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
+    const Dtype* target = bottom[1]->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    caffe_gpu_copy(count, sigmoid_output_data, bottom_diff);
+    caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
+    // Scale down gradient
+    const Dtype loss_weight = top[0]->cpu_diff()[0];
+    caffe_gpu_scal(count, loss_weight / num, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index 30ad9b0b..fa13a4c1 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -4,6 +4,7 @@
 
 #include "caffe/layer.hpp"
 #include "caffe/vision_layers.hpp"
+#include "caffe/util/ocl_wrapper.hpp"
 
 namespace caffe {
 
@@ -42,11 +43,24 @@ void SigmoidLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  SigmoidForward(count, bottom_data, top_data);
 }
 
 template <typename Dtype>
 void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  if (propagate_down[0]) {
+    const Dtype* top_data = top[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    SigmoidBackward(count, top_diff, top_data, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp
index ecd12d12..e36a5cad 100644
--- a/src/caffe/layers/silence_layer.cpp
+++ b/src/caffe/layers/silence_layer.cpp
@@ -26,6 +26,12 @@ void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  for (int i = 0; i < bottom.size(); ++i) {
+    if (propagate_down[i]) {
+      caffe_gpu_set(bottom[i]->count(), Dtype(0),
+                    bottom[i]->mutable_gpu_data());
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp
index abc09bbc..a922adbd 100644
--- a/src/caffe/layers/tanh_layer.cpp
+++ b/src/caffe/layers/tanh_layer.cpp
@@ -6,6 +6,7 @@
 
 #include "caffe/layer.hpp"
 #include "caffe/vision_layers.hpp"
+#include "caffe/util/ocl_wrapper.hpp"
 
 namespace caffe {
 
@@ -40,11 +41,24 @@ void TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top){
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  TanHForward(count, bottom_data, top_data);
 }
 
 template <typename Dtype>
 void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+  if (propagate_down[0]) {
+    const Dtype* top_data = top[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    TanHBackward(count, top_diff, top_data, bottom_diff);
+}
 }
 
 
diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp
index 345fd6b7..b3e1bea7 100644
--- a/src/caffe/layers/threshold_layer.cpp
+++ b/src/caffe/layers/threshold_layer.cpp
@@ -2,6 +2,7 @@
 
 #include "caffe/layer.hpp"
 #include "caffe/vision_layers.hpp"
+#include "caffe/util/ocl_wrapper.hpp"
 
 
 namespace caffe {
@@ -27,6 +28,11 @@ void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top){
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  ThresholdForward(count, threshold_, bottom_data, top_data);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/ocl/sigmoid_layer.cl b/src/caffe/ocl/sigmoid_layer.cl
new file mode 100644
index 00000000..eb952e6f
--- /dev/null
+++ b/src/caffe/ocl/sigmoid_layer.cl
@@ -0,0 +1,46 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
+template <class T>
+__kernel void SigmoidForward(const int count, __global T* in, __global T* out){
+	int index = get_global_id(0);
+	if(index < count)
+		out[index] = 1. / (1. + exp(-in[index]));
+}
+
+template __attribute__ ((mangled_name(SigmoidForward_float))) __kernel void SigmoidForward(const int count, __global float* in, __global float* out);
+template __attribute__ ((mangled_name(SigmoidForward_double))) __kernel void SigmoidForward(const int count, __global double* in, __global double* out);
+
+template <class T>
+__kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff){
+	int index = get_global_id(0);
+        const T sigmoid_x = out_data[index];
+        if(index < count)
+		out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
+}
+
+template __attribute__ ((mangled_name(SigmoidBackward_float))) __kernel void SigmoidBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff);
+template __attribute__ ((mangled_name(SigmoidBackward_double))) __kernel void SigmoidBackward(const int count, __global double* in_diff, __global double* out_data, __global double* out_diff);
diff --git a/src/caffe/ocl/tanh_layer.cl b/src/caffe/ocl/tanh_layer.cl
new file mode 100644
index 00000000..2f0a08c6
--- /dev/null
+++ b/src/caffe/ocl/tanh_layer.cl
@@ -0,0 +1,46 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
+template <class T>
+__kernel void TanHForward(const int count, __global T* in, __global T* out){
+	int index = get_global_id(0);
+	if(index < count)
+		out[index] =tanh(in[index]);
+}
+
+template __attribute__ ((mangled_name(TanHForward_float))) __kernel void TanHForward(const int count, __global float* in, __global float* out);
+template __attribute__ ((mangled_name(TanHForward_double))) __kernel void TanHForward(const int count, __global double* in, __global double* out);
+
+template <class T>
+__kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff){
+	int index = get_global_id(0);
+        const T tanhx = out_data[index];
+        if(index < count)
+		out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx);
+}
+
+template __attribute__ ((mangled_name(TanHBackward_float))) __kernel void TanHBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff);
+template __attribute__ ((mangled_name(TanHBackward_double))) __kernel void TanHBackward(const int count, __global double* in_diff, __global double* out_data, __global double* out_diff);
diff --git a/src/caffe/ocl/threshold_layer.cl b/src/caffe/ocl/threshold_layer.cl
new file mode 100644
index 00000000..40d55f1c
--- /dev/null
+++ b/src/caffe/ocl/threshold_layer.cl
@@ -0,0 +1,36 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.  
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this 
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, 
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
+template <class T>
+__kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out){
+	int index = get_global_id(0);
+	if(index < count)
+		out[index] =in[index] > threshold ? 1 : 0;
+}
+
+template __attribute__ ((mangled_name(ThresholdForward_float))) __kernel void ThresholdForward(const int count, const float threshold, __global float* in, __global float* out);
+template __attribute__ ((mangled_name(ThresholdForward_double))) __kernel void ThresholdForward(const int count, const double threshold, __global double* in, __global double* out);
+
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 787f2b16..fb531590 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -32,6 +32,7 @@
 #include "caffe/common.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/util/rng.hpp"
+#include "caffe/util/ocl_util.hpp"
 
 static const clblasOrder order = clblasColumnMajor;
 #define pi 3.1415926
@@ -659,16 +660,12 @@ void set_kernel(const int n, const Dtype alpha, Dtype* y) {
 
 template <>
 void caffe_gpu_set(const int N, const float alpha, float* Y) {
-  if (alpha == 0) {
-    return;
-  }
+  ocl_memset(Y, alpha, N);
 }
 
 template <>
 void caffe_gpu_set(const int N, const double alpha, double* Y) {
-  if (alpha == 0) {
-    return;
-  }
+  ocl_memset(Y, alpha, N);
 }
 
 template <>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index b479ddff..c8110c00 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -923,6 +923,96 @@ void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_da
 template void ReLUBackward<float>(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
 template void ReLUBackward<double>(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
 
+template <typename Dtype> 
+void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data){
+    std::string kernel_name = "SigmoidForward" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void SigmoidForward<float>(const int count, const float* bottom_data, float* top_data);
+template void SigmoidForward<double>(const int count, const double* bottom_data, double* top_data);
+
+template <typename Dtype> 
+void SigmoidBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff){
+    std::string kernel_name = "SigmoidBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {(size_t)count};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
+template void SigmoidBackward<float>(const int count, const float* top_diff, const float* top_data, float* bottom_diff);
+template void SigmoidBackward<double>(const int count, const double* top_diff, const double* top_data, double* bottom_diff);
+
+template <typename Dtype> 
+void ThresholdForward(const int count, const Dtype threshold, const Dtype* bottom_data, Dtype* top_data){
+    std::string kernel_name = "ThresholdForward" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&threshold);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void ThresholdForward<float>(const int count, const float threshold, const float* bottom_data, float* top_data);
+template void ThresholdForward<double>(const int count, const double threshold, const double* bottom_data, double* top_data);
+
+template <typename Dtype> 
+void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data){
+    std::string kernel_name = "TanHForward" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void TanHForward<float>(const int count, const float* bottom_data, float* top_data);
+template void TanHForward<double>(const int count, const double* bottom_data, double* top_data);
+
+template <typename Dtype> 
+void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff){
+    std::string kernel_name = "TanHBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {(size_t)count};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
+template void TanHBackward<float>(const int count, const float* top_diff, const float* top_data, float* bottom_diff);
+template void TanHBackward<double>(const int count, const double* top_diff, const double* top_data, double* bottom_diff);
+
 template <typename Dtype>
 void opttrans(const Dtype* data_im, const int im_offset, const int channels,
     const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {

From 8266a0afc5f98393117d9977c4941facdd58b8a3 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Tue, 8 Sep 2015 20:45:55 +0800
Subject: [PATCH 064/124] Made my own last porting layers go through unit test

---
 include/caffe/util/math_functions.hpp |  2 +-
 src/caffe/layers/concat_layer.cpp     | 16 +++++++++-------
 src/caffe/layers/eltwise_layer.cpp    |  4 ++--
 src/caffe/layers/log_layer.cpp        |  4 ++--
 src/caffe/ocl/relu_layer.cl           |  5 +++--
 src/caffe/util/math_functions.cpp     | 26 ++++++++++++++++++++++----
 6 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 46949ff3..7f398153 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -122,7 +122,7 @@ inline void caffe_memset(const size_t N, const int alpha, void* X) {
 
 inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
 #ifndef CPU_ONLY
-  ocl_memset((int*)X, alpha, N);
+  ocl_memset((int*)X, (alpha<<24)|(alpha<<16)|(alpha<<8)|alpha, N);
 #else
   NO_GPU;
 #endif
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index d1d0e927..6bc8f9e9 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -91,6 +91,7 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+  if (bottom.size() == 1) { return; }
   Dtype* top_data = top[0]->mutable_gpu_data();
   int offset_concat_axis = 0;
   const int top_concat_axis = top[0]->shape(concat_axis_);
@@ -109,23 +110,24 @@ void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (bottom.size() == 1) { return; }
   const Dtype* top_diff = top[0]->gpu_diff();
   int offset_concat_axis = 0;
   const int top_concat_axis = top[0]->shape(concat_axis_);
   const bool kForward = false;
   for (int i = 0; i < bottom.size(); ++i) {
-    if (!propagate_down[i]) { continue; }
-    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
     const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-    const int nthreads = bottom_concat_size * num_concats_;
-    Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_,
-        top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
+    if (propagate_down[i]) {
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
+      const int nthreads = bottom_concat_size * num_concats_;
+      Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_,
+          top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
+    }
     offset_concat_axis += bottom_concat_axis;
   }
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU(ConcatLayer);
 #endif
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index 61417f8c..5a7e5e74 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -204,7 +204,7 @@ void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
           for (int j = 0; j < bottom.size(); ++j) {
             if (i == j) { continue; }
             if (!initialized) {
-              caffe_copy(count, bottom[j]->gpu_data(), bottom_diff);
+              caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff);
               initialized = true;
             } else {
               caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
@@ -218,7 +218,7 @@ void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         break;
       case EltwiseParameter_EltwiseOp_SUM:
         if (coeffs_[i] == Dtype(1.)) {
-          caffe_copy(count, top_diff, bottom_diff);
+          caffe_gpu_copy(count, top_diff, bottom_diff);
         } else {
           caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
         }
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index 461fd9bf..268c5f5b 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -86,7 +86,7 @@ void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
     caffe_gpu_log(count, bottom_data, top_data);
   } else {
-    caffe_copy(count, bottom_data, top_data);
+    caffe_gpu_copy(count, bottom_data, top_data);
     if (input_scale_ != Dtype(1)) {
       caffe_gpu_scal(count, input_scale_, top_data);
     }
@@ -108,7 +108,7 @@ void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* bottom_data = bottom[0]->gpu_data();
     const Dtype* top_diff = top[0]->gpu_diff();
     Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_copy(count, bottom_data, bottom_diff);
+    caffe_gpu_copy(count, bottom_data, bottom_diff);
     if (input_scale_ != Dtype(1)) {
       caffe_gpu_scal(count, input_scale_, bottom_diff);
     }
diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl
index d3b36a34..b7865838 100644
--- a/src/caffe/ocl/relu_layer.cl
+++ b/src/caffe/ocl/relu_layer.cl
@@ -37,8 +37,9 @@ template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUFo
 template <class T>
 __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
 	int index = get_global_id(0);
-        if(index < count)
-		out_diff[index] = in_diff[index] * (in_data[index] > 0)+(in_data[index] <= 0) * negative_slope;
+        if(index < count) {
+            out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope);
+        }
 }
 
 template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index fb531590..bb03b980 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -178,7 +178,7 @@ void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
     const double* x, size_t offx, const double beta, int incx,
     double* y, size_t offy, int incy) {
     clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, offA, lda, (cl_mem)x, offx, incx, (cl_double)beta, (cl_mem)y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+    CLBLAS_CHECK( clblasDgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, offA, lda, (cl_mem)x, offx, incx, (cl_double)beta, (cl_mem)y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 
 }
 
@@ -187,12 +187,20 @@ template <>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
     const int N, const float alpha, const float* A, const float* x,
     const float beta, float* y) {
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA,
+                                  M, N, (cl_float)alpha, (cl_mem)A, 0, N,
+                                  (cl_mem)x, 0, 1, (cl_float)beta,
+                                  (cl_mem)y, 0, 1,
+                                  1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 }
 
 template <>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
     const int N, const double alpha, const double* A, const double* x,
     const double beta, double* y) {
+    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
+    CLBLAS_CHECK( clblasDgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, 0, N, (cl_mem)x, 0, 1, (cl_double)beta, (cl_mem)y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 }
 
 template <>
@@ -283,11 +291,20 @@ void caffe_copy<double>(const int N, const double* X, double* Y) {
   cblas_dcopy(N, X, 1, Y, 1);
 }
 
+//template <typename Dtype>
 void caffe_gpu_memcpy(const size_t N, const void *X, void *Y)
 {
-   OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
-   clFinish(amdDevice.CommandQueue);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)X, CL_TRUE, 0, N, Y,0, NULL, NULL);  
+// OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
 }
+/*
+template void caffe_gpu_memcpy<long>(const size_t N, const long* X, long* Y);
+template void caffe_gpu_memcpy<unsigned long>(const size_t N, const unsigned long* X, unsigned long* Y);
+template void caffe_gpu_memcpy<int>(const size_t N, const int* X, int* Y);
+template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
+template void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y);
+template void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y);
+*/
 
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
@@ -547,7 +564,7 @@ void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
     float* out) {
     cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(float)), NULL, NULL);
     cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(float)), NULL, NULL);
-    clblasDdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
+    clblasSdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
     clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), out,0, NULL, NULL);
     clReleaseMemObject(scratchBuff);
     clReleaseMemObject(d_out);
@@ -721,6 +738,7 @@ void caffe_gpu_mul<float>(const int N, const float* a,
 template <>
 void caffe_gpu_mul<double>(const int N, const double* a,
     const double* b, double* y) {
+  kernel_mul(N, a, b, y);
 }
 
 template <>

From c37410b93408dc733d14102037fba2f342bb3c24 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 8 Sep 2015 17:06:24 -0700
Subject: [PATCH 065/124] fix bug in PReLU layer

---
 include/caffe/common.hpp                |  2 +-
 include/caffe/util/math_functions.hpp   |  3 +++
 include/caffe/util/ocl_wrapper.hpp      |  2 +-
 src/caffe/layers/dropout_layer.cpp      |  2 +-
 src/caffe/layers/prelu_layer.cpp        |  6 +++---
 src/caffe/layers/softmax_loss_layer.cpp |  4 ++--
 src/caffe/ocl/prelu_layer.cl            |  8 +++++---
 src/caffe/util/math_functions.cpp       | 12 ++++++++++--
 src/caffe/util/ocl_wrapper.cpp          | 12 +++++++-----
 9 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index c5bf909d..b84672aa 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -84,7 +84,7 @@ private:\
 #define use_packing_scheme 1 
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
-#define global_packing_N 16
+#define global_packing_N 100
 /*ifdef: use multi-command queues for groups in conv layer;
  ifndef: use single commane queue for groups*/
 //#define multiQ
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 7f398153..b32760aa 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -130,6 +130,9 @@ inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
 
 void caffe_gpu_memcpy(const size_t N, const void *X, void *Y);
 
+template <typename Dtype>
+void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y);
+
 template <typename Dtype>
 void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y);
 
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 0d5f4b2e..a15b68ff 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -125,7 +125,7 @@ template <typename Dtype>
 void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor);
 
 template <typename Dtype> 
-void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff);
+void PReLUParamBackward(const int count, const Dtype* top_diff, const int offset_out, const Dtype* bottom_data,const int offset_in, Dtype* bottom_diff);
 
 template <typename Dtype>
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope);
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 75585a5f..4175a2b7 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -17,7 +17,7 @@ void DropoutLayer<Dtype>::ocl_setup(int bottom_count){
 
 template <typename Dtype>
 DropoutLayer<Dtype>::~DropoutLayer(){
-//   OCL_CHECK( clReleaseMemObject(MaskMem) );
+   OCL_CHECK( clReleaseMemObject(MaskMem) );
 }
 
 
diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp
index ed51ac5e..426a0cad 100644
--- a/src/caffe/layers/prelu_layer.cpp
+++ b/src/caffe/layers/prelu_layer.cpp
@@ -141,7 +141,7 @@ void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   const int div_factor = channel_shared_ ? channels : 1;
   
   if (top[0] == bottom[0]) {
-    caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
+    caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
   }
   PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, div_factor);
 }
@@ -171,8 +171,8 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       // compute element-wise diff
       // NOLINT_NEXT_LINE(whitespace/operators)
       PReLUParamBackward(
-          cdim, top_diff + top[0]->offset(n),
-          bottom_data + bottom[0]->offset(n),
+          cdim, top_diff, top[0]->offset(n),
+          bottom_data, bottom[0]->offset(n),
           backward_buff_.mutable_gpu_diff());
       if (channel_shared_) {
         Dtype d;
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index d8db1797..66ac9ea5 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -177,8 +177,8 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
     const Dtype* prob_data = prob_.gpu_data();
     const Dtype* top_data = top[0]->gpu_data();
-   // caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
-    caffe_gpu_copy(prob_.count(), prob_data, bottom_diff);
+    caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
+    //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff);
     const Dtype* label = bottom[1]->gpu_data();
     const int dim = prob_.count() / outer_num_;
     const int nthreads = outer_num_ * inner_num_;
diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl
index be85a2e4..6a45ea03 100644
--- a/src/caffe/ocl/prelu_layer.cl
+++ b/src/caffe/ocl/prelu_layer.cl
@@ -48,11 +48,13 @@ template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLU
 template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor);
 
 template <class T>
-__kernel void PReLUParamBackward(const int count, __global T* in_diff, __global T* in_data, __global T* out_diff) {
+__kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_out, __global T* in_data, const int offset_in, __global T* out_diff) {
   int index = get_global_id(0);
   if(index < count){
+    in_diff += offset_out;
+    out_diff +=  offset_in;
     out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
   }
 }
-template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff);
-template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff);
+template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out,  __global float* in_data, const int offset_in, __global float* out_diff);
+template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_out, __global double* in_data, const int offset_in, __global double* out_diff);
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index bb03b980..ed71edf6 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -33,6 +33,7 @@
 #include "caffe/util/math_functions.hpp"
 #include "caffe/util/rng.hpp"
 #include "caffe/util/ocl_util.hpp"
+#include "caffe/util/ocl_wrapper.hpp"
 
 static const clblasOrder order = clblasColumnMajor;
 #define pi 3.1415926
@@ -302,9 +303,16 @@ template void caffe_gpu_memcpy<long>(const size_t N, const long* X, long* Y);
 template void caffe_gpu_memcpy<unsigned long>(const size_t N, const unsigned long* X, unsigned long* Y);
 template void caffe_gpu_memcpy<int>(const size_t N, const int* X, int* Y);
 template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
-template void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y);
-template void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y);
 */
+template<> 
+void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y)
+{  OCL_CHECK (clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
+}
+
+template<> 
+void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y)
+{  OCL_CHECK (clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
+}
 
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index c8110c00..c8f28426 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -869,20 +869,22 @@ template void PReLUBackward<float>(const int count, const int channels, const in
 template void PReLUBackward<double>(const int count, const int channels, const int dim, const double* top_diff, const double* bottom_data, double* bottom_diff, const double* slope_data, const int div_factor);
 
 template <typename Dtype> 
-void PReLUParamBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff){
+void PReLUParamBackward(const int count, const Dtype* top_diff, const int offset_out, const Dtype* bottom_data, const int offset_in, Dtype* bottom_diff){
     std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>();
     cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
     cl_int ret;
     ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
     ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
+    ret  = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&offset_out);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_data);
+    ret  = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&offset_in);
+    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff);
     size_t Global_Work_Size[] = {(size_t)count};
     size_t Local_Work_Size[] = {256};
     OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
-template void PReLUParamBackward<float>(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff);
-template void PReLUParamBackward<double>(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff);
+template void PReLUParamBackward<float>(const int count, const float* top_diff, const int offset_out, const float* bottom_data, const int offset_in, float* bottom_diff);
+template void PReLUParamBackward<double>(const int count, const double* top_diff, const int offset_out, const double* bottom_data, const int offset_in, double* bottom_diff);
 
 
 template <typename Dtype> 

From 454d6761dfd63f7f88648926659a581812daa8a6 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Tue, 8 Sep 2015 20:57:53 -0700
Subject: [PATCH 066/124] modify conv layers

---
 src/caffe/layers/base_conv_layer.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 26787393..f77507d9 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -292,10 +292,13 @@ template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
     const Dtype* weight, Dtype* output, bool skip_im2col) {
   cl_command_queue Queue;
+  const Dtype* col_buff = input;
   if (!is_1x1_) {
     if (!skip_im2col) {
       conv_im2col_gpu_opt(input);
     }   
+    col_buff = col_buffer_.gpu_data();
+    caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, (Dtype*)transMem);
   }
 #ifdef multiQ
     for (int g = 0; g < group_; ++g) {
@@ -363,8 +366,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
     const Dtype* weights, Dtype* input) {
   cl_command_queue Queue;
   if (is_1x1_) {
-    int count = height_ * width_ * conv_in_channels_ * opt_num2;
-    caffe_gpu_copy(count, input, (Dtype*)transMem);
+    caffe_gpu_memcpy( height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, (Dtype*)transMem);
   }
   for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
@@ -387,6 +389,8 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 
   if (!is_1x1_) {
       conv_col2im_gpu_opt(input);
+   }else{
+     caffe_gpu_memcpy( height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), (Dtype*)transMem, input);
    }
 }
 
@@ -411,10 +415,11 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
   cl_command_queue Queue;
   if (!is_1x1_) {
     conv_im2col_gpu_opt(input);
-  }
+  }else{
+    caffe_gpu_memcpy( K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, (Dtype*)transMem);
+ }
     opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
 
-
   for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
        if(g == 0) Queue = amdDevice.CommandQueue;

From c8e5b9f6d9403890426ec07ced403582cb8e19ef Mon Sep 17 00:00:00 2001
From: Yibing <yuan.gao@noplz.name>
Date: Wed, 9 Sep 2015 13:33:57 +0800
Subject: [PATCH 067/124] Pass HDF5 layers unit test

---
 src/caffe/layers/hdf5_data_layer.cpp   | 7 ++++---
 src/caffe/layers/hdf5_output_layer.cpp | 6 ++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index dda29aee..af223c0f 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -182,9 +182,10 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     }
     for (int j = 0; j < this->layer_param_.top_size(); ++j) {
       int data_dim = top[j]->count() / top[j]->shape(0);
-      caffe_copy(data_dim,
-          &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-            * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]);
+      OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[j]->mutable_gpu_data(), CL_TRUE, i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], 0, NULL, NULL) ); 
+      //caffe_copy(data_dim,
+      //    &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
+      //      * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
     }
   }
 }
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index bd608e86..e2bd8e4c 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -80,10 +80,8 @@ void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
 
   for (int i = 0; i < bottom[0]->num(); ++i) {
-    caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim],
-        &data_blob_.mutable_cpu_data()[i * data_datum_dim]);
-    caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim],
-        &label_blob_.mutable_cpu_data()[i * label_datum_dim]);
+    OCL_CHECK (clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)bottom[0]->gpu_data(), CL_TRUE, i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL));
+    OCL_CHECK (clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)bottom[1]->gpu_data(), CL_TRUE, i * label_datum_dim * sizeof(Dtype), sizeof(Dtype) * label_datum_dim, &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, NULL));
   }
   SaveBlobs();
 }

From 8f700e8c2cd4791a99d772cbd5e2061e45d3796b Mon Sep 17 00:00:00 2001
From: Yibing <yuan.gao@noplz.name>
Date: Wed, 9 Sep 2015 15:09:52 +0800
Subject: [PATCH 068/124] minor fix

---
 include/caffe/common.hpp             | 4 ++--
 src/caffe/layers/base_conv_layer.cpp | 1 +
 src/caffe/util/math_functions.cpp    | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index b84672aa..ac954a0e 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -81,10 +81,10 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-#define use_packing_scheme 1 
+#define use_packing_scheme 1
 /* global_packing_N defines packing number of the use_packing scheme
   for intial design, we use the same packing number for all conv layers*/
-#define global_packing_N 100
+#define global_packing_N 16
 /*ifdef: use multi-command queues for groups in conv layer;
  ifndef: use single commane queue for groups*/
 //#define multiQ
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index f77507d9..394fd9a5 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -298,6 +298,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
       conv_im2col_gpu_opt(input);
     }   
     col_buff = col_buffer_.gpu_data();
+   }else{
     caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, (Dtype*)transMem);
   }
 #ifdef multiQ
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index ed71edf6..80843191 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -684,12 +684,12 @@ void set_kernel(const int n, const Dtype alpha, Dtype* y) {
 }
 
 template <>
-void caffe_gpu_set(const int N, const float alpha, float* Y) {
+void caffe_gpu_set<float>(const int N, const float alpha, float* Y) {
   ocl_memset(Y, alpha, N);
 }
 
 template <>
-void caffe_gpu_set(const int N, const double alpha, double* Y) {
+void caffe_gpu_set<double>(const int N, const double alpha, double* Y) {
   ocl_memset(Y, alpha, N);
 }
 

From 8166acfa8400fe3e48dff67a596e87b6029505c7 Mon Sep 17 00:00:00 2001
From: Noplz <yuan.gao@noplz.name>
Date: Wed, 9 Sep 2015 15:49:41 +0800
Subject: [PATCH 069/124] Format the code

---
 include/caffe/blob.hpp                        |  490 +--
 include/caffe/common.hpp                      |  141 +-
 include/caffe/common_layers.hpp               | 1050 +++---
 include/caffe/data_layers.hpp                 |  574 +--
 include/caffe/data_transformer.hpp            |  258 +-
 include/caffe/device.hpp                      |   78 +-
 include/caffe/filler.hpp                      |  425 +--
 include/caffe/internal_thread.hpp             |   33 +-
 include/caffe/layer.hpp                       |  879 ++---
 include/caffe/layer_factory.hpp               |  109 +-
 include/caffe/loss_layers.hpp                 | 1074 +++---
 include/caffe/net.hpp                         |  467 +--
 include/caffe/neuron_layers.hpp               | 1336 +++----
 include/caffe/python_layer.hpp                |   93 +-
 include/caffe/solver.hpp                      |  274 +-
 include/caffe/syncedmem.hpp                   |   94 +-
 include/caffe/test/test_caffe_main.hpp        |   55 +-
 .../caffe/test/test_gradient_check_util.hpp   |  436 +--
 include/caffe/util/benchmark.hpp              |   75 +-
 include/caffe/util/cudnn.hpp                  |  212 +-
 include/caffe/util/db.hpp                     |   65 +-
 include/caffe/util/db_leveldb.hpp             |  117 +-
 include/caffe/util/db_lmdb.hpp                |  141 +-
 include/caffe/util/im2col.hpp                 |  123 +-
 include/caffe/util/insert_splits.hpp          |    8 +-
 include/caffe/util/io.hpp                     |  111 +-
 include/caffe/util/math_functions.hpp         |  210 +-
 include/caffe/util/mkl_alternate.hpp          |   16 +-
 include/caffe/util/ocl_util.hpp               |    5 +-
 include/caffe/util/ocl_wrapper.hpp            |  421 ++-
 include/caffe/util/rng.hpp                    |   33 +-
 include/caffe/util/upgrade_proto.hpp          |   10 +-
 include/caffe/vision_layers.hpp               |  942 ++---
 src/caffe/blob.cpp                            |  754 ++--
 src/caffe/common.cpp                          |  149 +-
 src/caffe/data_transformer.cpp                |  931 +++--
 src/caffe/device.cpp                          |  717 ++--
 src/caffe/internal_thread.cpp                 |   41 +-
 src/caffe/layer_factory.cpp                   |  193 +-
 src/caffe/layers/absval_layer.cpp             |   74 +-
 src/caffe/layers/accuracy_layer.cpp           |  132 +-
 src/caffe/layers/argmax_layer.cpp             |   84 +-
 src/caffe/layers/base_conv_layer.cpp          |  749 ++--
 src/caffe/layers/base_data_layer.cpp          |  165 +-
 src/caffe/layers/bnll_layer.cpp               |   89 +-
 src/caffe/layers/concat_layer.cpp             |  220 +-
 src/caffe/layers/contrastive_loss_layer.cpp   |  312 +-
 src/caffe/layers/conv_layer.cpp               |  396 +--
 src/caffe/layers/data_layer.cpp               |  186 +-
 src/caffe/layers/deconv_layer.cpp             |  209 +-
 src/caffe/layers/dropout_layer.cpp            |  180 +-
 src/caffe/layers/dummy_data_layer.cpp         |  196 +-
 src/caffe/layers/eltwise_layer.cpp            |  429 +--
 src/caffe/layers/euclidean_loss_layer.cpp     |  118 +-
 src/caffe/layers/exp_layer.cpp                |  144 +-
 src/caffe/layers/filter_layer.cpp             |  302 +-
 src/caffe/layers/flatten_layer.cpp            |   50 +-
 src/caffe/layers/hdf5_data_layer.cpp          |  337 +-
 src/caffe/layers/hdf5_output_layer.cpp        |  128 +-
 src/caffe/layers/hinge_loss_layer.cpp         |  118 +-
 src/caffe/layers/im2col_layer.cpp             |  174 +-
 src/caffe/layers/image_data_layer.cpp         |  254 +-
 src/caffe/layers/infogain_loss_layer.cpp      |  169 +-
 src/caffe/layers/inner_product_layer.cpp      |  278 +-
 src/caffe/layers/log_layer.cpp                |  210 +-
 src/caffe/layers/loss_layer.cpp               |   26 +-
 src/caffe/layers/lrn_layer.cpp                |  540 +--
 src/caffe/layers/memory_data_layer.cpp        |  182 +-
 .../multinomial_logistic_loss_layer.cpp       |   88 +-
 src/caffe/layers/mvn_layer.cpp                |  459 +--
 src/caffe/layers/neuron_layer.cpp             |    8 +-
 src/caffe/layers/pooling_layer.cpp            |  751 ++--
 src/caffe/layers/power_layer.cpp              |  294 +-
 src/caffe/layers/prelu_layer.cpp              |  356 +-
 src/caffe/layers/reduction_layer.cpp          |  362 +-
 src/caffe/layers/relu_layer.cpp               |   89 +-
 src/caffe/layers/reshape_layer.cpp            |  161 +-
 .../sigmoid_cross_entropy_loss_layer.cpp      |  145 +-
 src/caffe/layers/sigmoid_layer.cpp            |   83 +-
 src/caffe/layers/silence_layer.cpp            |   42 +-
 src/caffe/layers/slice_layer.cpp              |  196 +-
 src/caffe/layers/softmax_layer.cpp            |  249 +-
 src/caffe/layers/softmax_loss_layer.cpp       |  339 +-
 src/caffe/layers/split_layer.cpp              |  119 +-
 src/caffe/layers/spp_layer.cpp                |  321 +-
 src/caffe/layers/tanh_layer.cpp               |   82 +-
 src/caffe/layers/threshold_layer.cpp          |   43 +-
 src/caffe/layers/window_data_layer.cpp        |  783 ++--
 src/caffe/net.cpp                             | 1624 ++++-----
 src/caffe/ocl/bnll_layer.cl                   |   24 +-
 src/caffe/ocl/concat_layer.cl                 |   48 +-
 src/caffe/ocl/contrastive_loss_layer.cl       |   64 +-
 src/caffe/ocl/dropout_layer.cl                |   21 +-
 src/caffe/ocl/eltwise_layer.cl                |   72 +-
 src/caffe/ocl/im2col.cl                       |  441 ++-
 src/caffe/ocl/lrn_layer.cl                    |  194 +-
 src/caffe/ocl/pooling_layer.cl                |  460 +--
 src/caffe/ocl/prelu_layer.cl                  |   36 +-
 src/caffe/ocl/random.cl                       |  373 +-
 src/caffe/ocl/relu_layer.cl                   |   12 +-
 src/caffe/ocl/sigmoid_layer.cl                |   12 +-
 src/caffe/ocl/softmax_layer.cl                |  204 +-
 src/caffe/ocl/softmaxwithloss_layer.cl        |  120 +-
 src/caffe/ocl/tanh_layer.cl                   |   12 +-
 src/caffe/ocl/threshold_layer.cl              |    4 +-
 src/caffe/ocl/util.cl                         |  185 +-
 src/caffe/solver.cpp                          | 1373 ++++----
 src/caffe/syncedmem.cpp                       |  201 +-
 src/caffe/util/benchmark.cpp                  |  143 +-
 src/caffe/util/cudnn.cpp                      |   28 +-
 src/caffe/util/db.cpp                         |   33 +-
 src/caffe/util/db_leveldb.cpp                 |   23 +-
 src/caffe/util/db_lmdb.cpp                    |   57 +-
 src/caffe/util/im2col.cpp                     |  612 ++--
 src/caffe/util/im2col.cu                      |  201 +-
 src/caffe/util/insert_splits.cpp              |  240 +-
 src/caffe/util/io.cpp                         |  412 +--
 src/caffe/util/math_functions.cpp             | 1082 +++---
 src/caffe/util/math_functions.cu              |  511 +--
 src/caffe/util/ocl_util.cpp                   |   79 +-
 src/caffe/util/ocl_wrapper.cpp                | 3133 ++++++++++-------
 src/caffe/util/upgrade_proto.cpp              | 1740 ++++-----
 122 files changed, 19862 insertions(+), 18473 deletions(-)

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index 12854689..e55ce8e6 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -21,262 +21,282 @@ namespace caffe {
  *
  * TODO(dox): more thorough description.
  */
-template <typename Dtype>
+template<typename Dtype>
 class Blob {
- public:
-  Blob()
-       : data_(), diff_(), count_(0), capacity_(0) {}
+	public:
+		Blob()
+			: data_(), diff_(), count_(0), capacity_(0) {
+		}
 
-  /// @brief Deprecated; use <code>Blob(const vector<int>& shape)</code>.
-  explicit Blob(const int num, const int channels, const int height,
-      const int width);
-  explicit Blob(const vector<int>& shape);
+		/// @brief Deprecated; use <code>Blob(const vector<int>& shape)</code>.
+		explicit Blob(const int num, const int channels, const int height,
+			const int width);
+		explicit Blob(const vector<int>& shape);
 
-  /// @brief Deprecated; use <code>Reshape(const vector<int>& shape)</code>.
-  void Reshape(const int num, const int channels, const int height,
-      const int width);
-  /**
-   * @brief Change the dimensions of the blob, allocating new memory if
-   *        necessary.
-   *
-   * This function can be called both to create an initial allocation
-   * of memory, and to adjust the dimensions of a top blob during Layer::Reshape
-   * or Layer::Forward. When changing the size of blob, memory will only be
-   * reallocated if sufficient memory does not already exist, and excess memory
-   * will never be freed.
-   *
-   * Note that reshaping an input blob and immediately calling Net::Backward is
-   * an error; either Net::Forward or Net::Reshape need to be called to
-   * propagate the new input shape to higher layers.
-   */
-  void Reshape(const vector<int>& shape);
-  void Reshape(const BlobShape& shape);
-  void ReshapeLike(const Blob& other);
-  inline string shape_string() const {
-    ostringstream stream;
-    for (int i = 0; i < shape_.size(); ++i) {
-      stream << shape_[i] << " ";
-    }
-    stream << "(" << count_ << ")";
-    return stream.str();
-  }
-  inline const vector<int>& shape() const { return shape_; }
-  /**
-   * @brief Returns the dimension of the index-th axis (or the negative index-th
-   *        axis from the end, if index is negative).
-   *
-   * @param index the axis index, which may be negative as it will be
-   *        "canonicalized" using CanonicalAxisIndex.
-   *        Dies on out of range index.
-   */
-  inline int shape(int index) const {
-    return shape_[CanonicalAxisIndex(index)];
-  }
-  inline int num_axes() const { return shape_.size(); }
-  inline int count() const { return count_; }
+		/// @brief Deprecated; use <code>Reshape(const vector<int>& shape)</code>.
+		void Reshape(const int num, const int channels, const int height,
+			const int width);
+		/**
+		 * @brief Change the dimensions of the blob, allocating new memory if
+		 *        necessary.
+		 *
+		 * This function can be called both to create an initial allocation
+		 * of memory, and to adjust the dimensions of a top blob during Layer::Reshape
+		 * or Layer::Forward. When changing the size of blob, memory will only be
+		 * reallocated if sufficient memory does not already exist, and excess memory
+		 * will never be freed.
+		 *
+		 * Note that reshaping an input blob and immediately calling Net::Backward is
+		 * an error; either Net::Forward or Net::Reshape need to be called to
+		 * propagate the new input shape to higher layers.
+		 */
+		void Reshape(const vector<int>& shape);
+		void Reshape(const BlobShape& shape);
+		void ReshapeLike(const Blob& other);
+		inline string shape_string() const {
+			ostringstream stream;
+			for (int i = 0; i < shape_.size(); ++i) {
+				stream << shape_[i] << " ";
+			}
+			stream << "(" << count_ << ")";
+			return stream.str();
+		}
+		inline const vector<int>& shape() const {
+			return shape_;
+		}
+		/**
+		 * @brief Returns the dimension of the index-th axis (or the negative index-th
+		 *        axis from the end, if index is negative).
+		 *
+		 * @param index the axis index, which may be negative as it will be
+		 *        "canonicalized" using CanonicalAxisIndex.
+		 *        Dies on out of range index.
+		 */
+		inline int shape(int index) const {
+			return shape_[CanonicalAxisIndex(index)];
+		}
+		inline int num_axes() const {
+			return shape_.size();
+		}
+		inline int count() const {
+			return count_;
+		}
 
-  /**
-   * @brief Compute the volume of a slice; i.e., the product of dimensions
-   *        among a range of axes.
-   *
-   * @param start_axis The first axis to include in the slice.
-   *
-   * @param end_axis The first axis to exclude from the slice.
-   */
-  inline int count(int start_axis, int end_axis) const {
-    CHECK_LE(start_axis, end_axis);
-    CHECK_GE(start_axis, 0);
-    CHECK_GE(end_axis, 0);
-    CHECK_LE(start_axis, num_axes());
-    CHECK_LE(end_axis, num_axes());
-    int count = 1;
-    for (int i = start_axis; i < end_axis; ++i) {
-      count *= shape(i);
-    }
-    return count;
-  }
-  /**
-   * @brief Compute the volume of a slice spanning from a particular first
-   *        axis to the final axis.
-   *
-   * @param start_axis The first axis to include in the slice.
-   */
-  inline int count(int start_axis) const {
-    return count(start_axis, num_axes());
-  }
+		/**
+		 * @brief Compute the volume of a slice; i.e., the product of dimensions
+		 *        among a range of axes.
+		 *
+		 * @param start_axis The first axis to include in the slice.
+		 *
+		 * @param end_axis The first axis to exclude from the slice.
+		 */
+		inline int count(int start_axis, int end_axis) const {
+			CHECK_LE(start_axis, end_axis);
+			CHECK_GE(start_axis, 0);
+			CHECK_GE(end_axis, 0);
+			CHECK_LE(start_axis, num_axes());
+			CHECK_LE(end_axis, num_axes());
+			int count = 1;
+			for (int i = start_axis; i < end_axis; ++i) {
+				count *= shape(i);
+			}
+			return count;
+		}
+		/**
+		 * @brief Compute the volume of a slice spanning from a particular first
+		 *        axis to the final axis.
+		 *
+		 * @param start_axis The first axis to include in the slice.
+		 */
+		inline int count(int start_axis) const {
+			return count(start_axis, num_axes());
+		}
 
-  /**
-   * @brief Returns the 'canonical' version of a (usually) user-specified axis,
-   *        allowing for negative indexing (e.g., -1 for the last axis).
-   *
-   * @param index the axis index.
-   *        If 0 <= index < num_axes(), return index.
-   *        If -num_axes <= index <= -1, return (num_axes() - (-index)),
-   *        e.g., the last axis index (num_axes() - 1) if index == -1,
-   *        the second to last if index == -2, etc.
-   *        Dies on out of range index.
-   */
-  inline int CanonicalAxisIndex(int axis_index) const {
-    CHECK_GE(axis_index, -num_axes())
-        << "axis " << axis_index << " out of range for " << num_axes()
-        << "-D Blob with shape " << shape_string();
-    CHECK_LT(axis_index, num_axes())
-        << "axis " << axis_index << " out of range for " << num_axes()
-        << "-D Blob with shape " << shape_string();
-    if (axis_index < 0) {
-      return axis_index + num_axes();
-    }
-    return axis_index;
-  }
+		/**
+		 * @brief Returns the 'canonical' version of a (usually) user-specified axis,
+		 *        allowing for negative indexing (e.g., -1 for the last axis).
+		 *
+		 * @param index the axis index.
+		 *        If 0 <= index < num_axes(), return index.
+		 *        If -num_axes <= index <= -1, return (num_axes() - (-index)),
+		 *        e.g., the last axis index (num_axes() - 1) if index == -1,
+		 *        the second to last if index == -2, etc.
+		 *        Dies on out of range index.
+		 */
+		inline int CanonicalAxisIndex(int axis_index) const {
+			CHECK_GE(axis_index, -num_axes())
+				<< "axis " << axis_index << " out of range for " << num_axes()
+				<< "-D Blob with shape " << shape_string();
+			CHECK_LT(axis_index, num_axes())
+				<< "axis " << axis_index << " out of range for " << num_axes()
+				<< "-D Blob with shape " << shape_string();
+			if (axis_index < 0) {
+				return axis_index + num_axes();
+			}
+			return axis_index;
+		}
 
-  /// @brief Deprecated legacy shape accessor num: use shape(0) instead.
-  inline int num() const { return LegacyShape(0); }
-  /// @brief Deprecated legacy shape accessor channels: use shape(1) instead.
-  inline int channels() const { return LegacyShape(1); }
-  /// @brief Deprecated legacy shape accessor height: use shape(2) instead.
-  inline int height() const { return LegacyShape(2); }
-  /// @brief Deprecated legacy shape accessor width: use shape(3) instead.
-  inline int width() const { return LegacyShape(3); }
-  inline int LegacyShape(int index) const {
-    CHECK_LE(num_axes(), 4)
-        << "Cannot use legacy accessors on Blobs with > 4 axes.";
-    CHECK_LT(index, 4);
-    CHECK_GE(index, -4);
-    if (index >= num_axes() || index < -num_axes()) {
-      // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse
-      // indexing) -- this special case simulates the one-padding used to fill
-      // extraneous axes of legacy blobs.
-      return 1;
-    }
-    return shape(index);
-  }
+		/// @brief Deprecated legacy shape accessor num: use shape(0) instead.
+		inline int num() const {
+			return LegacyShape(0);
+		}
+		/// @brief Deprecated legacy shape accessor channels: use shape(1) instead.
+		inline int channels() const {
+			return LegacyShape(1);
+		}
+		/// @brief Deprecated legacy shape accessor height: use shape(2) instead.
+		inline int height() const {
+			return LegacyShape(2);
+		}
+		/// @brief Deprecated legacy shape accessor width: use shape(3) instead.
+		inline int width() const {
+			return LegacyShape(3);
+		}
+		inline int LegacyShape(int index) const {
+			CHECK_LE(num_axes(), 4)
+				<< "Cannot use legacy accessors on Blobs with > 4 axes.";
+			CHECK_LT(index, 4);
+			CHECK_GE(index, -4);
+			if (index >= num_axes() || index < -num_axes()) {
+				// Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse
+				// indexing) -- this special case simulates the one-padding used to fill
+				// extraneous axes of legacy blobs.
+				return 1;
+			}
+			return shape(index);
+		}
 
-  inline int offset(const int n, const int c = 0, const int h = 0,
-      const int w = 0) const {
-    CHECK_GE(n, 0);
-    CHECK_LE(n, num());
-    CHECK_GE(channels(), 0);
-    CHECK_LE(c, channels());
-    CHECK_GE(height(), 0);
-    CHECK_LE(h, height());
-    CHECK_GE(width(), 0);
-    CHECK_LE(w, width());
-    return ((n * channels() + c) * height() + h) * width() + w;
-  }
+		inline int offset(const int n, const int c = 0, const int h = 0,
+			const int w = 0) const {
+			CHECK_GE(n, 0);
+			CHECK_LE(n, num());
+			CHECK_GE(channels(), 0);
+			CHECK_LE(c, channels());
+			CHECK_GE(height(), 0);
+			CHECK_LE(h, height());
+			CHECK_GE(width(), 0);
+			CHECK_LE(w, width());
+			return ((n * channels() + c) * height() + h) * width() + w;
+		}
 
-  inline int offset(const vector<int>& indices) const {
-    CHECK_LE(indices.size(), num_axes());
-    int offset = 0;
-    for (int i = 0; i < num_axes(); ++i) {
-      offset *= shape(i);
-      if (indices.size() > i) {
-        CHECK_GE(indices[i], 0);
-        CHECK_LT(indices[i], shape(i));
-        offset += indices[i];
-      }
-    }
-    return offset;
-  }
-  /**
-   * @brief Copy from a source Blob.
-   *
-   * @param source the Blob to copy from
-   * @param copy_diff if false, copy the data; if true, copy the diff
-   * @param reshape if false, require this Blob to be pre-shaped to the shape
-   *        of other (and die otherwise); if true, Reshape this Blob to other's
-   *        shape if necessary
-   */
-  void CopyFrom(const Blob<Dtype>& source, bool copy_diff = false,
-      bool reshape = false);
+		inline int offset(const vector<int>& indices) const {
+			CHECK_LE(indices.size(), num_axes());
+			int offset = 0;
+			for (int i = 0; i < num_axes(); ++i) {
+				offset *= shape(i);
+				if (indices.size() > i) {
+					CHECK_GE(indices[i], 0);
+					CHECK_LT(indices[i], shape(i));
+					offset += indices[i];
+				}
+			}
+			return offset;
+		}
+		/**
+		 * @brief Copy from a source Blob.
+		 *
+		 * @param source the Blob to copy from
+		 * @param copy_diff if false, copy the data; if true, copy the diff
+		 * @param reshape if false, require this Blob to be pre-shaped to the shape
+		 *        of other (and die otherwise); if true, Reshape this Blob to other's
+		 *        shape if necessary
+		 */
+		void CopyFrom(const Blob<Dtype>& source, bool copy_diff = false,
+			bool reshape = false);
 
-  inline Dtype data_at(const int n, const int c, const int h,
-      const int w) const {
-    return cpu_data()[offset(n, c, h, w)];
-  }
+		inline Dtype data_at(const int n, const int c, const int h,
+			const int w) const {
+			return cpu_data()[offset(n, c, h, w)];
+		}
 
-  inline Dtype diff_at(const int n, const int c, const int h,
-      const int w) const {
-    return cpu_diff()[offset(n, c, h, w)];
-  }
+		inline Dtype diff_at(const int n, const int c, const int h,
+			const int w) const {
+			return cpu_diff()[offset(n, c, h, w)];
+		}
 
-  inline Dtype data_at(const vector<int>& index) const {
-    return cpu_data()[offset(index)];
-  }
+		inline Dtype data_at(const vector<int>& index) const {
+			return cpu_data()[offset(index)];
+		}
 
-  inline Dtype diff_at(const vector<int>& index) const {
-    return cpu_diff()[offset(index)];
-  }
+		inline Dtype diff_at(const vector<int>& index) const {
+			return cpu_diff()[offset(index)];
+		}
 
-  inline const shared_ptr<SyncedMemory>& data() const {
-    CHECK(data_);
-    return data_;
-  }
+		inline const shared_ptr<SyncedMemory>& data() const {
+			CHECK(data_);
+			return data_;
+		}
 
-  inline const shared_ptr<SyncedMemory>& diff() const {
-    CHECK(diff_);
-    return diff_;
-  }
+		inline const shared_ptr<SyncedMemory>& diff() const {
+			CHECK(diff_);
+			return diff_;
+		}
 
-  const Dtype* cpu_data() const;
-  void set_cpu_data(Dtype* data);
-  const Dtype* gpu_data() const;
-  const Dtype* gpu_cache_data() const;
-  const Dtype* cpu_diff() const;
-  const Dtype* gpu_diff() const;
-  Dtype* mutable_cpu_data();
-  Dtype* mutable_gpu_data();
-  Dtype* mutable_cpu_diff();
-  Dtype* mutable_gpu_diff();
-  void Update();
-  void FromProto(const BlobProto& proto, bool reshape = true);
-  void ToProto(BlobProto* proto, bool write_diff = false) const;
+		const Dtype* cpu_data() const;
+		void set_cpu_data(Dtype* data);
+		const Dtype* gpu_data() const;
+		const Dtype* gpu_cache_data() const;
+		const Dtype* cpu_diff() const;
+		const Dtype* gpu_diff() const;
+		Dtype* mutable_cpu_data();
+		Dtype* mutable_gpu_data();
+		Dtype* mutable_cpu_diff();
+		Dtype* mutable_gpu_diff();
+		void Update();
+		void FromProto(const BlobProto& proto, bool reshape = true);
+		void ToProto(BlobProto* proto, bool write_diff = false) const;
 
-  /// @brief Compute the sum of absolute values (L1 norm) of the data.
-  Dtype asum_data() const;
-  /// @brief Compute the sum of absolute values (L1 norm) of the diff.
-  Dtype asum_diff() const;
-  /// @brief Compute the sum of squares (L2 norm squared) of the data.
-  Dtype sumsq_data() const;
-  /// @brief Compute the sum of squares (L2 norm squared) of the diff.
-  Dtype sumsq_diff() const;
+		/// @brief Compute the sum of absolute values (L1 norm) of the data.
+		Dtype asum_data() const;
+		/// @brief Compute the sum of absolute values (L1 norm) of the diff.
+		Dtype asum_diff() const;
+		/// @brief Compute the sum of squares (L2 norm squared) of the data.
+		Dtype sumsq_data() const;
+		/// @brief Compute the sum of squares (L2 norm squared) of the diff.
+		Dtype sumsq_diff() const;
 
-  /// @brief Scale the blob data by a constant factor.
-  void scale_data(Dtype scale_factor);
-  /// @brief Scale the blob diff by a constant factor.
-  void scale_diff(Dtype scale_factor);
+		/// @brief Scale the blob data by a constant factor.
+		void scale_data(Dtype scale_factor);
+		/// @brief Scale the blob diff by a constant factor.
+		void scale_diff(Dtype scale_factor);
 
-  /**
-   * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the
-   *        data_ of Blob other -- useful in Layer%s which simply perform a copy
-   *        in their Forward pass.
-   *
-   * This deallocates the SyncedMemory holding this Blob's data_, as
-   * shared_ptr calls its destructor when reset with the "=" operator.
-   */
-  void ShareData(const Blob& other);
-  /**
-   * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the
-   *        diff_ of Blob other -- useful in Layer%s which simply perform a copy
-   *        in their Forward pass.
-   *
-   * This deallocates the SyncedMemory holding this Blob's diff_, as
-   * shared_ptr calls its destructor when reset with the "=" operator.
-   */
-  void ShareDiff(const Blob& other);
-  void set_data_layer(){data_->set_data_layer(); diff_->set_data_layer();};
+		/**
+		 * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the
+		 *        data_ of Blob other -- useful in Layer%s which simply perform a copy
+		 *        in their Forward pass.
+		 *
+		 * This deallocates the SyncedMemory holding this Blob's data_, as
+		 * shared_ptr calls its destructor when reset with the "=" operator.
+		 */
+		void ShareData(const Blob& other);
+		/**
+		 * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the
+		 *        diff_ of Blob other -- useful in Layer%s which simply perform a copy
+		 *        in their Forward pass.
+		 *
+		 * This deallocates the SyncedMemory holding this Blob's diff_, as
+		 * shared_ptr calls its destructor when reset with the "=" operator.
+		 */
+		void ShareDiff(const Blob& other);
+		void set_data_layer() {
+			data_->set_data_layer();
+			diff_->set_data_layer();
+		}
+		;
 
-  bool ShapeEquals(const BlobProto& other);
+		bool ShapeEquals(const BlobProto& other);
 
- protected:
-  shared_ptr<SyncedMemory> data_;
-  shared_ptr<SyncedMemory> diff_;
-  vector<int> shape_;
-  int count_;
-  int capacity_;
+	protected:
+		shared_ptr<SyncedMemory> data_;
+		shared_ptr<SyncedMemory> diff_;
+		vector<int> shape_;
+		int count_;
+		int capacity_;
 
-  DISABLE_COPY_AND_ASSIGN(Blob);
-};  // class Blob
+		DISABLE_COPY_AND_ASSIGN (Blob);
+};
+// class Blob
 
-}  // namespace caffe
+}// namespace caffe
 
 #endif  // CAFFE_BLOB_HPP_
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index ac954a0e..0f3a7667 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -73,23 +73,20 @@ private:\
 #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet"
 //OpenCL:  various of defines to choose the design schemes
 /* ifdef: use CPU random generator in dropout layer
-   ifndef: use GPU randome generator*/
+ ifndef: use GPU randome generator*/
 //#define use_cpu_generator_dropout
-
 //#define print_memory_trace
-
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
 #define use_packing_scheme 1
 /* global_packing_N defines packing number of the use_packing scheme
-  for intial design, we use the same packing number for all conv layers*/
+ for intial design, we use the same packing number for all conv layers*/
 #define global_packing_N 16
 /*ifdef: use multi-command queues for groups in conv layer;
  ifndef: use single commane queue for groups*/
 //#define multiQ
 //#define check_gradient
-
 // OpenCL: various checks for different function calls.
 #define OCL_CHECK(condition) \
   do { \
@@ -156,7 +153,9 @@ do{ \
 }while(0)
 
 // See PR #1236
-namespace cv { class Mat; }
+namespace cv {
+class Mat;
+}
 
 namespace caffe {
 
@@ -186,77 +185,81 @@ void GlobalInit(int* pargc, char*** pargv);
 // A singleton class to hold common caffe stuff, such as the handler that
 // caffe is going to use for cublas, curand, etc.
 class Caffe {
- public:
-  ~Caffe();
-  inline static Caffe& Get() {
-    if (!singleton_.get()) {
-      singleton_.reset(new Caffe());
-    }
-    return *singleton_;
-  }
-  enum Brew { CPU, GPU, APU };
-
-  // This random number generator facade hides boost and CUDA rng
-  // implementation from one another (for cross-platform compatibility).
-  class RNG {
-   public:
-    RNG();
-    explicit RNG(unsigned int seed);
-    explicit RNG(const RNG&);
-    RNG& operator=(const RNG&);
-    void* generator();
-   private:
-    class Generator;
-    shared_ptr<Generator> generator_;
-  };
-
-  // Getters for boost rng, curand, and cublas handles
-  inline static RNG& rng_stream() {
-    if (!Get().random_generator_) {
-      Get().random_generator_.reset(new RNG());
-    }
-    return *(Get().random_generator_);
-  }
+	public:
+		~Caffe();
+		inline static Caffe& Get() {
+			if (!singleton_.get()) {
+				singleton_.reset(new Caffe());
+			}
+			return *singleton_;
+		}
+		enum Brew {
+			CPU, GPU, APU
+		};
+
+		// This random number generator facade hides boost and CUDA rng
+		// implementation from one another (for cross-platform compatibility).
+		class RNG {
+			public:
+				RNG();
+				explicit RNG(unsigned int seed);
+				explicit RNG(const RNG&);
+				RNG& operator=(const RNG&);
+				void* generator();
+				private:
+				class Generator;
+				shared_ptr<Generator> generator_;
+		};
+
+		// Getters for boost rng, curand, and cublas handles
+		inline static RNG& rng_stream() {
+			if (!Get().random_generator_) {
+				Get().random_generator_.reset(new RNG());
+			}
+			return *(Get().random_generator_);
+		}
 #ifndef CPU_ONLY
-  //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
-  //inline static curandGenerator_t curand_generator() {
-  //  return Get().curand_generator_;
-  //}
+		//inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
+		//inline static curandGenerator_t curand_generator() {
+		//  return Get().curand_generator_;
+		//}
 #endif
 
-  // Returns the mode: running on CPU or GPU.
-  inline static Brew mode() { return Get().mode_; }
-  // The setters for the variables
-  // Sets the mode. It is recommended that you don't change the mode halfway
-  // into the program since that may cause allocation of pinned memory being
-  // freed in a non-pinned way, which may cause problems - I haven't verified
-  // it personally but better to note it here in the header file.
-  inline static void set_mode(Brew mode) { 
-    Get().mode_ = mode;
-  }
-  // Sets the random seed of both boost and curand
-  static void set_random_seed(const unsigned int seed);
-  // Sets the device. Since we have cublas and curand stuff, set device also
-  // requires us to reset those values.
-  static void SetDevice(const int device_id);
-  // Prints the current GPU status.
-  static void DeviceQuery();
-
- protected:
+		// Returns the mode: running on CPU or GPU.
+		inline static Brew mode() {
+			return Get().mode_;
+		}
+		// The setters for the variables
+		// Sets the mode. It is recommended that you don't change the mode halfway
+		// into the program since that may cause allocation of pinned memory being
+		// freed in a non-pinned way, which may cause problems - I haven't verified
+		// it personally but better to note it here in the header file.
+		inline static void set_mode(Brew mode) {
+			Get().mode_ = mode;
+		}
+		// Sets the random seed of both boost and curand
+		static void set_random_seed(const unsigned int seed);
+		// Sets the device. Since we have cublas and curand stuff, set device also
+		// requires us to reset those values.
+		static void SetDevice(const int device_id);
+		// Prints the current GPU status.
+		static void DeviceQuery();
+
+	protected:
 #ifndef CPU_ONLY
-  //cublasHandle_t cublas_handle_;
-  //curandGenerator_t curand_generator_;
+		//cublasHandle_t cublas_handle_;
+		//curandGenerator_t curand_generator_;
 #endif
-  shared_ptr<RNG> random_generator_;
+		shared_ptr<RNG> random_generator_;
 
-  Brew mode_;
-  static shared_ptr<Caffe> singleton_;
+		Brew mode_;
+		static shared_ptr<Caffe> singleton_;
 
- private:
-  // The private constructor to avoid duplicate instantiation.
-  Caffe();
+	private:
+		// The private constructor to avoid duplicate instantiation.
+		Caffe();
 
-  DISABLE_COPY_AND_ASSIGN(Caffe);
+	DISABLE_COPY_AND_ASSIGN(Caffe);
 };
 
 }  // namespace caffe
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index eb77e762..879e84e7 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -25,122 +25,136 @@ namespace caffe {
  *
  * NOTE: does not implement Backwards operation.
  */
-template <typename Dtype>
-class ArgMaxLayer : public Layer<Dtype> {
- public:
-  /**
-   * @param param provides ArgMaxParameter argmax_param,
-   *     with ArgMaxLayer options:
-   *   - top_k (\b optional uint, default 1).
-   *     the number @f$ K @f$ of maximal items to output.
-   *   - out_max_val (\b optional bool, default false).
-   *     if set, output a vector of pairs (max_ind, max_val) for each image.
-   */
-  explicit ArgMaxLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "ArgMax"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val
-   *      @f$ (N \times 2 \times K \times 1) @f$
-   *      the computed outputs @f$
-   *       y_n = \arg\max\limits_i x_{ni}
-   *      @f$ (for @f$ K = 1 @f$).
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  /// @brief Not implemented (non-differentiable function)
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-    NOT_IMPLEMENTED;
-  }
-  bool out_max_val_;
-  size_t top_k_;
+template<typename Dtype>
+class ArgMaxLayer: public Layer<Dtype> {
+	public:
+		/**
+		 * @param param provides ArgMaxParameter argmax_param,
+		 *     with ArgMaxLayer options:
+		 *   - top_k (\b optional uint, default 1).
+		 *     the number @f$ K @f$ of maximal items to output.
+		 *   - out_max_val (\b optional bool, default false).
+		 *     if set, output a vector of pairs (max_ind, max_val) for each image.
+		 */
+		explicit ArgMaxLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "ArgMax";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val
+		 *      @f$ (N \times 2 \times K \times 1) @f$
+		 *      the computed outputs @f$
+		 *       y_n = \arg\max\limits_i x_{ni}
+		 *      @f$ (for @f$ K = 1 @f$).
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		/// @brief Not implemented (non-differentiable function)
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+			NOT_IMPLEMENTED;
+		}
+		bool out_max_val_;
+		size_t top_k_;
 };
 
 /**
  * @brief Takes at least two Blob%s and concatenates them along either the num
  *        or channel dimension, outputting the result.
  */
-template <typename Dtype>
-class ConcatLayer : public Layer<Dtype> {
- public:
-  explicit ConcatLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Concat"; }
-  virtual inline int MinBottomBlobs() const { return 2; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 2+)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x_1 @f$
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x_2 @f$
-   *   -# ...
-   *   - K @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x_K @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
-   *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
-   *      the concatenated output @f$
-   *        y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}]
-   *      @f$
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the concatenate inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *        respect to the outputs
-   *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
-   *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to concatenated outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length K), into which the top gradient
-   *        @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the
-   *        inputs @f$
-   *        \left[ \begin{array}{cccc}
-   *          \frac{\partial E}{\partial x_1} &
-   *          \frac{\partial E}{\partial x_2} &
-   *          ... &
-   *          \frac{\partial E}{\partial x_K}
-   *        \end{array} \right] =
-   *        \frac{\partial E}{\partial y}
-   *        @f$
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int count_;
-  int num_concats_;
-  int concat_input_size_;
-  int concat_axis_;
+template<typename Dtype>
+class ConcatLayer: public Layer<Dtype> {
+	public:
+		explicit ConcatLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Concat";
+		}
+		virtual inline int MinBottomBlobs() const {
+			return 2;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 2+)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x_1 @f$
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x_2 @f$
+		 *   -# ...
+		 *   - K @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x_K @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
+		 *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
+		 *      the concatenated output @f$
+		 *        y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}]
+		 *      @f$
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the concatenate inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *        respect to the outputs
+		 *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
+		 *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to concatenated outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length K), into which the top gradient
+		 *        @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the
+		 *        inputs @f$
+		 *        \left[ \begin{array}{cccc}
+		 *          \frac{\partial E}{\partial x_1} &
+		 *          \frac{\partial E}{\partial x_2} &
+		 *          ... &
+		 *          \frac{\partial E}{\partial x_K}
+		 *        \end{array} \right] =
+		 *        \frac{\partial E}{\partial y}
+		 *        @f$
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int count_;
+		int num_concats_;
+		int concat_input_size_;
+		int concat_axis_;
 };
 
 /**
@@ -149,35 +163,42 @@ class ConcatLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class EltwiseLayer : public Layer<Dtype> {
- public:
-  explicit EltwiseLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Eltwise"; }
-  virtual inline int MinBottomBlobs() const { return 2; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  EltwiseParameter_EltwiseOp op_;
-  vector<Dtype> coeffs_;
-  Blob<int> max_idx_;
-
-  bool stable_prod_grad_;
+template<typename Dtype>
+class EltwiseLayer: public Layer<Dtype> {
+	public:
+		explicit EltwiseLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Eltwise";
+		}
+		virtual inline int MinBottomBlobs() const {
+			return 2;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		EltwiseParameter_EltwiseOp op_;
+		vector<Dtype> coeffs_;
+		Blob<int> max_idx_;
+
+		bool stable_prod_grad_;
 };
 
 /**
@@ -186,61 +207,68 @@ class EltwiseLayer : public Layer<Dtype> {
  * the corresponding item has to be filtered, non-zero means that corresponding
  * item needs to stay).
  */
-template <typename Dtype>
-class FilterLayer : public Layer<Dtype> {
- public:
-  explicit FilterLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Filter"; }
-  virtual inline int MinBottomBlobs() const { return 2; }
-  virtual inline int MinTopBlobs() const { return 1; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 2+)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs to be filtered @f$ x_1 @f$
-   *   -# ...
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs to be filtered @f$ x_K @f$
-   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-   *      the selector blob
-   * @param top output Blob vector (length 1+)
-   *   -# @f$ (S \times C \times H \times W) @f$ ()
-   *        the filtered output @f$ x_1 @f$
-   *        where S is the number of items
-   *        that haven't been filtered
-   *      @f$ (S \times C \times H \times W) @f$
-   *        the filtered output @f$ x_K @f$
-   *        where S is the number of items
-   *        that haven't been filtered
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the forwarded inputs.
-   *
-   * @param top output Blob vector (length 1+), providing the error gradient with
-   *        respect to the outputs
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 2+), into which the top error
-   *        gradient is copied
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool first_reshape_;
-  vector<int> indices_to_forward_;
+template<typename Dtype>
+class FilterLayer: public Layer<Dtype> {
+	public:
+		explicit FilterLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Filter";
+		}
+		virtual inline int MinBottomBlobs() const {
+			return 2;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 2+)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs to be filtered @f$ x_1 @f$
+		 *   -# ...
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs to be filtered @f$ x_K @f$
+		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+		 *      the selector blob
+		 * @param top output Blob vector (length 1+)
+		 *   -# @f$ (S \times C \times H \times W) @f$ ()
+		 *        the filtered output @f$ x_1 @f$
+		 *        where S is the number of items
+		 *        that haven't been filtered
+		 *      @f$ (S \times C \times H \times W) @f$
+		 *        the filtered output @f$ x_K @f$
+		 *        where S is the number of items
+		 *        that haven't been filtered
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the forwarded inputs.
+		 *
+		 * @param top output Blob vector (length 1+), providing the error gradient with
+		 *        respect to the outputs
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 2+), into which the top error
+		 *        gradient is copied
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		bool first_reshape_;
+		vector<int> indices_to_forward_;
 };
 
 /**
@@ -253,41 +281,48 @@ class FilterLayer : public Layer<Dtype> {
  * and in Backward, the diff pointer of the bottom Blob to that of the top Blob
  * (see Blob::ShareDiff).
  */
-template <typename Dtype>
-class FlattenLayer : public Layer<Dtype> {
- public:
-  explicit FlattenLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Flatten"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 2+)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times CHW \times 1 \times 1) @f$
-   *      the outputs -- i.e., the (virtually) copied, flattened inputs
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the concatenate inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *        respect to the outputs
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length K), into which the top error
-   *        gradient is (virtually) copied
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class FlattenLayer: public Layer<Dtype> {
+	public:
+		explicit FlattenLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Flatten";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 2+)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times CHW \times 1 \times 1) @f$
+		 *      the outputs -- i.e., the (virtually) copied, flattened inputs
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the concatenate inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *        respect to the outputs
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length K), into which the top error
+		 *        gradient is (virtually) copied
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -296,35 +331,42 @@ class FlattenLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class InnerProductLayer : public Layer<Dtype> {
- public:
-  explicit InnerProductLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "InnerProduct"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int M_;
-  int K_;
-  int N_;
-  bool bias_term_;
-  Blob<Dtype> bias_multiplier_;
+template<typename Dtype>
+class InnerProductLayer: public Layer<Dtype> {
+	public:
+		explicit InnerProductLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "InnerProduct";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int M_;
+		int K_;
+		int N_;
+		bool bias_term_;
+		Blob<Dtype> bias_multiplier_;
 };
 
 /**
@@ -332,33 +374,40 @@ class InnerProductLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class MVNLayer : public Layer<Dtype> {
- public:
-  explicit MVNLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "MVN"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  Blob<Dtype> mean_, variance_, temp_;
-
-  /// sum_multiplier is used to carry out sum using BLAS
-  Blob<Dtype> sum_multiplier_;
-  Dtype eps_;
+template<typename Dtype>
+class MVNLayer: public Layer<Dtype> {
+	public:
+		explicit MVNLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "MVN";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		Blob<Dtype> mean_, variance_, temp_;
+
+		/// sum_multiplier is used to carry out sum using BLAS
+		Blob<Dtype> sum_multiplier_;
+		Dtype eps_;
 };
 
 /*
@@ -367,36 +416,47 @@ class MVNLayer : public Layer<Dtype> {
  * Note: similarly to FlattenLayer, this layer does not change the input values
  * (see FlattenLayer, Blob::ShareData and Blob::ShareDiff).
  */
-template <typename Dtype>
-class ReshapeLayer : public Layer<Dtype> {
- public:
-  explicit ReshapeLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Reshape"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
-
-  /// @brief vector of axes indices whose dimensions we'll copy from the bottom
-  vector<int> copy_axes_;
-  /// @brief the index of the axis whose dimension we infer, or -1 if none
-  int inferred_axis_;
-  /// @brief the product of the "constant" output dimensions
-  int constant_count_;
+template<typename Dtype>
+class ReshapeLayer: public Layer<Dtype> {
+	public:
+		explicit ReshapeLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Reshape";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		}
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		}
+
+		/// @brief vector of axes indices whose dimensions we'll copy from the bottom
+		vector<int> copy_axes_;
+		/// @brief the index of the axis whose dimension we infer, or -1 if none
+		int inferred_axis_;
+		/// @brief the product of the "constant" output dimensions
+		int constant_count_;
 };
 
 /**
@@ -406,71 +466,87 @@ class ReshapeLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class ReductionLayer : public Layer<Dtype> {
- public:
-  explicit ReductionLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Reduction"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  /// @brief the reduction operation performed by the layer
-  ReductionParameter_ReductionOp op_;
-  /// @brief a scalar coefficient applied to all outputs
-  Dtype coeff_;
-  /// @brief the index of the first input axis to reduce
-  int axis_;
-  /// @brief the number of reductions performed
-  int num_;
-  /// @brief the input size of each reduction
-  int dim_;
-  /// @brief a helper Blob used for summation (op_ == SUM)
-  Blob<Dtype> sum_multiplier_;
+template<typename Dtype>
+class ReductionLayer: public Layer<Dtype> {
+	public:
+		explicit ReductionLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Reduction";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		/// @brief the reduction operation performed by the layer
+		ReductionParameter_ReductionOp op_;
+		/// @brief a scalar coefficient applied to all outputs
+		Dtype coeff_;
+		/// @brief the index of the first input axis to reduce
+		int axis_;
+		/// @brief the number of reductions performed
+		int num_;
+		/// @brief the input size of each reduction
+		int dim_;
+		/// @brief a helper Blob used for summation (op_ == SUM)
+		Blob<Dtype> sum_multiplier_;
 };
 
 /**
  * @brief Ignores bottom blobs while producing no top blobs. (This is useful
  *        to suppress outputs during testing.)
  */
-template <typename Dtype>
-class SilenceLayer : public Layer<Dtype> {
- public:
-  explicit SilenceLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-
-  virtual inline const char* type() const { return "Silence"; }
-  virtual inline int MinBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 0; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-  // We can't define Forward_gpu here, since STUB_GPU will provide
-  // its own definition for CPU_ONLY mode.
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class SilenceLayer: public Layer<Dtype> {
+	public:
+		explicit SilenceLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+
+		virtual inline const char* type() const {
+			return "Silence";
+		}
+		virtual inline int MinBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 0;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+		// We can't define Forward_gpu here, since STUB_GPU will provide
+		// its own definition for CPU_ONLY mode.
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -478,37 +554,43 @@ class SilenceLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class SoftmaxLayer : public Layer<Dtype> {
- public:
-  explicit SoftmaxLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {
-  }
-  ~SoftmaxLayer();
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Softmax"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int outer_num_;
-  int inner_num_;
-  int softmax_axis_;
-  /// sum_multiplier is used to carry out sum using BLAS
-  Blob<Dtype> sum_multiplier_;
-  /// scale is an intermediate Blob to hold temporary results.
-  Blob<Dtype> scale_;
+template<typename Dtype>
+class SoftmaxLayer: public Layer<Dtype> {
+	public:
+		explicit SoftmaxLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		~SoftmaxLayer();
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Softmax";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int outer_num_;
+		int inner_num_;
+		int softmax_axis_;
+		/// sum_multiplier is used to carry out sum using BLAS
+		Blob<Dtype> sum_multiplier_;
+		/// scale is an intermediate Blob to hold temporary results.
+		Blob<Dtype> scale_;
 };
 
 #ifdef USE_CUDNN
@@ -518,25 +600,25 @@ class SoftmaxLayer : public Layer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
- public:
-  explicit CuDNNSoftmaxLayer(const LayerParameter& param)
-      : SoftmaxLayer<Dtype>(param), handles_setup_(false) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual ~CuDNNSoftmaxLayer();
-
- protected:
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool handles_setup_;
-  cudnnHandle_t             handle_;
-  cudnnTensorDescriptor_t bottom_desc_;
-  cudnnTensorDescriptor_t top_desc_;
+	public:
+	explicit CuDNNSoftmaxLayer(const LayerParameter& param)
+	: SoftmaxLayer<Dtype>(param), handles_setup_(false) {}
+	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual ~CuDNNSoftmaxLayer();
+
+	protected:
+	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+	bool handles_setup_;
+	cudnnHandle_t handle_;
+	cudnnTensorDescriptor_t bottom_desc_;
+	cudnnTensorDescriptor_t top_desc_;
 };
 #endif
 
@@ -546,30 +628,37 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class SplitLayer : public Layer<Dtype> {
- public:
-  explicit SplitLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Split"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int MinTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int count_;
-  cl_kernel gpu_add_kernel;
+template<typename Dtype>
+class SplitLayer: public Layer<Dtype> {
+	public:
+		explicit SplitLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Split";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int count_;
+		cl_kernel gpu_add_kernel;
 };
 
 /**
@@ -578,35 +667,42 @@ class SplitLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class SliceLayer : public Layer<Dtype> {
- public:
-  explicit SliceLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Slice"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int MinTopBlobs() const { return 2; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int count_;
-  int num_slices_;
-  int slice_size_;
-  int slice_axis_;
-  vector<int> slice_point_;
+template<typename Dtype>
+class SliceLayer: public Layer<Dtype> {
+	public:
+		explicit SliceLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Slice";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 2;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int count_;
+		int num_slices_;
+		int slice_size_;
+		int slice_axis_;
+		vector<int> slice_point_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp
index 3958cb7e..442e4009 100644
--- a/include/caffe/data_layers.hpp
+++ b/include/caffe/data_layers.hpp
@@ -24,79 +24,94 @@ namespace caffe {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template <typename Dtype>
-class BaseDataLayer : public Layer<Dtype> {
- public:
-  explicit BaseDataLayer(const LayerParameter& param);
-  // LayerSetUp: implements common data layer setup functionality, and calls
-  // DataLayerSetUp to do special data layer setup for individual layer types.
-  // This method may not be overridden except by the BasePrefetchingDataLayer.
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-  // Data layers have no bottoms, so reshaping is trivial.
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
-
- protected:
-  TransformationParameter transform_param_;
-  shared_ptr<DataTransformer<Dtype> > data_transformer_;
-  bool output_labels_;
+template<typename Dtype>
+class BaseDataLayer: public Layer<Dtype> {
+	public:
+		explicit BaseDataLayer(const LayerParameter& param);
+		// LayerSetUp: implements common data layer setup functionality, and calls
+		// DataLayerSetUp to do special data layer setup for individual layer types.
+		// This method may not be overridden except by the BasePrefetchingDataLayer.
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+		// Data layers have no bottoms, so reshaping is trivial.
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		}
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		}
+
+	protected:
+		TransformationParameter transform_param_;
+		shared_ptr<DataTransformer<Dtype> > data_transformer_;
+		bool output_labels_;
 };
 
-template <typename Dtype>
-class BasePrefetchingDataLayer :
-    public BaseDataLayer<Dtype>, public InternalThread {
- public:
-  explicit BasePrefetchingDataLayer(const LayerParameter& param)
-      : BaseDataLayer<Dtype>(param) {}
-  // LayerSetUp: implements common data layer setup functionality, and calls
-  // DataLayerSetUp to do special data layer setup for individual layer types.
-  // This method may not be overridden.
-  void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual void CreatePrefetchThread();
-  virtual void JoinPrefetchThread();
-  // The thread's function
-  virtual void InternalThreadEntry() {}
-
- protected:
-  Blob<Dtype> prefetch_data_;
-  Blob<Dtype> prefetch_label_;
-  Blob<Dtype> transformed_data_;
+template<typename Dtype>
+class BasePrefetchingDataLayer:
+	public BaseDataLayer<Dtype>, public InternalThread {
+	public:
+		explicit BasePrefetchingDataLayer(const LayerParameter& param)
+			: BaseDataLayer<Dtype>(param) {
+		}
+		// LayerSetUp: implements common data layer setup functionality, and calls
+		// DataLayerSetUp to do special data layer setup for individual layer types.
+		// This method may not be overridden.
+		void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual void CreatePrefetchThread();
+		virtual void JoinPrefetchThread();
+		// The thread's function
+		virtual void InternalThreadEntry() {
+		}
+
+	protected:
+		Blob<Dtype> prefetch_data_;
+		Blob<Dtype> prefetch_label_;
+		Blob<Dtype> transformed_data_;
 };
 
-template <typename Dtype>
-class DataLayer : public BasePrefetchingDataLayer<Dtype> {
- public:
-  explicit DataLayer(const LayerParameter& param)
-      : BasePrefetchingDataLayer<Dtype>(param) {}
-  virtual ~DataLayer();
-  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Data"; }
-  virtual inline int ExactNumBottomBlobs() const { return 0; }
-  virtual inline int MinTopBlobs() const { return 1; }
-  virtual inline int MaxTopBlobs() const { return 2; }
-
- protected:
-  virtual void InternalThreadEntry();
-
-  shared_ptr<db::DB> db_;
-  shared_ptr<db::Cursor> cursor_;
+template<typename Dtype>
+class DataLayer: public BasePrefetchingDataLayer<Dtype> {
+	public:
+		explicit DataLayer(const LayerParameter& param)
+			: BasePrefetchingDataLayer<Dtype>(param) {
+		}
+		virtual ~DataLayer();
+		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Data";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 0;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+		virtual inline int MaxTopBlobs() const {
+			return 2;
+		}
+
+	protected:
+		virtual void InternalThreadEntry();
+
+		shared_ptr<db::DB> db_;
+		shared_ptr<db::Cursor> cursor_;
 };
 
 /**
@@ -104,31 +119,41 @@ class DataLayer : public BasePrefetchingDataLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template <typename Dtype>
-class DummyDataLayer : public Layer<Dtype> {
- public:
-  explicit DummyDataLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  // Data layers have no bottoms, so reshaping is trivial.
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-
-  virtual inline const char* type() const { return "DummyData"; }
-  virtual inline int ExactNumBottomBlobs() const { return 0; }
-  virtual inline int MinTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
-
-  vector<shared_ptr<Filler<Dtype> > > fillers_;
-  vector<bool> refill_;
+template<typename Dtype>
+class DummyDataLayer: public Layer<Dtype> {
+	public:
+		explicit DummyDataLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		// Data layers have no bottoms, so reshaping is trivial.
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+
+		virtual inline const char* type() const {
+			return "DummyData";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 0;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		}
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		}
+
+		vector<shared_ptr<Filler<Dtype> > > fillers_;
+		vector<bool> refill_;
 };
 
 /**
@@ -136,40 +161,50 @@ class DummyDataLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template <typename Dtype>
-class HDF5DataLayer : public Layer<Dtype> {
- public:
-  explicit HDF5DataLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual ~HDF5DataLayer();
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  // Data layers have no bottoms, so reshaping is trivial.
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-
-  virtual inline const char* type() const { return "HDF5Data"; }
-  virtual inline int ExactNumBottomBlobs() const { return 0; }
-  virtual inline int MinTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
-  virtual void LoadHDF5FileData(const char* filename);
-
-  std::vector<std::string> hdf_filenames_;
-  unsigned int num_files_;
-  unsigned int current_file_;
-  hsize_t current_row_;
-  std::vector<shared_ptr<Blob<Dtype> > > hdf_blobs_;
-  std::vector<unsigned int> data_permutation_;
-  std::vector<unsigned int> file_permutation_;
+template<typename Dtype>
+class HDF5DataLayer: public Layer<Dtype> {
+	public:
+		explicit HDF5DataLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual ~HDF5DataLayer();
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		// Data layers have no bottoms, so reshaping is trivial.
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+
+		virtual inline const char* type() const {
+			return "HDF5Data";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 0;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		}
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		}
+		virtual void LoadHDF5FileData(const char* filename);
+
+		std::vector<std::string> hdf_filenames_;
+		unsigned int num_files_;
+		unsigned int current_file_;
+		hsize_t current_row_;
+		std::vector<shared_ptr<Blob<Dtype> > > hdf_blobs_;
+		std::vector<unsigned int> data_permutation_;
+		std::vector<unsigned int> file_permutation_;
 };
 
 /**
@@ -177,41 +212,51 @@ class HDF5DataLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template <typename Dtype>
-class HDF5OutputLayer : public Layer<Dtype> {
- public:
-  explicit HDF5OutputLayer(const LayerParameter& param)
-      : Layer<Dtype>(param), file_opened_(false) {}
-  virtual ~HDF5OutputLayer();
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  // Data layers have no bottoms, so reshaping is trivial.
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-
-  virtual inline const char* type() const { return "HDF5Output"; }
-  // TODO: no limit on the number of blobs
-  virtual inline int ExactNumBottomBlobs() const { return 2; }
-  virtual inline int ExactNumTopBlobs() const { return 0; }
-
-  inline std::string file_name() const { return file_name_; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void SaveBlobs();
-
-  bool file_opened_;
-  std::string file_name_;
-  hid_t file_id_;
-  Blob<Dtype> data_blob_;
-  Blob<Dtype> label_blob_;
+template<typename Dtype>
+class HDF5OutputLayer: public Layer<Dtype> {
+	public:
+		explicit HDF5OutputLayer(const LayerParameter& param)
+			: Layer<Dtype>(param), file_opened_(false) {
+		}
+		virtual ~HDF5OutputLayer();
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		// Data layers have no bottoms, so reshaping is trivial.
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+
+		virtual inline const char* type() const {
+			return "HDF5Output";
+		}
+		// TODO: no limit on the number of blobs
+		virtual inline int ExactNumBottomBlobs() const {
+			return 2;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 0;
+		}
+
+		inline std::string file_name() const {
+			return file_name_;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void SaveBlobs();
+
+		bool file_opened_;
+		std::string file_name_;
+		hid_t file_id_;
+		Blob<Dtype> data_blob_;
+		Blob<Dtype> label_blob_;
 };
 
 /**
@@ -219,26 +264,33 @@ class HDF5OutputLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template <typename Dtype>
-class ImageDataLayer : public BasePrefetchingDataLayer<Dtype> {
- public:
-  explicit ImageDataLayer(const LayerParameter& param)
-      : BasePrefetchingDataLayer<Dtype>(param) {}
-  virtual ~ImageDataLayer();
-  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "ImageData"; }
-  virtual inline int ExactNumBottomBlobs() const { return 0; }
-  virtual inline int ExactNumTopBlobs() const { return 2; }
-
- protected:
-  shared_ptr<Caffe::RNG> prefetch_rng_;
-  virtual void ShuffleImages();
-  virtual void InternalThreadEntry();
-
-  vector<std::pair<std::string, int> > lines_;
-  int lines_id_;
+template<typename Dtype>
+class ImageDataLayer: public BasePrefetchingDataLayer<Dtype> {
+	public:
+		explicit ImageDataLayer(const LayerParameter& param)
+			: BasePrefetchingDataLayer<Dtype>(param) {
+		}
+		virtual ~ImageDataLayer();
+		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "ImageData";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 0;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 2;
+		}
+
+	protected:
+		shared_ptr<Caffe::RNG> prefetch_rng_;
+		virtual void ShuffleImages();
+		virtual void InternalThreadEntry();
+
+		vector<std::pair<std::string, int> > lines_;
+		int lines_id_;
 };
 
 /**
@@ -246,44 +298,59 @@ class ImageDataLayer : public BasePrefetchingDataLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template <typename Dtype>
-class MemoryDataLayer : public BaseDataLayer<Dtype> {
- public:
-  explicit MemoryDataLayer(const LayerParameter& param)
-      : BaseDataLayer<Dtype>(param), has_new_data_(false) {}
-  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "MemoryData"; }
-  virtual inline int ExactNumBottomBlobs() const { return 0; }
-  virtual inline int ExactNumTopBlobs() const { return 2; }
-
-  virtual void AddDatumVector(const vector<Datum>& datum_vector);
-  virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
-      const vector<int>& labels);
-
-  // Reset should accept const pointers, but can't, because the memory
-  //  will be given to Blob, which is mutable
-  void Reset(Dtype* data, Dtype* label, int n);
-  void set_batch_size(int new_size);
-
-  int batch_size() { return batch_size_; }
-  int channels() { return channels_; }
-  int height() { return height_; }
-  int width() { return width_; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  int batch_size_, channels_, height_, width_, size_;
-  Dtype* data_;
-  Dtype* labels_;
-  int n_;
-  size_t pos_;
-  Blob<Dtype> added_data_;
-  Blob<Dtype> added_label_;
-  bool has_new_data_;
+template<typename Dtype>
+class MemoryDataLayer: public BaseDataLayer<Dtype> {
+	public:
+		explicit MemoryDataLayer(const LayerParameter& param)
+			: BaseDataLayer<Dtype>(param), has_new_data_(false) {
+		}
+		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "MemoryData";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 0;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 2;
+		}
+
+		virtual void AddDatumVector(const vector<Datum>& datum_vector);
+		virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
+			const vector<int>& labels);
+
+		// Reset should accept const pointers, but can't, because the memory
+		//  will be given to Blob, which is mutable
+		void Reset(Dtype* data, Dtype* label, int n);
+		void set_batch_size(int new_size);
+
+		int batch_size() {
+			return batch_size_;
+		}
+		int channels() {
+			return channels_;
+		}
+		int height() {
+			return height_;
+		}
+		int width() {
+			return width_;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		int batch_size_, channels_, height_, width_, size_;
+		Dtype* data_;
+		Dtype* labels_;
+		int n_;
+		size_t pos_;
+		Blob<Dtype> added_data_;
+		Blob<Dtype> added_label_;
+		bool has_new_data_;
 };
 
 /**
@@ -292,34 +359,43 @@ class MemoryDataLayer : public BaseDataLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template <typename Dtype>
-class WindowDataLayer : public BasePrefetchingDataLayer<Dtype> {
- public:
-  explicit WindowDataLayer(const LayerParameter& param)
-      : BasePrefetchingDataLayer<Dtype>(param) {}
-  virtual ~WindowDataLayer();
-  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "WindowData"; }
-  virtual inline int ExactNumBottomBlobs() const { return 0; }
-  virtual inline int ExactNumTopBlobs() const { return 2; }
-
- protected:
-  virtual unsigned int PrefetchRand();
-  virtual void InternalThreadEntry();
-
-  shared_ptr<Caffe::RNG> prefetch_rng_;
-  vector<std::pair<std::string, vector<int> > > image_database_;
-  enum WindowField { IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM };
-  vector<vector<float> > fg_windows_;
-  vector<vector<float> > bg_windows_;
-  Blob<Dtype> data_mean_;
-  vector<Dtype> mean_values_;
-  bool has_mean_file_;
-  bool has_mean_values_;
-  bool cache_images_;
-  vector<std::pair<std::string, Datum > > image_database_cache_;
+template<typename Dtype>
+class WindowDataLayer: public BasePrefetchingDataLayer<Dtype> {
+	public:
+		explicit WindowDataLayer(const LayerParameter& param)
+			: BasePrefetchingDataLayer<Dtype>(param) {
+		}
+		virtual ~WindowDataLayer();
+		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "WindowData";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 0;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 2;
+		}
+
+	protected:
+		virtual unsigned int PrefetchRand();
+		virtual void InternalThreadEntry();
+
+		shared_ptr<Caffe::RNG> prefetch_rng_;
+		vector<std::pair<std::string, vector<int> > > image_database_;
+		enum WindowField {
+			IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM
+		};
+		vector<vector<float> > fg_windows_;
+		vector<vector<float> > bg_windows_;
+		Blob<Dtype> data_mean_;
+		vector<Dtype> mean_values_;
+		bool has_mean_file_;
+		bool has_mean_values_;
+		bool cache_images_;
+		vector<std::pair<std::string, Datum> > image_database_cache_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp
index 0ad68c80..94c32366 100644
--- a/include/caffe/data_transformer.hpp
+++ b/include/caffe/data_transformer.hpp
@@ -13,136 +13,136 @@ namespace caffe {
  * @brief Applies common transformations to the input data, such as
  * scaling, mirroring, substracting the image mean...
  */
-template <typename Dtype>
+template<typename Dtype>
 class DataTransformer {
- public:
-  explicit DataTransformer(const TransformationParameter& param, Phase phase);
-  virtual ~DataTransformer() {}
-
-  /**
-   * @brief Initialize the Random number generations if needed by the
-   *    transformation.
-   */
-  void InitRand();
-
-  /**
-   * @brief Applies the transformation defined in the data layer's
-   * transform_param block to the data.
-   *
-   * @param datum
-   *    Datum containing the data to be transformed.
-   * @param transformed_blob
-   *    This is destination blob. It can be part of top blob's data if
-   *    set_cpu_data() is used. See data_layer.cpp for an example.
-   */
-  void Transform(const Datum& datum, Blob<Dtype>* transformed_blob);
-
-  /**
-   * @brief Applies the transformation defined in the data layer's
-   * transform_param block to a vector of Datum.
-   *
-   * @param datum_vector
-   *    A vector of Datum containing the data to be transformed.
-   * @param transformed_blob
-   *    This is destination blob. It can be part of top blob's data if
-   *    set_cpu_data() is used. See memory_layer.cpp for an example.
-   */
-  void Transform(const vector<Datum> & datum_vector,
-                Blob<Dtype>* transformed_blob);
-
-  /**
-   * @brief Applies the transformation defined in the data layer's
-   * transform_param block to a vector of Mat.
-   *
-   * @param mat_vector
-   *    A vector of Mat containing the data to be transformed.
-   * @param transformed_blob
-   *    This is destination blob. It can be part of top blob's data if
-   *    set_cpu_data() is used. See memory_layer.cpp for an example.
-   */
-  void Transform(const vector<cv::Mat> & mat_vector,
-                Blob<Dtype>* transformed_blob);
-
-  /**
-   * @brief Applies the transformation defined in the data layer's
-   * transform_param block to a cv::Mat
-   *
-   * @param cv_img
-   *    cv::Mat containing the data to be transformed.
-   * @param transformed_blob
-   *    This is destination blob. It can be part of top blob's data if
-   *    set_cpu_data() is used. See image_data_layer.cpp for an example.
-   */
-  void Transform(const cv::Mat& cv_img, Blob<Dtype>* transformed_blob);
-
-  /**
-   * @brief Applies the same transformation defined in the data layer's
-   * transform_param block to all the num images in a input_blob.
-   *
-   * @param input_blob
-   *    A Blob containing the data to be transformed. It applies the same
-   *    transformation to all the num images in the blob.
-   * @param transformed_blob
-   *    This is destination blob, it will contain as many images as the
-   *    input blob. It can be part of top blob's data.
-   */
-  void Transform(Blob<Dtype>* input_blob, Blob<Dtype>* transformed_blob);
-
-  /**
-   * @brief Infers the shape of transformed_blob will have when
-   *    the transformation is applied to the data.
-   *
-   * @param datum
-   *    Datum containing the data to be transformed.
-   */
-  vector<int> InferBlobShape(const Datum& datum);
-  /**
-   * @brief Infers the shape of transformed_blob will have when
-   *    the transformation is applied to the data.
-   *    It uses the first element to infer the shape of the blob.
-   *
-   * @param datum_vector
-   *    A vector of Datum containing the data to be transformed.
-   */
-  vector<int> InferBlobShape(const vector<Datum> & datum_vector);
-  /**
-   * @brief Infers the shape of transformed_blob will have when
-   *    the transformation is applied to the data.
-   *    It uses the first element to infer the shape of the blob.
-   *
-   * @param mat_vector
-   *    A vector of Mat containing the data to be transformed.
-   */
-  vector<int> InferBlobShape(const vector<cv::Mat> & mat_vector);
-  /**
-   * @brief Infers the shape of transformed_blob will have when
-   *    the transformation is applied to the data.
-   *
-   * @param cv_img
-   *    cv::Mat containing the data to be transformed.
-   */
-  vector<int> InferBlobShape(const cv::Mat& cv_img);
-
- protected:
-   /**
-   * @brief Generates a random integer from Uniform({0, 1, ..., n-1}).
-   *
-   * @param n
-   *    The upperbound (exclusive) value of the random number.
-   * @return
-   *    A uniformly random integer value from ({0, 1, ..., n-1}).
-   */
-  virtual int Rand(int n);
-
-  void Transform(const Datum& datum, Dtype* transformed_data);
-  // Tranformation parameters
-  TransformationParameter param_;
-
-
-  shared_ptr<Caffe::RNG> rng_;
-  Phase phase_;
-  Blob<Dtype> data_mean_;
-  vector<Dtype> mean_values_;
+	public:
+		explicit DataTransformer(const TransformationParameter& param, Phase phase);
+		virtual ~DataTransformer() {
+		}
+
+		/**
+		 * @brief Initialize the Random number generations if needed by the
+		 *    transformation.
+		 */
+		void InitRand();
+
+		/**
+		 * @brief Applies the transformation defined in the data layer's
+		 * transform_param block to the data.
+		 *
+		 * @param datum
+		 *    Datum containing the data to be transformed.
+		 * @param transformed_blob
+		 *    This is destination blob. It can be part of top blob's data if
+		 *    set_cpu_data() is used. See data_layer.cpp for an example.
+		 */
+		void Transform(const Datum& datum, Blob<Dtype>* transformed_blob);
+
+		/**
+		 * @brief Applies the transformation defined in the data layer's
+		 * transform_param block to a vector of Datum.
+		 *
+		 * @param datum_vector
+		 *    A vector of Datum containing the data to be transformed.
+		 * @param transformed_blob
+		 *    This is destination blob. It can be part of top blob's data if
+		 *    set_cpu_data() is used. See memory_layer.cpp for an example.
+		 */
+		void Transform(const vector<Datum> & datum_vector,
+			Blob<Dtype>* transformed_blob);
+
+		/**
+		 * @brief Applies the transformation defined in the data layer's
+		 * transform_param block to a vector of Mat.
+		 *
+		 * @param mat_vector
+		 *    A vector of Mat containing the data to be transformed.
+		 * @param transformed_blob
+		 *    This is destination blob. It can be part of top blob's data if
+		 *    set_cpu_data() is used. See memory_layer.cpp for an example.
+		 */
+		void Transform(const vector<cv::Mat> & mat_vector,
+			Blob<Dtype>* transformed_blob);
+
+		/**
+		 * @brief Applies the transformation defined in the data layer's
+		 * transform_param block to a cv::Mat
+		 *
+		 * @param cv_img
+		 *    cv::Mat containing the data to be transformed.
+		 * @param transformed_blob
+		 *    This is destination blob. It can be part of top blob's data if
+		 *    set_cpu_data() is used. See image_data_layer.cpp for an example.
+		 */
+		void Transform(const cv::Mat& cv_img, Blob<Dtype>* transformed_blob);
+
+		/**
+		 * @brief Applies the same transformation defined in the data layer's
+		 * transform_param block to all the num images in a input_blob.
+		 *
+		 * @param input_blob
+		 *    A Blob containing the data to be transformed. It applies the same
+		 *    transformation to all the num images in the blob.
+		 * @param transformed_blob
+		 *    This is destination blob, it will contain as many images as the
+		 *    input blob. It can be part of top blob's data.
+		 */
+		void Transform(Blob<Dtype>* input_blob, Blob<Dtype>* transformed_blob);
+
+		/**
+		 * @brief Infers the shape of transformed_blob will have when
+		 *    the transformation is applied to the data.
+		 *
+		 * @param datum
+		 *    Datum containing the data to be transformed.
+		 */
+		vector<int> InferBlobShape(const Datum& datum);
+		/**
+		 * @brief Infers the shape of transformed_blob will have when
+		 *    the transformation is applied to the data.
+		 *    It uses the first element to infer the shape of the blob.
+		 *
+		 * @param datum_vector
+		 *    A vector of Datum containing the data to be transformed.
+		 */
+		vector<int> InferBlobShape(const vector<Datum> & datum_vector);
+		/**
+		 * @brief Infers the shape of transformed_blob will have when
+		 *    the transformation is applied to the data.
+		 *    It uses the first element to infer the shape of the blob.
+		 *
+		 * @param mat_vector
+		 *    A vector of Mat containing the data to be transformed.
+		 */
+		vector<int> InferBlobShape(const vector<cv::Mat> & mat_vector);
+		/**
+		 * @brief Infers the shape of transformed_blob will have when
+		 *    the transformation is applied to the data.
+		 *
+		 * @param cv_img
+		 *    cv::Mat containing the data to be transformed.
+		 */
+		vector<int> InferBlobShape(const cv::Mat& cv_img);
+
+	protected:
+		/**
+		 * @brief Generates a random integer from Uniform({0, 1, ..., n-1}).
+		 *
+		 * @param n
+		 *    The upperbound (exclusive) value of the random number.
+		 * @return
+		 *    A uniformly random integer value from ({0, 1, ..., n-1}).
+		 */
+		virtual int Rand(int n);
+
+		void Transform(const Datum& datum, Dtype* transformed_data);
+		// Tranformation parameters
+		TransformationParameter param_;
+
+		shared_ptr<Caffe::RNG> rng_;
+		Phase phase_;
+		Blob<Dtype> data_mean_;
+		vector<Dtype> mean_values_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 3806eeb6..c6cefedc 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -32,45 +32,51 @@
 #include "caffe/common.hpp"
 namespace caffe {
 
-class Device{
-public:
-    Device():numPlatforms(0),numDevices(0),device_id(INT_MIN){}
-    ~Device();
-    cl_uint numPlatforms;
-    cl_platform_id * platformIDs;
-    char platformName[64];
-    char openclVersion[64];
-    cl_uint numDevices;
-    cl_device_id * DeviceIDs;
-   
-    cl_context Context;
-    cl_command_queue CommandQueue;
-    cl_command_queue CommandQueue_helper;
-    cl_program Program; 
-    cl_device_id * pDevices;
-    int device_id;
+class Device {
+	public:
+		Device()
+			: numPlatforms(0), numDevices(0), device_id(INT_MIN) {
+		}
+		~Device();
+		cl_uint numPlatforms;
+		cl_platform_id * platformIDs;
+		char platformName[64];
+		char openclVersion[64];
+		cl_uint numDevices;
+		cl_device_id * DeviceIDs;
 
-    clblasOrder col;
-    clblasOrder row;
-    std::map<std::string, cl_kernel> Kernels;    
-         
-    cl_int Init(int device_id = -1); 
-    cl_int ConvertToString(std::string pFileName,std::string &Str);
-    void DisplayPlatformInfo();
-    void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
+		cl_context Context;
+		cl_command_queue CommandQueue;
+		cl_command_queue CommandQueue_helper;
+		cl_program Program;
+		cl_device_id * pDevices;
+		int device_id;
 
-    void GetDeviceInfo();
-    void DeviceQuery();    
-    int GetDevice(){return device_id;};
-    void BuildProgram(std::string kernel_dir);    
+		clblasOrder col;
+		clblasOrder row;
+		std::map<std::string, cl_kernel> Kernels;
 
-    template <typename T>
-    void DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str);
-    template <typename T>
-    void appendBitfield(T info, T value, std::string name, std::string &str);
-   
-    cl_kernel GetKernel(std::string kernel_name);    
-    void ReleaseKernels();
+		cl_int Init(int device_id = -1);
+		cl_int ConvertToString(std::string pFileName, std::string &Str);
+		void DisplayPlatformInfo();
+		void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
+
+		void GetDeviceInfo();
+		void DeviceQuery();
+		int GetDevice() {
+			return device_id;
+		}
+		;
+		void BuildProgram(std::string kernel_dir);
+
+		template<typename T>
+		void DisplayDeviceInfo(cl_device_id id, cl_device_info name,
+			std::string str);
+		template<typename T>
+		void appendBitfield(T info, T value, std::string name, std::string &str);
+
+		cl_kernel GetKernel(std::string kernel_name);
+		void ReleaseKernels();
 };
 extern std::string buildOption;
 extern Device amdDevice;
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 888f4a4b..6c47d7aa 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -16,113 +16,121 @@
 namespace caffe {
 
 /// @brief Fills a Blob with constant or randomly-generated data.
-template <typename Dtype>
+template<typename Dtype>
 class Filler {
- public:
-  explicit Filler(const FillerParameter& param) : filler_param_(param) {}
-  virtual ~Filler() {}
-  virtual void Fill(Blob<Dtype>* blob) = 0;
- protected:
-  FillerParameter filler_param_;
-};  // class Filler
-
+	public:
+		explicit Filler(const FillerParameter& param)
+			: filler_param_(param) {
+		}
+		virtual ~Filler() {
+		}
+		virtual void Fill(Blob<Dtype>* blob) = 0;
+		protected:
+		FillerParameter filler_param_;
+};
+// class Filler
 
 /// @brief Fills a Blob with constant values @f$ x = 0 @f$.
-template <typename Dtype>
-class ConstantFiller : public Filler<Dtype> {
- public:
-  explicit ConstantFiller(const FillerParameter& param)
-      : Filler<Dtype>(param) {}
-  virtual void Fill(Blob<Dtype>* blob) {
-    Dtype* data = blob->mutable_cpu_data();
-    const int count = blob->count();
-    const Dtype value = this->filler_param_.value();
-    CHECK(count);
-    for (int i = 0; i < count; ++i) {
-      data[i] = value;
-    }
-    CHECK_EQ(this->filler_param_.sparse(), -1)
-         << "Sparsity not supported by this Filler.";
-  }
+template<typename Dtype>
+class ConstantFiller: public Filler<Dtype> {
+	public:
+		explicit ConstantFiller(const FillerParameter& param)
+			: Filler<Dtype>(param) {
+		}
+		virtual void Fill(Blob<Dtype>* blob) {
+			Dtype* data = blob->mutable_cpu_data();
+			const int count = blob->count();
+			const Dtype value = this->filler_param_.value();
+			CHECK(count);
+			for (int i = 0; i < count; ++i) {
+				data[i] = value;
+			}
+			CHECK_EQ(this->filler_param_.sparse(), -1)
+				<< "Sparsity not supported by this Filler.";
+		}
 };
 
 /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$.
-template <typename Dtype>
-class UniformFiller : public Filler<Dtype> {
- public:
-  explicit UniformFiller(const FillerParameter& param)
-      : Filler<Dtype>(param) {}
-  virtual void Fill(Blob<Dtype>* blob) {
-    CHECK(blob->count());
-    caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
-        Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
-    CHECK_EQ(this->filler_param_.sparse(), -1)
-         << "Sparsity not supported by this Filler.";
-  }
+template<typename Dtype>
+class UniformFiller: public Filler<Dtype> {
+	public:
+		explicit UniformFiller(const FillerParameter& param)
+			: Filler<Dtype>(param) {
+		}
+		virtual void Fill(Blob<Dtype>* blob) {
+			CHECK(blob->count());
+			caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
+				Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
+			CHECK_EQ(this->filler_param_.sparse(), -1)
+				<< "Sparsity not supported by this Filler.";
+		}
 };
 
 /// @brief Fills a Blob with Gaussian-distributed values @f$ x = a @f$.
-template <typename Dtype>
-class GaussianFiller : public Filler<Dtype> {
- public:
-  explicit GaussianFiller(const FillerParameter& param)
-      : Filler<Dtype>(param) {}
-  virtual void Fill(Blob<Dtype>* blob) {
-    Dtype* data = blob->mutable_cpu_data();
-    CHECK(blob->count());
-    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(this->filler_param_.mean()),
-        Dtype(this->filler_param_.std()), blob->mutable_cpu_data());
-    int sparse = this->filler_param_.sparse();
-    CHECK_GE(sparse, -1);
-    if (sparse >= 0) {
-      // Sparse initialization is implemented for "weight" blobs; i.e. matrices.
-      // These have num == channels == 1; width is number of inputs; height is
-      // number of outputs.  The 'sparse' variable specifies the mean number
-      // of non-zero input weights for a given output.
-      CHECK_GE(blob->num_axes(), 1);
-      const int num_outputs = blob->shape(0);
-      Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs);
-      rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int)));
-      int* mask = reinterpret_cast<int*>(rand_vec_->mutable_cpu_data());
-      caffe_rng_bernoulli(blob->count(), non_zero_probability, mask);
-      for (int i = 0; i < blob->count(); ++i) {
-        data[i] *= mask[i];
-      }
-    }
-  }
+template<typename Dtype>
+class GaussianFiller: public Filler<Dtype> {
+	public:
+		explicit GaussianFiller(const FillerParameter& param)
+			: Filler<Dtype>(param) {
+		}
+		virtual void Fill(Blob<Dtype>* blob) {
+			Dtype* data = blob->mutable_cpu_data();
+			CHECK(blob->count());
+			caffe_rng_gaussian<Dtype>(blob->count(),
+				Dtype(this->filler_param_.mean()),
+				Dtype(this->filler_param_.std()), blob->mutable_cpu_data());
+			int sparse = this->filler_param_.sparse();
+			CHECK_GE(sparse, -1);
+			if (sparse >= 0) {
+				// Sparse initialization is implemented for "weight" blobs; i.e. matrices.
+				// These have num == channels == 1; width is number of inputs; height is
+				// number of outputs.  The 'sparse' variable specifies the mean number
+				// of non-zero input weights for a given output.
+				CHECK_GE(blob->num_axes(), 1);
+				const int num_outputs = blob->shape(0);
+				Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs);
+				rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int)));
+				int* mask = reinterpret_cast<int*>(rand_vec_->mutable_cpu_data());
+				caffe_rng_bernoulli(blob->count(), non_zero_probability, mask);
+				for (int i = 0; i < blob->count(); ++i) {
+					data[i] *= mask[i];
+				}
+			}
+		}
 
- protected:
-  shared_ptr<SyncedMemory> rand_vec_;
+	protected:
+		shared_ptr<SyncedMemory> rand_vec_;
 };
 
 /** @brief Fills a Blob with values @f$ x \in [0, 1] @f$
  *         such that @f$ \forall i \sum_j x_{ij} = 1 @f$.
  */
-template <typename Dtype>
-class PositiveUnitballFiller : public Filler<Dtype> {
- public:
-  explicit PositiveUnitballFiller(const FillerParameter& param)
-      : Filler<Dtype>(param) {}
-  virtual void Fill(Blob<Dtype>* blob) {
-    Dtype* data = blob->mutable_cpu_data();
-    DCHECK(blob->count());
-    caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
-    // We expect the filler to not be called very frequently, so we will
-    // just use a simple implementation
-    int dim = blob->count() / blob->num();
-    CHECK(dim);
-    for (int i = 0; i < blob->num(); ++i) {
-      Dtype sum = 0;
-      for (int j = 0; j < dim; ++j) {
-        sum += data[i * dim + j];
-      }
-      for (int j = 0; j < dim; ++j) {
-        data[i * dim + j] /= sum;
-      }
-    }
-    CHECK_EQ(this->filler_param_.sparse(), -1)
-         << "Sparsity not supported by this Filler.";
-  }
+template<typename Dtype>
+class PositiveUnitballFiller: public Filler<Dtype> {
+	public:
+		explicit PositiveUnitballFiller(const FillerParameter& param)
+			: Filler<Dtype>(param) {
+		}
+		virtual void Fill(Blob<Dtype>* blob) {
+			Dtype* data = blob->mutable_cpu_data();
+			DCHECK(blob->count());
+			caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
+			// We expect the filler to not be called very frequently, so we will
+			// just use a simple implementation
+			int dim = blob->count() / blob->num();
+			CHECK(dim);
+			for (int i = 0; i < blob->num(); ++i) {
+				Dtype sum = 0;
+				for (int j = 0; j < dim; ++j) {
+					sum += data[i * dim + j];
+				}
+				for (int j = 0; j < dim; ++j) {
+					data[i * dim + j] /= sum;
+				}
+			}
+			CHECK_EQ(this->filler_param_.sparse(), -1)
+				<< "Sparsity not supported by this Filler.";
+		}
 };
 
 /**
@@ -141,29 +149,30 @@ class PositiveUnitballFiller : public Filler<Dtype> {
  *
  * TODO(dox): make notation in above comment consistent with rest & use LaTeX.
  */
-template <typename Dtype>
-class XavierFiller : public Filler<Dtype> {
- public:
-  explicit XavierFiller(const FillerParameter& param)
-      : Filler<Dtype>(param) {}
-  virtual void Fill(Blob<Dtype>* blob) {
-    CHECK(blob->count());
-    int fan_in = blob->count() / blob->num();
-    int fan_out = blob->count() / blob->channels();
-    Dtype n = fan_in;  // default to fan_in
-    if (this->filler_param_.variance_norm() ==
-        FillerParameter_VarianceNorm_AVERAGE) {
-      n = (fan_in + fan_out) / Dtype(2);
-    } else if (this->filler_param_.variance_norm() ==
-        FillerParameter_VarianceNorm_FAN_OUT) {
-      n = fan_out;
-    }
-    Dtype scale = sqrt(Dtype(3) / n);
-    caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
-        blob->mutable_cpu_data());
-    CHECK_EQ(this->filler_param_.sparse(), -1)
-         << "Sparsity not supported by this Filler.";
-  }
+template<typename Dtype>
+class XavierFiller: public Filler<Dtype> {
+	public:
+		explicit XavierFiller(const FillerParameter& param)
+			: Filler<Dtype>(param) {
+		}
+		virtual void Fill(Blob<Dtype>* blob) {
+			CHECK(blob->count());
+			int fan_in = blob->count() / blob->num();
+			int fan_out = blob->count() / blob->channels();
+			Dtype n = fan_in;  // default to fan_in
+			if (this->filler_param_.variance_norm() ==
+				FillerParameter_VarianceNorm_AVERAGE) {
+				n = (fan_in + fan_out) / Dtype(2);
+			} else if (this->filler_param_.variance_norm() ==
+				FillerParameter_VarianceNorm_FAN_OUT) {
+				n = fan_out;
+			}
+			Dtype scale = sqrt(Dtype(3) / n);
+			caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
+				blob->mutable_cpu_data());
+			CHECK_EQ(this->filler_param_.sparse(), -1)
+				<< "Sparsity not supported by this Filler.";
+		}
 };
 
 /**
@@ -183,83 +192,85 @@ class XavierFiller : public Filler<Dtype> {
  * a, b, c) where a * b * c = fan_in and num * b * c = fan_out. Note that this
  * is currently not the case for inner product layers.
  */
-template <typename Dtype>
-class MSRAFiller : public Filler<Dtype> {
- public:
-  explicit MSRAFiller(const FillerParameter& param)
-      : Filler<Dtype>(param) {}
-  virtual void Fill(Blob<Dtype>* blob) {
-    CHECK(blob->count());
-    int fan_in = blob->count() / blob->num();
-    int fan_out = blob->count() / blob->channels();
-    Dtype n = fan_in;  // default to fan_in
-    if (this->filler_param_.variance_norm() ==
-        FillerParameter_VarianceNorm_AVERAGE) {
-      n = (fan_in + fan_out) / Dtype(2);
-    } else if (this->filler_param_.variance_norm() ==
-        FillerParameter_VarianceNorm_FAN_OUT) {
-      n = fan_out;
-    }
-    Dtype std = sqrt(Dtype(2) / n);
-    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
-        blob->mutable_cpu_data());
-    CHECK_EQ(this->filler_param_.sparse(), -1)
-         << "Sparsity not supported by this Filler.";
-  }
+template<typename Dtype>
+class MSRAFiller: public Filler<Dtype> {
+	public:
+		explicit MSRAFiller(const FillerParameter& param)
+			: Filler<Dtype>(param) {
+		}
+		virtual void Fill(Blob<Dtype>* blob) {
+			CHECK(blob->count());
+			int fan_in = blob->count() / blob->num();
+			int fan_out = blob->count() / blob->channels();
+			Dtype n = fan_in;  // default to fan_in
+			if (this->filler_param_.variance_norm() ==
+				FillerParameter_VarianceNorm_AVERAGE) {
+				n = (fan_in + fan_out) / Dtype(2);
+			} else if (this->filler_param_.variance_norm() ==
+				FillerParameter_VarianceNorm_FAN_OUT) {
+				n = fan_out;
+			}
+			Dtype std = sqrt(Dtype(2) / n);
+			caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
+				blob->mutable_cpu_data());
+			CHECK_EQ(this->filler_param_.sparse(), -1)
+				<< "Sparsity not supported by this Filler.";
+		}
 };
 
 /*!
-@brief Fills a Blob with coefficients for bilinear interpolation.
+ @brief Fills a Blob with coefficients for bilinear interpolation.
 
-A common use case is with the DeconvolutionLayer acting as upsampling.
-You can upsample a feature map with shape of (B, C, H, W) by any integer factor
-using the following proto.
-\code
-layer {
-  name: "upsample", type: "Deconvolution"
-  bottom: "{{bottom_name}}" top: "{{top_name}}"
-  convolution_param {
-    kernel_size: {{2 * factor - factor % 2}} stride: {{factor}}
-    num_output: {{C}} group: {{C}}
-    pad: {{ceil((factor - 1) / 2.)}}
-    weight_filler: { type: "bilinear" } bias_term: false
-  }
-  param { lr_mult: 0 decay_mult: 0 }
-}
-\endcode
-Please use this by replacing `{{}}` with your values. By specifying
-`num_output: {{C}} group: {{C}}`, it behaves as
-channel-wise convolution. The filter shape of this deconvolution layer will be
-(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K)
-interpolation kernel for every channel of the filter identically. The resulting
-shape of the top feature map will be (B, C, factor * H, factor * W).
-Note that the learning rate and the
-weight decay are set to 0 in order to keep coefficient values of bilinear
-interpolation unchanged during training. If you apply this to an image, this
-operation is equivalent to the following call in Python with Scikit.Image.
-\code{.py}
-out = skimage.transform.rescale(img, factor, mode='constant', cval=0)
-\endcode
+ A common use case is with the DeconvolutionLayer acting as upsampling.
+ You can upsample a feature map with shape of (B, C, H, W) by any integer factor
+ using the following proto.
+ \code
+ layer {
+ name: "upsample", type: "Deconvolution"
+ bottom: "{{bottom_name}}" top: "{{top_name}}"
+ convolution_param {
+ kernel_size: {{2 * factor - factor % 2}} stride: {{factor}}
+ num_output: {{C}} group: {{C}}
+ pad: {{ceil((factor - 1) / 2.)}}
+ weight_filler: { type: "bilinear" } bias_term: false
+ }
+ param { lr_mult: 0 decay_mult: 0 }
+ }
+ \endcode
+ Please use this by replacing `{{}}` with your values. By specifying
+ `num_output: {{C}} group: {{C}}`, it behaves as
+ channel-wise convolution. The filter shape of this deconvolution layer will be
+ (C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K)
+ interpolation kernel for every channel of the filter identically. The resulting
+ shape of the top feature map will be (B, C, factor * H, factor * W).
+ Note that the learning rate and the
+ weight decay are set to 0 in order to keep coefficient values of bilinear
+ interpolation unchanged during training. If you apply this to an image, this
+ operation is equivalent to the following call in Python with Scikit.Image.
+ \code{.py}
+ out = skimage.transform.rescale(img, factor, mode='constant', cval=0)
+ \endcode
  */
-template <typename Dtype>
-class BilinearFiller : public Filler<Dtype> {
- public:
-  explicit BilinearFiller(const FillerParameter& param)
-      : Filler<Dtype>(param) {}
-  virtual void Fill(Blob<Dtype>* blob) {
-    CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
-    CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
-    Dtype* data = blob->mutable_cpu_data();
-    int f = ceil(blob->width() / 2.);
-    float c = (2 * f - 1 - f % 2) / (2. * f);
-    for (int i = 0; i < blob->count(); ++i) {
-      float x = i % blob->width();
-      float y = (i / blob->width()) % blob->height();
-      data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
-    }
-    CHECK_EQ(this->filler_param_.sparse(), -1)
-         << "Sparsity not supported by this Filler.";
-  }
+template<typename Dtype>
+class BilinearFiller: public Filler<Dtype> {
+	public:
+		explicit BilinearFiller(const FillerParameter& param)
+			: Filler<Dtype>(param) {
+		}
+		virtual void Fill(Blob<Dtype>* blob) {
+			CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
+			CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
+			Dtype* data = blob->mutable_cpu_data();
+			int f = ceil(blob->width() / 2.);
+			float c = (2 * f - 1 - f % 2) / (2. * f);
+			for (int i = 0; i < blob->count(); ++i) {
+				float x = i % blob->width();
+				float y = (i / blob->width()) % blob->height();
+				data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
+			}
+			CHECK_EQ(this->filler_param_.sparse(), -1)
+				<< "Sparsity not supported by this Filler.";
+		}
 };
 
 /**
@@ -268,27 +279,27 @@ class BilinearFiller : public Filler<Dtype> {
  * Ideally this would be replaced by a factory pattern, but we will leave it
  * this way for now.
  */
-template <typename Dtype>
+template<typename Dtype>
 Filler<Dtype>* GetFiller(const FillerParameter& param) {
-  const std::string& type = param.type();
-  if (type == "constant") {
-    return new ConstantFiller<Dtype>(param);
-  } else if (type == "gaussian") {
-    return new GaussianFiller<Dtype>(param);
-  } else if (type == "positive_unitball") {
-    return new PositiveUnitballFiller<Dtype>(param);
-  } else if (type == "uniform") {
-    return new UniformFiller<Dtype>(param);
-  } else if (type == "xavier") {
-    return new XavierFiller<Dtype>(param);
-  } else if (type == "msra") {
-    return new MSRAFiller<Dtype>(param);
-  } else if (type == "bilinear") {
-    return new BilinearFiller<Dtype>(param);
-  } else {
-    CHECK(false) << "Unknown filler name: " << param.type();
-  }
-  return (Filler<Dtype>*)(NULL);
+	const std::string& type = param.type();
+	if (type == "constant") {
+		return new ConstantFiller<Dtype>(param);
+	} else if (type == "gaussian") {
+		return new GaussianFiller<Dtype>(param);
+	} else if (type == "positive_unitball") {
+		return new PositiveUnitballFiller<Dtype>(param);
+	} else if (type == "uniform") {
+		return new UniformFiller<Dtype>(param);
+	} else if (type == "xavier") {
+		return new XavierFiller<Dtype>(param);
+	} else if (type == "msra") {
+		return new MSRAFiller<Dtype>(param);
+	} else if (type == "bilinear") {
+		return new BilinearFiller<Dtype>(param);
+	} else {
+		CHECK(false) << "Unknown filler name: " << param.type();
+	}
+	return (Filler<Dtype>*) (NULL);
 }
 
 }  // namespace caffe
diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp
index 815ca546..2df1806e 100644
--- a/include/caffe/internal_thread.hpp
+++ b/include/caffe/internal_thread.hpp
@@ -7,7 +7,9 @@
  Forward declare boost::thread instead of including boost/thread.hpp
  to avoid a boost/NVCC issues (#1009, #1010) on OSX.
  */
-namespace boost { class thread; }
+namespace boost {
+class thread;
+}
 
 namespace caffe {
 
@@ -17,24 +19,27 @@ namespace caffe {
  * by reimplementing the virutal function InternalThreadEntry.
  */
 class InternalThread {
- public:
-  InternalThread() : thread_() {}
-  virtual ~InternalThread();
+	public:
+		InternalThread()
+			: thread_() {
+		}
+		virtual ~InternalThread();
 
-  /** Returns true if the thread was successfully started. **/
-  bool StartInternalThread();
+		/** Returns true if the thread was successfully started. **/
+		bool StartInternalThread();
 
-  /** Will not return until the internal thread has exited. */
-  bool WaitForInternalThreadToExit();
+		/** Will not return until the internal thread has exited. */
+		bool WaitForInternalThreadToExit();
 
-  bool is_started() const;
+		bool is_started() const;
 
- protected:
-  /* Implement this method in your subclass
-      with the code you want your thread to run. */
-  virtual void InternalThreadEntry() {}
+	protected:
+		/* Implement this method in your subclass
+		 with the code you want your thread to run. */
+		virtual void InternalThreadEntry() {
+		}
 
-  shared_ptr<boost::thread> thread_;
+		shared_ptr<boost::thread> thread_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index e2eba196..b01ea959 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -23,446 +23,475 @@ namespace caffe {
  * gradients with respect to their input Blob%s, given the error gradients with
  * their output Blob%s.
  */
-template <typename Dtype>
+template<typename Dtype>
 class Layer {
- public:
-  /**
-   * You should not implement your own constructor. Any set up code should go
-   * to SetUp(), where the dimensions of the bottom blobs are provided to the
-   * layer.
-   */
-  explicit Layer(const LayerParameter& param)
-    : layer_param_(param) {
-      // Set phase and copy blobs (if there are any).
-      phase_ = param.phase();
-      if (layer_param_.blobs_size() > 0) {
-        blobs_.resize(layer_param_.blobs_size());
-        for (int i = 0; i < layer_param_.blobs_size(); ++i) {
-          blobs_[i].reset(new Blob<Dtype>());
-          blobs_[i]->FromProto(layer_param_.blobs(i));
-        }
-      }
-    }
-  virtual ~Layer() {}
-
-  /**
-   * @brief Implements common layer setup functionality.
-   *
-   * @param bottom the preshaped input blobs
-   * @param top
-   *     the allocated but unshaped output blobs, to be shaped by Reshape
-   *
-   * Checks that the number of bottom and top blobs is correct.
-   * Calls LayerSetUp to do special layer setup for individual layer types,
-   * followed by Reshape to set up sizes of top blobs and internal buffers.
-   * Sets up the loss weight multiplier blobs for any non-zero loss weights.
-   * This method may not be overridden.
-   */
-  void SetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-    CheckBlobCounts(bottom, top);
-    LayerSetUp(bottom, top);
-    Reshape(bottom, top);
-    SetLossWeights(top);
-  }
-
-  /**
-   * @brief Does layer-specific setup: your layer should implement this function
-   *        as well as Reshape.
-   *
-   * @param bottom
-   *     the preshaped input blobs, whose data fields store the input data for
-   *     this layer
-   * @param top
-   *     the allocated but unshaped output blobs
-   *
-   * This method should do one-time layer specific setup. This includes reading
-   * and processing relevent parameters from the <code>layer_param_</code>.
-   * Setting up the shapes of top blobs and internal buffers should be done in
-   * <code>Reshape</code>, which will be called before the forward pass to
-   * adjust the top blob sizes.
-   */
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {}
-
-  /**
-   * @brief Adjust the shapes of top blobs and internal buffers to accomodate
-   *        the shapes of the bottom blobs.
-   *
-   * @param bottom the input blobs, with the requested input shapes
-   * @param top the top blobs, which should be reshaped as needed
-   *
-   * This method should reshape top blobs as needed according to the shapes
-   * of the bottom (input) blobs, as well as reshaping any internal buffers
-   * and making any other necessary adjustments so that the layer can
-   * accomodate the bottom blobs.
-   */
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) = 0;
-
-  /**
-   * @brief Given the bottom blobs, compute the top blobs and the loss.
-   *
-   * @param bottom
-   *     the input blobs, whose data fields store the input data for this layer
-   * @param top
-   *     the preshaped output blobs, whose data fields will store this layers'
-   *     outputs
-   * \return The total loss from the layer.
-   *
-   * The Forward wrapper calls the relevant device wrapper function
-   * (Forward_cpu or Forward_gpu) to compute the top blob values given the
-   * bottom blobs.  If the layer has any non-zero loss_weights, the wrapper
-   * then computes and returns the loss.
-   *
-   * Your layer should implement Forward_cpu and (optionally) Forward_gpu.
-   */
-  inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Given the top blob error gradients, compute the bottom blob error
-   *        gradients.
-   *
-   * @param top
-   *     the output blobs, whose diff fields store the gradient of the error
-   *     with respect to themselves
-   * @param propagate_down
-   *     a vector with equal length to bottom, with each index indicating
-   *     whether to propagate the error gradients down to the bottom blob at
-   *     the corresponding index
-   * @param bottom
-   *     the input blobs, whose diff fields will store the gradient of the error
-   *     with respect to themselves after Backward is run
-   *
-   * The Backward wrapper calls the relevant device wrapper function
-   * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the
-   * top blob diffs.
-   *
-   * Your layer should implement Backward_cpu and (optionally) Backward_gpu.
-   */
-  inline void Backward(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down,
-      const vector<Blob<Dtype>*>& bottom);
-
-  /**
-   * @brief Returns the vector of learnable parameter blobs.
-   */
-  vector<shared_ptr<Blob<Dtype> > >& blobs() {
-    return blobs_;
-  }
-
-  /**
-   * @brief Returns the layer parameter.
-   */
-  const LayerParameter& layer_param() const { return layer_param_; }
-
-  /**
-   * @brief Writes the layer parameter to a protocol buffer
-   */
-  virtual void ToProto(LayerParameter* param, bool write_diff = false);
-
-  /**
-   * @brief Returns the scalar loss associated with a top blob at a given index.
-   */
-  inline Dtype loss(const int top_index) const {
-    return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0);
-  }
-
-  /**
-   * @brief Sets the loss associated with a top blob at a given index.
-   */
-  inline void set_loss(const int top_index, const Dtype value) {
-    if (loss_.size() <= top_index) {
-      loss_.resize(top_index + 1, Dtype(0));
-    }
-    loss_[top_index] = value;
-  }
-
-  /**
-   * @brief Returns the layer type.
-   */
-  virtual inline const char* type() const { return ""; }
-
-  /**
-   * @brief Returns the exact number of bottom blobs required by the layer,
-   *        or -1 if no exact number is required.
-   *
-   * This method should be overridden to return a non-negative value if your
-   * layer expects some exact number of bottom blobs.
-   */
-  virtual inline int ExactNumBottomBlobs() const { return -1; }
-  /**
-   * @brief Returns the minimum number of bottom blobs required by the layer,
-   *        or -1 if no minimum number is required.
-   *
-   * This method should be overridden to return a non-negative value if your
-   * layer expects some minimum number of bottom blobs.
-   */
-  virtual inline int MinBottomBlobs() const { return -1; }
-  /**
-   * @brief Returns the maximum number of bottom blobs required by the layer,
-   *        or -1 if no maximum number is required.
-   *
-   * This method should be overridden to return a non-negative value if your
-   * layer expects some maximum number of bottom blobs.
-   */
-  virtual inline int MaxBottomBlobs() const { return -1; }
-  /**
-   * @brief Returns the exact number of top blobs required by the layer,
-   *        or -1 if no exact number is required.
-   *
-   * This method should be overridden to return a non-negative value if your
-   * layer expects some exact number of top blobs.
-   */
-  virtual inline int ExactNumTopBlobs() const { return -1; }
-  /**
-   * @brief Returns the minimum number of top blobs required by the layer,
-   *        or -1 if no minimum number is required.
-   *
-   * This method should be overridden to return a non-negative value if your
-   * layer expects some minimum number of top blobs.
-   */
-  virtual inline int MinTopBlobs() const { return -1; }
-  /**
-   * @brief Returns the maximum number of top blobs required by the layer,
-   *        or -1 if no maximum number is required.
-   *
-   * This method should be overridden to return a non-negative value if your
-   * layer expects some maximum number of top blobs.
-   */
-  virtual inline int MaxTopBlobs() const { return -1; }
-  /**
-   * @brief Returns true if the layer requires an equal number of bottom and
-   *        top blobs.
-   *
-   * This method should be overridden to return true if your layer expects an
-   * equal number of bottom and top blobs.
-   */
-  virtual inline bool EqualNumBottomTopBlobs() const { return false; }
-
-  /**
-   * @brief Return whether "anonymous" top blobs are created automatically
-   *        by the layer.
-   *
-   * If this method returns true, Net::Init will create enough "anonymous" top
-   * blobs to fulfill the requirement specified by ExactNumTopBlobs() or
-   * MinTopBlobs().
-   */
-  virtual inline bool AutoTopBlobs() const { return false; }
-
-  /**
-   * @brief Return whether to allow force_backward for a given bottom blob
-   *        index.
-   *
-   * If AllowForceBackward(i) == false, we will ignore the force_backward
-   * setting and backpropagate to blob i only if it needs gradient information
-   * (as is done when force_backward == false).
-   */
-  virtual inline bool AllowForceBackward(const int bottom_index) const {
-    return true;
-  }
-
-  /**
-   * @brief Specifies whether the layer should compute gradients w.r.t. a
-   *        parameter at a particular index given by param_id.
-   *
-   * You can safely ignore false values and always compute gradients
-   * for all parameters, but possibly with wasteful computation.
-   */
-  inline bool param_propagate_down(const int param_id) {
-    return (param_propagate_down_.size() > param_id) ?
-        param_propagate_down_[param_id] : false;
-  }
-  /**
-   * @brief Sets whether the layer should compute gradients w.r.t. a
-   *        parameter at a particular index given by param_id.
-   */
-  inline void set_param_propagate_down(const int param_id, const bool value) {
-    if (param_propagate_down_.size() <= param_id) {
-      param_propagate_down_.resize(param_id + 1, true);
-    }
-    param_propagate_down_[param_id] = value;
-  }
-
-
- protected:
-  /** The protobuf that stores the layer parameters */
-  LayerParameter layer_param_;
-  /** The phase: TRAIN or TEST */
-  Phase phase_;
-  /** The vector that stores the learnable parameters as a set of blobs. */
-  vector<shared_ptr<Blob<Dtype> > > blobs_;
-  /** Vector indicating whether to compute the diff of each param blob. */
-  vector<bool> param_propagate_down_;
-
-  /** The vector that indicates whether each top blob has a non-zero weight in
-   *  the objective function. */
-  vector<Dtype> loss_;
-
-  /** @brief Using the CPU device, compute the layer output. */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) = 0;
-  /**
-   * @brief Using the GPU device, compute the layer output.
-   *        Fall back to Forward_cpu() if unavailable.
-   */
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-    // LOG(WARNING) << "Using CPU code as backup.";
-    return Forward_cpu(bottom, top);
-  }
-
-  /**
-   * @brief Using the CPU device, compute the gradients for any parameters and
-   *        for the bottom blobs if propagate_down is true.
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down,
-      const vector<Blob<Dtype>*>& bottom) = 0;
-  /**
-   * @brief Using the GPU device, compute the gradients for any parameters and
-   *        for the bottom blobs if propagate_down is true.
-   *        Fall back to Backward_cpu() if unavailable.
-   */
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down,
-      const vector<Blob<Dtype>*>& bottom) {
-    // LOG(WARNING) << "Using CPU code as backup.";
-    Backward_cpu(top, propagate_down, bottom);
-  }
-
-  /**
-   * Called by the parent Layer's SetUp to check that the number of bottom
-   * and top Blobs provided as input match the expected numbers specified by
-   * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions.
-   */
-  virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom,
-                               const vector<Blob<Dtype>*>& top) {
-    if (ExactNumBottomBlobs() >= 0) {
-      CHECK_EQ(ExactNumBottomBlobs(), bottom.size())
-          << type() << " Layer takes " << ExactNumBottomBlobs()
-          << " bottom blob(s) as input.";
-    }
-    if (MinBottomBlobs() >= 0) {
-      CHECK_LE(MinBottomBlobs(), bottom.size())
-          << type() << " Layer takes at least " << MinBottomBlobs()
-          << " bottom blob(s) as input.";
-    }
-    if (MaxBottomBlobs() >= 0) {
-      CHECK_GE(MaxBottomBlobs(), bottom.size())
-          << type() << " Layer takes at most " << MaxBottomBlobs()
-          << " bottom blob(s) as input.";
-    }
-    if (ExactNumTopBlobs() >= 0) {
-      CHECK_EQ(ExactNumTopBlobs(), top.size())
-          << type() << " Layer produces " << ExactNumTopBlobs()
-          << " top blob(s) as output.";
-    }
-    if (MinTopBlobs() >= 0) {
-      CHECK_LE(MinTopBlobs(), top.size())
-          << type() << " Layer produces at least " << MinTopBlobs()
-          << " top blob(s) as output.";
-    }
-    if (MaxTopBlobs() >= 0) {
-      CHECK_GE(MaxTopBlobs(), top.size())
-          << type() << " Layer produces at most " << MaxTopBlobs()
-          << " top blob(s) as output.";
-    }
-    if (EqualNumBottomTopBlobs()) {
-      CHECK_EQ(bottom.size(), top.size())
-          << type() << " Layer produces one top blob as output for each "
-          << "bottom blob input.";
-    }
-  }
-
-  /**
-   * Called by SetUp to initialize the weights associated with any top blobs in
-   * the loss function. Store non-zero loss weights in the diff blob.
-   */
-  inline void SetLossWeights(const vector<Blob<Dtype>*>& top) {
-    const int num_loss_weights = layer_param_.loss_weight_size();
-    if (num_loss_weights) {
-      CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
-          "unspecified or specified once per top blob.";
-      for (int top_id = 0; top_id < top.size(); ++top_id) {
-        const Dtype loss_weight = layer_param_.loss_weight(top_id);
-        if (loss_weight == Dtype(0)) { continue; }
-        this->set_loss(top_id, loss_weight);
-        const int count = top[top_id]->count();
-        Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff();
-        caffe_set(count, loss_weight, loss_multiplier);
-      }
-    }
-  }
-
-  DISABLE_COPY_AND_ASSIGN(Layer);
-};  // class Layer
+	public:
+		/**
+		 * You should not implement your own constructor. Any set up code should go
+		 * to SetUp(), where the dimensions of the bottom blobs are provided to the
+		 * layer.
+		 */
+		explicit Layer(const LayerParameter& param)
+			: layer_param_(param) {
+			// Set phase and copy blobs (if there are any).
+			phase_ = param.phase();
+			if (layer_param_.blobs_size() > 0) {
+				blobs_.resize(layer_param_.blobs_size());
+				for (int i = 0; i < layer_param_.blobs_size(); ++i) {
+					blobs_[i].reset(new Blob<Dtype>());
+					blobs_[i]->FromProto(layer_param_.blobs(i));
+				}
+			}
+		}
+		virtual ~Layer() {
+		}
+
+		/**
+		 * @brief Implements common layer setup functionality.
+		 *
+		 * @param bottom the preshaped input blobs
+		 * @param top
+		 *     the allocated but unshaped output blobs, to be shaped by Reshape
+		 *
+		 * Checks that the number of bottom and top blobs is correct.
+		 * Calls LayerSetUp to do special layer setup for individual layer types,
+		 * followed by Reshape to set up sizes of top blobs and internal buffers.
+		 * Sets up the loss weight multiplier blobs for any non-zero loss weights.
+		 * This method may not be overridden.
+		 */
+		void SetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+			CheckBlobCounts(bottom, top);
+			LayerSetUp(bottom, top);
+			Reshape(bottom, top);
+			SetLossWeights(top);
+		}
+
+		/**
+		 * @brief Does layer-specific setup: your layer should implement this function
+		 *        as well as Reshape.
+		 *
+		 * @param bottom
+		 *     the preshaped input blobs, whose data fields store the input data for
+		 *     this layer
+		 * @param top
+		 *     the allocated but unshaped output blobs
+		 *
+		 * This method should do one-time layer specific setup. This includes reading
+		 * and processing relevent parameters from the <code>layer_param_</code>.
+		 * Setting up the shapes of top blobs and internal buffers should be done in
+		 * <code>Reshape</code>, which will be called before the forward pass to
+		 * adjust the top blob sizes.
+		 */
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+		}
+
+		/**
+		 * @brief Adjust the shapes of top blobs and internal buffers to accomodate
+		 *        the shapes of the bottom blobs.
+		 *
+		 * @param bottom the input blobs, with the requested input shapes
+		 * @param top the top blobs, which should be reshaped as needed
+		 *
+		 * This method should reshape top blobs as needed according to the shapes
+		 * of the bottom (input) blobs, as well as reshaping any internal buffers
+		 * and making any other necessary adjustments so that the layer can
+		 * accomodate the bottom blobs.
+		 */
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) = 0;
+
+		/**
+		 * @brief Given the bottom blobs, compute the top blobs and the loss.
+		 *
+		 * @param bottom
+		 *     the input blobs, whose data fields store the input data for this layer
+		 * @param top
+		 *     the preshaped output blobs, whose data fields will store this layers'
+		 *     outputs
+		 * \return The total loss from the layer.
+		 *
+		 * The Forward wrapper calls the relevant device wrapper function
+		 * (Forward_cpu or Forward_gpu) to compute the top blob values given the
+		 * bottom blobs.  If the layer has any non-zero loss_weights, the wrapper
+		 * then computes and returns the loss.
+		 *
+		 * Your layer should implement Forward_cpu and (optionally) Forward_gpu.
+		 */
+		inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Given the top blob error gradients, compute the bottom blob error
+		 *        gradients.
+		 *
+		 * @param top
+		 *     the output blobs, whose diff fields store the gradient of the error
+		 *     with respect to themselves
+		 * @param propagate_down
+		 *     a vector with equal length to bottom, with each index indicating
+		 *     whether to propagate the error gradients down to the bottom blob at
+		 *     the corresponding index
+		 * @param bottom
+		 *     the input blobs, whose diff fields will store the gradient of the error
+		 *     with respect to themselves after Backward is run
+		 *
+		 * The Backward wrapper calls the relevant device wrapper function
+		 * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the
+		 * top blob diffs.
+		 *
+		 * Your layer should implement Backward_cpu and (optionally) Backward_gpu.
+		 */
+		inline void Backward(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down,
+			const vector<Blob<Dtype>*>& bottom);
+
+		/**
+		 * @brief Returns the vector of learnable parameter blobs.
+		 */
+		vector<shared_ptr<Blob<Dtype> > >& blobs() {
+			return blobs_;
+		}
+
+		/**
+		 * @brief Returns the layer parameter.
+		 */
+		const LayerParameter& layer_param() const {
+			return layer_param_;
+		}
+
+		/**
+		 * @brief Writes the layer parameter to a protocol buffer
+		 */
+		virtual void ToProto(LayerParameter* param, bool write_diff = false);
+
+		/**
+		 * @brief Returns the scalar loss associated with a top blob at a given index.
+		 */
+		inline Dtype loss(const int top_index) const {
+			return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0);
+		}
+
+		/**
+		 * @brief Sets the loss associated with a top blob at a given index.
+		 */
+		inline void set_loss(const int top_index, const Dtype value) {
+			if (loss_.size() <= top_index) {
+				loss_.resize(top_index + 1, Dtype(0));
+			}
+			loss_[top_index] = value;
+		}
+
+		/**
+		 * @brief Returns the layer type.
+		 */
+		virtual inline const char* type() const {
+			return "";
+		}
+
+		/**
+		 * @brief Returns the exact number of bottom blobs required by the layer,
+		 *        or -1 if no exact number is required.
+		 *
+		 * This method should be overridden to return a non-negative value if your
+		 * layer expects some exact number of bottom blobs.
+		 */
+		virtual inline int ExactNumBottomBlobs() const {
+			return -1;
+		}
+		/**
+		 * @brief Returns the minimum number of bottom blobs required by the layer,
+		 *        or -1 if no minimum number is required.
+		 *
+		 * This method should be overridden to return a non-negative value if your
+		 * layer expects some minimum number of bottom blobs.
+		 */
+		virtual inline int MinBottomBlobs() const {
+			return -1;
+		}
+		/**
+		 * @brief Returns the maximum number of bottom blobs required by the layer,
+		 *        or -1 if no maximum number is required.
+		 *
+		 * This method should be overridden to return a non-negative value if your
+		 * layer expects some maximum number of bottom blobs.
+		 */
+		virtual inline int MaxBottomBlobs() const {
+			return -1;
+		}
+		/**
+		 * @brief Returns the exact number of top blobs required by the layer,
+		 *        or -1 if no exact number is required.
+		 *
+		 * This method should be overridden to return a non-negative value if your
+		 * layer expects some exact number of top blobs.
+		 */
+		virtual inline int ExactNumTopBlobs() const {
+			return -1;
+		}
+		/**
+		 * @brief Returns the minimum number of top blobs required by the layer,
+		 *        or -1 if no minimum number is required.
+		 *
+		 * This method should be overridden to return a non-negative value if your
+		 * layer expects some minimum number of top blobs.
+		 */
+		virtual inline int MinTopBlobs() const {
+			return -1;
+		}
+		/**
+		 * @brief Returns the maximum number of top blobs required by the layer,
+		 *        or -1 if no maximum number is required.
+		 *
+		 * This method should be overridden to return a non-negative value if your
+		 * layer expects some maximum number of top blobs.
+		 */
+		virtual inline int MaxTopBlobs() const {
+			return -1;
+		}
+		/**
+		 * @brief Returns true if the layer requires an equal number of bottom and
+		 *        top blobs.
+		 *
+		 * This method should be overridden to return true if your layer expects an
+		 * equal number of bottom and top blobs.
+		 */
+		virtual inline bool EqualNumBottomTopBlobs() const {
+			return false;
+		}
+
+		/**
+		 * @brief Return whether "anonymous" top blobs are created automatically
+		 *        by the layer.
+		 *
+		 * If this method returns true, Net::Init will create enough "anonymous" top
+		 * blobs to fulfill the requirement specified by ExactNumTopBlobs() or
+		 * MinTopBlobs().
+		 */
+		virtual inline bool AutoTopBlobs() const {
+			return false;
+		}
+
+		/**
+		 * @brief Return whether to allow force_backward for a given bottom blob
+		 *        index.
+		 *
+		 * If AllowForceBackward(i) == false, we will ignore the force_backward
+		 * setting and backpropagate to blob i only if it needs gradient information
+		 * (as is done when force_backward == false).
+		 */
+		virtual inline bool AllowForceBackward(const int bottom_index) const {
+			return true;
+		}
+
+		/**
+		 * @brief Specifies whether the layer should compute gradients w.r.t. a
+		 *        parameter at a particular index given by param_id.
+		 *
+		 * You can safely ignore false values and always compute gradients
+		 * for all parameters, but possibly with wasteful computation.
+		 */
+		inline bool param_propagate_down(const int param_id) {
+			return
+				(param_propagate_down_.size() > param_id) ?
+					param_propagate_down_[param_id] : false;
+		}
+		/**
+		 * @brief Sets whether the layer should compute gradients w.r.t. a
+		 *        parameter at a particular index given by param_id.
+		 */
+		inline void set_param_propagate_down(const int param_id, const bool value) {
+			if (param_propagate_down_.size() <= param_id) {
+				param_propagate_down_.resize(param_id + 1, true);
+			}
+			param_propagate_down_[param_id] = value;
+		}
+
+	protected:
+		/** The protobuf that stores the layer parameters */
+		LayerParameter layer_param_;
+		/** The phase: TRAIN or TEST */
+		Phase phase_;
+		/** The vector that stores the learnable parameters as a set of blobs. */
+		vector<shared_ptr<Blob<Dtype> > > blobs_;
+		/** Vector indicating whether to compute the diff of each param blob. */
+		vector<bool> param_propagate_down_;
+
+		/** The vector that indicates whether each top blob has a non-zero weight in
+		 *  the objective function. */
+		vector<Dtype> loss_;
+
+		/** @brief Using the CPU device, compute the layer output. */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) = 0;
+		/**
+		 * @brief Using the GPU device, compute the layer output.
+		 *        Fall back to Forward_cpu() if unavailable.
+		 */
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+			// LOG(WARNING) << "Using CPU code as backup.";
+			return Forward_cpu(bottom, top);
+		}
+
+		/**
+		 * @brief Using the CPU device, compute the gradients for any parameters and
+		 *        for the bottom blobs if propagate_down is true.
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down,
+			const vector<Blob<Dtype>*>& bottom) = 0;
+		/**
+		 * @brief Using the GPU device, compute the gradients for any parameters and
+		 *        for the bottom blobs if propagate_down is true.
+		 *        Fall back to Backward_cpu() if unavailable.
+		 */
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down,
+			const vector<Blob<Dtype>*>& bottom) {
+			// LOG(WARNING) << "Using CPU code as backup.";
+			Backward_cpu(top, propagate_down, bottom);
+		}
+
+		/**
+		 * Called by the parent Layer's SetUp to check that the number of bottom
+		 * and top Blobs provided as input match the expected numbers specified by
+		 * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions.
+		 */
+		virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+			if (ExactNumBottomBlobs() >= 0) {
+				CHECK_EQ(ExactNumBottomBlobs(), bottom.size())
+					<< type() << " Layer takes " << ExactNumBottomBlobs()
+					<< " bottom blob(s) as input.";
+			}
+			if (MinBottomBlobs() >= 0) {
+				CHECK_LE(MinBottomBlobs(), bottom.size())
+					<< type() << " Layer takes at least " << MinBottomBlobs()
+					<< " bottom blob(s) as input.";
+			}
+			if (MaxBottomBlobs() >= 0) {
+				CHECK_GE(MaxBottomBlobs(), bottom.size())
+					<< type() << " Layer takes at most " << MaxBottomBlobs()
+					<< " bottom blob(s) as input.";
+			}
+			if (ExactNumTopBlobs() >= 0) {
+				CHECK_EQ(ExactNumTopBlobs(), top.size())
+					<< type() << " Layer produces " << ExactNumTopBlobs()
+					<< " top blob(s) as output.";
+			}
+			if (MinTopBlobs() >= 0) {
+				CHECK_LE(MinTopBlobs(), top.size())
+					<< type() << " Layer produces at least " << MinTopBlobs()
+					<< " top blob(s) as output.";
+			}
+			if (MaxTopBlobs() >= 0) {
+				CHECK_GE(MaxTopBlobs(), top.size())
+					<< type() << " Layer produces at most " << MaxTopBlobs()
+					<< " top blob(s) as output.";
+			}
+			if (EqualNumBottomTopBlobs()) {
+				CHECK_EQ(bottom.size(), top.size())
+					<< type() << " Layer produces one top blob as output for each "
+					<< "bottom blob input.";
+			}
+		}
+
+		/**
+		 * Called by SetUp to initialize the weights associated with any top blobs in
+		 * the loss function. Store non-zero loss weights in the diff blob.
+		 */
+		inline void SetLossWeights(const vector<Blob<Dtype>*>& top) {
+			const int num_loss_weights = layer_param_.loss_weight_size();
+			if (num_loss_weights) {
+				CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
+					"unspecified or specified once per top blob.";
+				for (int top_id = 0; top_id < top.size(); ++top_id) {
+					const Dtype loss_weight = layer_param_.loss_weight(top_id);
+					if (loss_weight == Dtype(0)) {
+						continue;
+					}
+					this->set_loss(top_id, loss_weight);
+					const int count = top[top_id]->count();
+					Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff();
+					caffe_set(count, loss_weight, loss_multiplier);
+				}
+			}
+		}
+
+		DISABLE_COPY_AND_ASSIGN (Layer);
+};
+// class Layer
 
 // Forward and backward wrappers. You should implement the cpu and
 // gpu specific implementations instead, and should not change these
 // functions.
-template <typename Dtype>
+template<typename Dtype>
 inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  Dtype loss = 0;
-  Reshape(bottom, top);
-  switch (Caffe::mode()) {
-  case Caffe::CPU:
-    Forward_cpu(bottom, top);
-    for (int top_id = 0; top_id < top.size(); ++top_id) {
-      if (!this->loss(top_id)) { continue; }
-      const int count = top[top_id]->count();
-      const Dtype* data = top[top_id]->cpu_data();
-      const Dtype* loss_weights = top[top_id]->cpu_diff();
-      loss += caffe_cpu_dot(count, data, loss_weights);
-    }
-    break;
-  case Caffe::GPU:
-    Forward_gpu(bottom, top);
+	const vector<Blob<Dtype>*>& top) {
+	Dtype loss = 0;
+	Reshape(bottom, top);
+	switch (Caffe::mode()) {
+		case Caffe::CPU:
+			Forward_cpu(bottom, top);
+			for (int top_id = 0; top_id < top.size(); ++top_id) {
+				if (!this->loss(top_id)) {
+					continue;
+				}
+				const int count = top[top_id]->count();
+				const Dtype* data = top[top_id]->cpu_data();
+				const Dtype* loss_weights = top[top_id]->cpu_diff();
+				loss += caffe_cpu_dot(count, data, loss_weights);
+			}
+			break;
+		case Caffe::GPU:
+			Forward_gpu(bottom, top);
 #ifndef CPU_ONLY
-    for (int top_id = 0; top_id < top.size(); ++top_id) {
-      if (!this->loss(top_id)) { continue; }
-      const int count = top[top_id]->count();
-      const Dtype* data = top[top_id]->gpu_data();
-      const Dtype* loss_weights = top[top_id]->gpu_diff();
-      Dtype blob_loss = 0;
-      caffe_gpu_dot(count, data, loss_weights, &blob_loss);
-      loss += blob_loss;
-    }
+			for (int top_id = 0; top_id < top.size(); ++top_id) {
+				if (!this->loss(top_id)) {
+					continue;
+				}
+				const int count = top[top_id]->count();
+				const Dtype* data = top[top_id]->gpu_data();
+				const Dtype* loss_weights = top[top_id]->gpu_diff();
+				Dtype blob_loss = 0;
+				caffe_gpu_dot(count, data, loss_weights, &blob_loss);
+				loss += blob_loss;
+			}
 #endif
-    break;
-  default:
-    LOG(FATAL) << "Unknown caffe mode.";
-  }
-  return loss;
+			break;
+		default:
+			LOG(FATAL) << "Unknown caffe mode.";
+	}
+	return loss;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  switch (Caffe::mode()) {
-  case Caffe::CPU:
-    Backward_cpu(top, propagate_down, bottom);
-    break;
-  case Caffe::GPU:
-    Backward_gpu(top, propagate_down, bottom);
-    break;
-  default:
-    LOG(FATAL) << "Unknown caffe mode.";
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	switch (Caffe::mode()) {
+		case Caffe::CPU:
+			Backward_cpu(top, propagate_down, bottom);
+			break;
+		case Caffe::GPU:
+			Backward_gpu(top, propagate_down, bottom);
+			break;
+		default:
+			LOG(FATAL) << "Unknown caffe mode.";
+	}
 }
 
 // Serialize LayerParameter to protocol buffer
-template <typename Dtype>
+template<typename Dtype>
 void Layer<Dtype>::ToProto(LayerParameter* param, bool write_diff) {
-  param->Clear();
-  param->CopyFrom(layer_param_);
-  param->clear_blobs();
-  for (int i = 0; i < blobs_.size(); ++i) {
-    blobs_[i]->ToProto(param->add_blobs(), write_diff);
-  }
+	param->Clear();
+	param->CopyFrom(layer_param_);
+	param->clear_blobs();
+	for (int i = 0; i < blobs_.size(); ++i) {
+		blobs_[i]->ToProto(param->add_blobs(), write_diff);
+	}
 }
 
 }  // namespace caffe
diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp
index 2fcd9386..e679ae6a 100644
--- a/include/caffe/layer_factory.hpp
+++ b/include/caffe/layer_factory.hpp
@@ -47,69 +47,68 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 class Layer;
 
-template <typename Dtype>
+template<typename Dtype>
 class LayerRegistry {
- public:
-  typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
-  typedef std::map<string, Creator> CreatorRegistry;
-
-  static CreatorRegistry& Registry() {
-    static CreatorRegistry* g_registry_ = new CreatorRegistry();
-    return *g_registry_;
-  }
-
-  // Adds a creator.
-  static void AddCreator(const string& type, Creator creator) {
-    CreatorRegistry& registry = Registry();
-    CHECK_EQ(registry.count(type), 0)
-        << "Layer type " << type << " already registered.";
-    registry[type] = creator;
-  }
-
-  // Get a layer using a LayerParameter.
-  static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
-    LOG(INFO) << "Creating layer " << param.name();
-    const string& type = param.type();
-    CreatorRegistry& registry = Registry();
-    CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
-        << " (known types: " << LayerTypeList() << ")";
-    return registry[type](param);
-  }
-
- private:
-  // Layer registry should never be instantiated - everything is done with its
-  // static variables.
-  LayerRegistry() {}
-
-  static string LayerTypeList() {
-    CreatorRegistry& registry = Registry();
-    string layer_types;
-    for (typename CreatorRegistry::iterator iter = registry.begin();
-         iter != registry.end(); ++iter) {
-      if (iter != registry.begin()) {
-        layer_types += ", ";
-      }
-      layer_types += iter->first;
-    }
-    return layer_types;
-  }
+	public:
+		typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
+		typedef std::map<string, Creator> CreatorRegistry;
+
+		static CreatorRegistry& Registry() {
+			static CreatorRegistry* g_registry_ = new CreatorRegistry();
+			return *g_registry_;
+		}
+
+		// Adds a creator.
+		static void AddCreator(const string& type, Creator creator) {
+			CreatorRegistry& registry = Registry();
+			CHECK_EQ(registry.count(type), 0)
+				<< "Layer type " << type << " already registered.";
+			registry[type] = creator;
+		}
+
+		// Get a layer using a LayerParameter.
+		static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
+			LOG(INFO) << "Creating layer " << param.name();
+			const string& type = param.type();
+			CreatorRegistry& registry = Registry();
+			CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
+				<< " (known types: " << LayerTypeList() << ")";
+			return registry[type](param);
+		}
+
+	private:
+		// Layer registry should never be instantiated - everything is done with its
+		// static variables.
+		LayerRegistry() {
+		}
+
+		static string LayerTypeList() {
+			CreatorRegistry& registry = Registry();
+			string layer_types;
+			for (typename CreatorRegistry::iterator iter = registry.begin();
+				iter != registry.end(); ++iter) {
+				if (iter != registry.begin()) {
+					layer_types += ", ";
+				}
+				layer_types += iter->first;
+			}
+			return layer_types;
+		}
 };
 
-
-template <typename Dtype>
+template<typename Dtype>
 class LayerRegisterer {
- public:
-  LayerRegisterer(const string& type,
-                  shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
-    // LOG(INFO) << "Registering layer type: " << type;
-    LayerRegistry<Dtype>::AddCreator(type, creator);
-  }
+	public:
+		LayerRegisterer(const string& type,
+			shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
+			// LOG(INFO) << "Registering layer type: " << type;
+			LayerRegistry<Dtype>::AddCreator(type, creator);
+		}
 };
 
-
 #define REGISTER_LAYER_CREATOR(type, creator)                                  \
   static LayerRegisterer<float> g_creator_f_##type(#type, creator<float>);     \
   static LayerRegisterer<double> g_creator_d_##type(#type, creator<double>)    \
diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index d1408fd7..9e74ca85 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -19,73 +19,81 @@ const float kLOG_THRESHOLD = 1e-20;
  * @brief Computes the classification accuracy for a one-of-many
  *        classification task.
  */
-template <typename Dtype>
-class AccuracyLayer : public Layer<Dtype> {
- public:
-  /**
-   * @param param provides AccuracyParameter accuracy_param,
-   *     with AccuracyLayer options:
-   *   - top_k (\b optional, default 1).
-   *     Sets the maximum rank @f$ k @f$ at which a prediction is considered
-   *     correct.  For example, if @f$ k = 5 @f$, a prediction is counted
-   *     correct if the correct label is among the top 5 predicted labels.
-   */
-  explicit AccuracyLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Accuracy"; }
-  virtual inline int ExactNumBottomBlobs() const { return 2; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the predictions @f$ x @f$, a Blob with values in
-   *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
-   *      the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted
-   *      label @f$ \hat{l}_n @f$ given by its maximal index:
-   *      @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$
-   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-   *      the labels @f$ l @f$, an integer-valued Blob with values
-   *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
-   *      indicating the correct class label among the @f$ K @f$ classes
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-   *      the computed accuracy: @f$
-   *        \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \}
-   *      @f$, where @f$
-   *      \delta\{\mathrm{condition}\} = \left\{
-   *         \begin{array}{lr}
-   *            1 & \mbox{if condition} \\
+template<typename Dtype>
+class AccuracyLayer: public Layer<Dtype> {
+	public:
+		/**
+		 * @param param provides AccuracyParameter accuracy_param,
+		 *     with AccuracyLayer options:
+		 *   - top_k (\b optional, default 1).
+		 *     Sets the maximum rank @f$ k @f$ at which a prediction is considered
+		 *     correct.  For example, if @f$ k = 5 @f$, a prediction is counted
+		 *     correct if the correct label is among the top 5 predicted labels.
+		 */
+		explicit AccuracyLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Accuracy";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 2;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the predictions @f$ x @f$, a Blob with values in
+		 *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+		 *      the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted
+		 *      label @f$ \hat{l}_n @f$ given by its maximal index:
+		 *      @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$
+		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+		 *      the labels @f$ l @f$, an integer-valued Blob with values
+		 *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+		 *      indicating the correct class label among the @f$ K @f$ classes
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+		 *      the computed accuracy: @f$
+		 *        \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \}
+		 *      @f$, where @f$
+		 *      \delta\{\mathrm{condition}\} = \left\{
+		 *         \begin{array}{lr}
+		 *            1 & \mbox{if condition} \\
    *            0 & \mbox{otherwise}
-   *         \end{array} \right.
-   *      @f$
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-
-  /// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-    for (int i = 0; i < propagate_down.size(); ++i) {
-      if (propagate_down[i]) { NOT_IMPLEMENTED; }
-    }
-  }
-
-  int label_axis_, outer_num_, inner_num_;
-
-  int top_k_;
-
-  /// Whether to ignore instances with a certain label.
-  bool has_ignore_label_;
-  /// The label indicating that an instance should be ignored.
-  int ignore_label_;
+		 *         \end{array} \right.
+		 *      @f$
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+			for (int i = 0; i < propagate_down.size(); ++i) {
+				if (propagate_down[i]) {
+					NOT_IMPLEMENTED;
+				}
+			}
+		}
+
+		int label_axis_, outer_num_, inner_num_;
+
+		int top_k_;
+
+		/// Whether to ignore instances with a certain label.
+		bool has_ignore_label_;
+		/// The label indicating that an instance should be ignored.
+		int ignore_label_;
 };
 
 /**
@@ -96,33 +104,40 @@ class AccuracyLayer : public Layer<Dtype> {
  * LossLayers are typically only capable of backpropagating to their first input
  * -- the predictions.
  */
-template <typename Dtype>
-class LossLayer : public Layer<Dtype> {
- public:
-  explicit LossLayer(const LayerParameter& param)
-     : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(
-      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(
-      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-
-  virtual inline int ExactNumBottomBlobs() const { return 2; }
-
-  /**
-   * @brief For convenience and backwards compatibility, instruct the Net to
-   *        automatically allocate a single top Blob for LossLayers, into which
-   *        they output their singleton loss, (even if the user didn't specify
-   *        one in the prototxt, etc.).
-   */
-  virtual inline bool AutoTopBlobs() const { return true; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-  /**
-   * We usually cannot backpropagate to the labels; ignore force_backward for
-   * these inputs.
-   */
-  virtual inline bool AllowForceBackward(const int bottom_index) const {
-    return bottom_index != 1;
-  }
+template<typename Dtype>
+class LossLayer: public Layer<Dtype> {
+	public:
+		explicit LossLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(
+			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(
+			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+
+		virtual inline int ExactNumBottomBlobs() const {
+			return 2;
+		}
+
+		/**
+		 * @brief For convenience and backwards compatibility, instruct the Net to
+		 *        automatically allocate a single top Blob for LossLayers, into which
+		 *        they output their singleton loss, (even if the user didn't specify
+		 *        one in the prototxt, etc.).
+		 */
+		virtual inline bool AutoTopBlobs() const {
+			return true;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+		/**
+		 * We usually cannot backpropagate to the labels; ignore force_backward for
+		 * these inputs.
+		 */
+		virtual inline bool AllowForceBackward(const int bottom_index) const {
+			return bottom_index != 1;
+		}
 };
 
 /**
@@ -149,65 +164,70 @@ class LossLayer : public Layer<Dtype> {
  *          d = \left| \left| a_n - b_n \right| \right|_2^2 @f$.
  * This can be used to train siamese networks.
  */
-template <typename Dtype>
-class ContrastiveLossLayer : public LossLayer<Dtype> {
- public:
-  explicit ContrastiveLossLayer(const LayerParameter& param)
-      : LossLayer<Dtype>(param), diff_() {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline int ExactNumBottomBlobs() const { return 3; }
-  virtual inline const char* type() const { return "ContrastiveLoss"; }
-  /**
-   * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate
-   * to the first two inputs.
-   */
-  virtual inline bool AllowForceBackward(const int bottom_index) const {
-    return bottom_index != 2;
-  }
-
- protected:
-  /// @copydoc ContrastiveLossLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the Contrastive error gradient w.r.t. the inputs.
-   *
-   * Computes the gradients with respect to the two input vectors (bottom[0] and
-   * bottom[1]), but not the similarity label (bottom[2]).
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-   *      as @f$ \lambda @f$ is the coefficient of this layer's output
-   *      @f$\ell_i@f$ in the overall Net loss
-   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-   *      (*Assuming that this top Blob is not used as a bottom (input) by any
-   *      other layer of the Net.)
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times 1 \times 1) @f$
-   *      the features @f$a@f$; Backward fills their diff with
-   *      gradients if propagate_down[0]
-   *   -# @f$ (N \times C \times 1 \times 1) @f$
-   *      the features @f$b@f$; Backward fills their diff with gradients if
-   *      propagate_down[1]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  Blob<Dtype> diff_;  // cached for backward pass
-  Blob<Dtype> dist_sq_;  // cached for backward pass
-  Blob<Dtype> diff_sq_;  // tmp storage for gpu forward pass
-  Blob<Dtype> summer_vec_;  // tmp storage for gpu forward pass
+template<typename Dtype>
+class ContrastiveLossLayer: public LossLayer<Dtype> {
+	public:
+		explicit ContrastiveLossLayer(const LayerParameter& param)
+			: LossLayer<Dtype>(param), diff_() {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline int ExactNumBottomBlobs() const {
+			return 3;
+		}
+		virtual inline const char* type() const {
+			return "ContrastiveLoss";
+		}
+		/**
+		 * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate
+		 * to the first two inputs.
+		 */
+		virtual inline bool AllowForceBackward(const int bottom_index) const {
+			return bottom_index != 2;
+		}
+
+	protected:
+		/// @copydoc ContrastiveLossLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the Contrastive error gradient w.r.t. the inputs.
+		 *
+		 * Computes the gradients with respect to the two input vectors (bottom[0] and
+		 * bottom[1]), but not the similarity label (bottom[2]).
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
+		 *      @f$\ell_i@f$ in the overall Net loss
+		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
+		 *      other layer of the Net.)
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times 1 \times 1) @f$
+		 *      the features @f$a@f$; Backward fills their diff with
+		 *      gradients if propagate_down[0]
+		 *   -# @f$ (N \times C \times 1 \times 1) @f$
+		 *      the features @f$b@f$; Backward fills their diff with gradients if
+		 *      propagate_down[1]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		Blob<Dtype> diff_;  // cached for backward pass
+		Blob<Dtype> dist_sq_;  // cached for backward pass
+		Blob<Dtype> diff_sq_;  // tmp storage for gpu forward pass
+		Blob<Dtype> summer_vec_;  // tmp storage for gpu forward pass
 };
 
 /**
@@ -236,69 +256,72 @@ class ContrastiveLossLayer : public LossLayer<Dtype> {
  * (Note: Caffe, and SGD in general, is certainly \b not the best way to solve
  * linear least squares problems! We use it only as an instructive example.)
  */
-template <typename Dtype>
-class EuclideanLossLayer : public LossLayer<Dtype> {
- public:
-  explicit EuclideanLossLayer(const LayerParameter& param)
-      : LossLayer<Dtype>(param), diff_() {}
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "EuclideanLoss"; }
-  /**
-   * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate
-   * to both inputs -- override to return true and always allow force_backward.
-   */
-  virtual inline bool AllowForceBackward(const int bottom_index) const {
-    return true;
-  }
-
- protected:
-  /// @copydoc EuclideanLossLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the Euclidean error gradient w.r.t. the inputs.
-   *
-   * Unlike other children of LossLayer, EuclideanLossLayer \b can compute
-   * gradients with respect to the label inputs bottom[1] (but still only will
-   * if propagate_down[1] is set, due to being produced by learnable parameters
-   * or if force_backward is set). In fact, this layer is "commutative" -- the
-   * result is the same regardless of the order of the two bottoms.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-   *      as @f$ \lambda @f$ is the coefficient of this layer's output
-   *      @f$\ell_i@f$ in the overall Net loss
-   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-   *      (*Assuming that this top Blob is not used as a bottom (input) by any
-   *      other layer of the Net.)
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the predictions @f$\hat{y}@f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial \hat{y}} =
-   *            \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n)
-   *      @f$ if propagate_down[0]
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the targets @f$y@f$; Backward fills their diff with gradients
-   *      @f$ \frac{\partial E}{\partial y} =
-   *          \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n)
-   *      @f$ if propagate_down[1]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  Blob<Dtype> diff_;
+template<typename Dtype>
+class EuclideanLossLayer: public LossLayer<Dtype> {
+	public:
+		explicit EuclideanLossLayer(const LayerParameter& param)
+			: LossLayer<Dtype>(param), diff_() {
+		}
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "EuclideanLoss";
+		}
+		/**
+		 * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate
+		 * to both inputs -- override to return true and always allow force_backward.
+		 */
+		virtual inline bool AllowForceBackward(const int bottom_index) const {
+			return true;
+		}
+
+	protected:
+		/// @copydoc EuclideanLossLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the Euclidean error gradient w.r.t. the inputs.
+		 *
+		 * Unlike other children of LossLayer, EuclideanLossLayer \b can compute
+		 * gradients with respect to the label inputs bottom[1] (but still only will
+		 * if propagate_down[1] is set, due to being produced by learnable parameters
+		 * or if force_backward is set). In fact, this layer is "commutative" -- the
+		 * result is the same regardless of the order of the two bottoms.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
+		 *      @f$\ell_i@f$ in the overall Net loss
+		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
+		 *      other layer of the Net.)
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the predictions @f$\hat{y}@f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial \hat{y}} =
+		 *            \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n)
+		 *      @f$ if propagate_down[0]
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the targets @f$y@f$; Backward fills their diff with gradients
+		 *      @f$ \frac{\partial E}{\partial y} =
+		 *          \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n)
+		 *      @f$ if propagate_down[1]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		Blob<Dtype> diff_;
 };
 
 /**
@@ -344,48 +367,51 @@ class EuclideanLossLayer : public LossLayer<Dtype> {
  * outside the InnerProductLayer and no other losses outside the
  * HingeLossLayer).
  */
-template <typename Dtype>
-class HingeLossLayer : public LossLayer<Dtype> {
- public:
-  explicit HingeLossLayer(const LayerParameter& param)
-      : LossLayer<Dtype>(param) {}
-
-  virtual inline const char* type() const { return "HingeLoss"; }
-
- protected:
-  /// @copydoc HingeLossLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the hinge loss error gradient w.r.t. the predictions.
-   *
-   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
-   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-   * if propagate_down[1] is set.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-   *      as @f$ \lambda @f$ is the coefficient of this layer's output
-   *      @f$\ell_i@f$ in the overall Net loss
-   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-   *      (*Assuming that this top Blob is not used as a bottom (input) by any
-   *      other layer of the Net.)
-   * @param propagate_down see Layer::Backward.
-   *      propagate_down[1] must be false as we can't compute gradients with
-   *      respect to the labels.
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the predictions @f$t@f$; Backward computes diff
-   *      @f$ \frac{\partial E}{\partial t} @f$
-   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-   *      the labels -- ignored as we can't compute their error gradients
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class HingeLossLayer: public LossLayer<Dtype> {
+	public:
+		explicit HingeLossLayer(const LayerParameter& param)
+			: LossLayer<Dtype>(param) {
+		}
+
+		virtual inline const char* type() const {
+			return "HingeLoss";
+		}
+
+	protected:
+		/// @copydoc HingeLossLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the hinge loss error gradient w.r.t. the predictions.
+		 *
+		 * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+		 * if propagate_down[1] is set.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
+		 *      @f$\ell_i@f$ in the overall Net loss
+		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
+		 *      other layer of the Net.)
+		 * @param propagate_down see Layer::Backward.
+		 *      propagate_down[1] must be false as we can't compute gradients with
+		 *      respect to the labels.
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the predictions @f$t@f$; Backward computes diff
+		 *      @f$ \frac{\partial E}{\partial t} @f$
+		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+		 *      the labels -- ignored as we can't compute their error gradients
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -420,66 +446,75 @@ class HingeLossLayer : public LossLayer<Dtype> {
  *        \log(\hat{p}_{n,k})
  *      @f$, where @f$ H_{l_n} @f$ denotes row @f$l_n@f$ of @f$H@f$.
  */
-template <typename Dtype>
-class InfogainLossLayer : public LossLayer<Dtype> {
- public:
-  explicit InfogainLossLayer(const LayerParameter& param)
-      : LossLayer<Dtype>(param), infogain_() {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should
-  // be the infogain matrix.  (Otherwise the infogain matrix is loaded from a
-  // file specified by LayerParameter.)
-  virtual inline int ExactNumBottomBlobs() const { return -1; }
-  virtual inline int MinBottomBlobs() const { return 2; }
-  virtual inline int MaxBottomBlobs() const { return 3; }
-
-  virtual inline const char* type() const { return "InfogainLoss"; }
-
- protected:
-  /// @copydoc InfogainLossLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the infogain loss error gradient w.r.t. the predictions.
-   *
-   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
-   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-   * if propagate_down[1] is set. (The same applies to the infogain matrix, if
-   * provided as bottom[2] rather than in the layer_param.)
-   *
-   * @param top output Blob vector (length 1), providing the error gradient
-   *      with respect to the outputs
-   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-   *      as @f$ \lambda @f$ is the coefficient of this layer's output
-   *      @f$\ell_i@f$ in the overall Net loss
-   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-   *      (*Assuming that this top Blob is not used as a bottom (input) by any
-   *      other layer of the Net.)
-   * @param propagate_down see Layer::Backward.
-   *      propagate_down[1] must be false as we can't compute gradients with
-   *      respect to the labels (similarly for propagate_down[2] and the
-   *      infogain matrix, if provided as bottom[2])
-   * @param bottom input Blob vector (length 2-3)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the predictions @f$ \hat{p} @f$; Backward computes diff
-   *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
-   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-   *      the labels -- ignored as we can't compute their error gradients
-   *   -# @f$ (1 \times 1 \times K \times K) @f$
-   *      (\b optional) the information gain matrix -- ignored as its error
-   *      gradient computation is not implemented.
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  Blob<Dtype> infogain_;
+template<typename Dtype>
+class InfogainLossLayer: public LossLayer<Dtype> {
+	public:
+		explicit InfogainLossLayer(const LayerParameter& param)
+			: LossLayer<Dtype>(param), infogain_() {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		// InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should
+		// be the infogain matrix.  (Otherwise the infogain matrix is loaded from a
+		// file specified by LayerParameter.)
+		virtual inline int ExactNumBottomBlobs() const {
+			return -1;
+		}
+		virtual inline int MinBottomBlobs() const {
+			return 2;
+		}
+		virtual inline int MaxBottomBlobs() const {
+			return 3;
+		}
+
+		virtual inline const char* type() const {
+			return "InfogainLoss";
+		}
+
+	protected:
+		/// @copydoc InfogainLossLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the infogain loss error gradient w.r.t. the predictions.
+		 *
+		 * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+		 * if propagate_down[1] is set. (The same applies to the infogain matrix, if
+		 * provided as bottom[2] rather than in the layer_param.)
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient
+		 *      with respect to the outputs
+		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
+		 *      @f$\ell_i@f$ in the overall Net loss
+		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
+		 *      other layer of the Net.)
+		 * @param propagate_down see Layer::Backward.
+		 *      propagate_down[1] must be false as we can't compute gradients with
+		 *      respect to the labels (similarly for propagate_down[2] and the
+		 *      infogain matrix, if provided as bottom[2])
+		 * @param bottom input Blob vector (length 2-3)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the predictions @f$ \hat{p} @f$; Backward computes diff
+		 *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
+		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+		 *      the labels -- ignored as we can't compute their error gradients
+		 *   -# @f$ (1 \times 1 \times K \times K) @f$
+		 *      (\b optional) the information gain matrix -- ignored as its error
+		 *      gradient computation is not implemented.
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		Blob<Dtype> infogain_;
 };
 
 /**
@@ -511,51 +546,54 @@ class InfogainLossLayer : public LossLayer<Dtype> {
  *        \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n})
  *      @f$
  */
-template <typename Dtype>
-class MultinomialLogisticLossLayer : public LossLayer<Dtype> {
- public:
-  explicit MultinomialLogisticLossLayer(const LayerParameter& param)
-      : LossLayer<Dtype>(param) {}
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "MultinomialLogisticLoss"; }
-
- protected:
-  /// @copydoc MultinomialLogisticLossLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the multinomial logistic loss error gradient w.r.t. the
-   *        predictions.
-   *
-   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
-   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-   * if propagate_down[1] is set.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-   *      as @f$ \lambda @f$ is the coefficient of this layer's output
-   *      @f$\ell_i@f$ in the overall Net loss
-   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-   *      (*Assuming that this top Blob is not used as a bottom (input) by any
-   *      other layer of the Net.)
-   * @param propagate_down see Layer::Backward.
-   *      propagate_down[1] must be false as we can't compute gradients with
-   *      respect to the labels.
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the predictions @f$ \hat{p} @f$; Backward computes diff
-   *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
-   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-   *      the labels -- ignored as we can't compute their error gradients
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class MultinomialLogisticLossLayer: public LossLayer<Dtype> {
+	public:
+		explicit MultinomialLogisticLossLayer(const LayerParameter& param)
+			: LossLayer<Dtype>(param) {
+		}
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "MultinomialLogisticLoss";
+		}
+
+	protected:
+		/// @copydoc MultinomialLogisticLossLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the multinomial logistic loss error gradient w.r.t. the
+		 *        predictions.
+		 *
+		 * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+		 * if propagate_down[1] is set.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
+		 *      @f$\ell_i@f$ in the overall Net loss
+		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
+		 *      other layer of the Net.)
+		 * @param propagate_down see Layer::Backward.
+		 *      propagate_down[1] must be false as we can't compute gradients with
+		 *      respect to the labels.
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the predictions @f$ \hat{p} @f$; Backward computes diff
+		 *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
+		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+		 *      the labels -- ignored as we can't compute their error gradients
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -587,72 +625,75 @@ class MultinomialLogisticLossLayer : public LossLayer<Dtype> {
  *              \right]
  *      @f$
  */
-template <typename Dtype>
-class SigmoidCrossEntropyLossLayer : public LossLayer<Dtype> {
- public:
-  explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param)
-      : LossLayer<Dtype>(param),
-          sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
-          sigmoid_output_(new Blob<Dtype>()) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "SigmoidCrossEntropyLoss"; }
-
- protected:
-  /// @copydoc SigmoidCrossEntropyLossLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the
-   *        predictions.
-   *
-   * Gradients cannot be computed with respect to the target inputs (bottom[1]),
-   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-   * if propagate_down[1] is set.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-   *      as @f$ \lambda @f$ is the coefficient of this layer's output
-   *      @f$\ell_i@f$ in the overall Net loss
-   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-   *      (*Assuming that this top Blob is not used as a bottom (input) by any
-   *      other layer of the Net.)
-   * @param propagate_down see Layer::Backward.
-   *      propagate_down[1] must be false as gradient computation with respect
-   *      to the targets is not implemented.
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the predictions @f$x@f$; Backward computes diff
-   *      @f$ \frac{\partial E}{\partial x} =
-   *          \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n)
-   *      @f$
-   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-   *      the labels -- ignored as we can't compute their error gradients
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  /// The internal SigmoidLayer used to map predictions to probabilities.
-  shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
-  /// sigmoid_output stores the output of the SigmoidLayer.
-  shared_ptr<Blob<Dtype> > sigmoid_output_;
-  /// bottom vector holder to call the underlying SigmoidLayer::Forward
-  vector<Blob<Dtype>*> sigmoid_bottom_vec_;
-  /// top vector holder to call the underlying SigmoidLayer::Forward
-  vector<Blob<Dtype>*> sigmoid_top_vec_;
+template<typename Dtype>
+class SigmoidCrossEntropyLossLayer: public LossLayer<Dtype> {
+	public:
+		explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param)
+			: LossLayer<Dtype>(param),
+				sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
+				sigmoid_output_(new Blob<Dtype>()) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "SigmoidCrossEntropyLoss";
+		}
+
+	protected:
+		/// @copydoc SigmoidCrossEntropyLossLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the
+		 *        predictions.
+		 *
+		 * Gradients cannot be computed with respect to the target inputs (bottom[1]),
+		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+		 * if propagate_down[1] is set.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
+		 *      @f$\ell_i@f$ in the overall Net loss
+		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
+		 *      other layer of the Net.)
+		 * @param propagate_down see Layer::Backward.
+		 *      propagate_down[1] must be false as gradient computation with respect
+		 *      to the targets is not implemented.
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the predictions @f$x@f$; Backward computes diff
+		 *      @f$ \frac{\partial E}{\partial x} =
+		 *          \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n)
+		 *      @f$
+		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+		 *      the labels -- ignored as we can't compute their error gradients
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		/// The internal SigmoidLayer used to map predictions to probabilities.
+		shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
+		/// sigmoid_output stores the output of the SigmoidLayer.
+		shared_ptr<Blob<Dtype> > sigmoid_output_;
+		/// bottom vector holder to call the underlying SigmoidLayer::Forward
+		vector<Blob<Dtype>*> sigmoid_bottom_vec_;
+		/// top vector holder to call the underlying SigmoidLayer::Forward
+		vector<Blob<Dtype>*> sigmoid_top_vec_;
 };
 
 // Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer.
-template <typename Dtype> class SoftmaxLayer;
+template<typename Dtype> class SoftmaxLayer;
 
 /**
  * @brief Computes the multinomial logistic loss for a one-of-many
@@ -668,7 +709,7 @@ template <typename Dtype> class SoftmaxLayer;
  *   -# @f$ (N \times C \times H \times W) @f$
  *      the predictions @f$ x @f$, a Blob with values in
  *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
-ss
+ ss
  *      the @f$ K = CHW @f$ classes. This layer maps these scores to a
  *      probability distribution over classes using the softmax function
  *      @f$ \hat{p}_{nk} = \exp(x_{nk}) /
@@ -683,92 +724,101 @@ ss
  *        \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n})
  *      @f$, for softmax output class probabilites @f$ \hat{p} @f$
  */
-template <typename Dtype>
-class SoftmaxWithLossLayer : public LossLayer<Dtype> {
- public:
-   /**
-    * @param param provides LossParameter loss_param, with options:
-    *  - ignore_label (optional)
-    *    Specify a label value that should be ignored when computing the loss.
-    *  - normalize (optional, default true)
-    *    If true, the loss is normalized by the number of (nonignored) labels
-    *    present; otherwise the loss is simply summed over spatial locations.
-    */
-  explicit SoftmaxWithLossLayer(const LayerParameter& param)
-      : LossLayer<Dtype>(param) {}
-  ~SoftmaxWithLossLayer();
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "SoftmaxWithLoss"; }
-  virtual inline int ExactNumTopBlobs() const { return -1; }
-  virtual inline int MinTopBlobs() const { return 1; }
-  virtual inline int MaxTopBlobs() const { return 2; }
-
- protected:
-  /// @copydoc SoftmaxWithLossLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  /**
-   * @brief Computes the softmax loss error gradient w.r.t. the predictions.
-   *
-   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
-   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-   * if propagate_down[1] is set.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-   *      as @f$ \lambda @f$ is the coefficient of this layer's output
-   *      @f$\ell_i@f$ in the overall Net loss
-   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-   *      (*Assuming that this top Blob is not used as a bottom (input) by any
-   *      other layer of the Net.)
-   * @param propagate_down see Layer::Backward.
-   *      propagate_down[1] must be false as we can't compute gradients with
-   *      respect to the labels.
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the predictions @f$ x @f$; Backward computes diff
-   *      @f$ \frac{\partial E}{\partial x} @f$
-   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-   *      the labels -- ignored as we can't compute their error gradients
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  void ocl_setup();
- 
-  /// The internal SoftmaxLayer used to map predictions to a distribution.
-  shared_ptr<Layer<Dtype> > softmax_layer_;
-  /// prob stores the output probability predictions from the SoftmaxLayer.
-  Blob<Dtype> prob_;
-  /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
-  vector<Blob<Dtype>*> softmax_bottom_vec_;
-  /// top vector holder used in call to the underlying SoftmaxLayer::Forward
-  vector<Blob<Dtype>*> softmax_top_vec_;
-  /// Whether to ignore instances with a certain label.
-  bool has_ignore_label_;
-  /// The label indicating that an instance should be ignored.
-  int ignore_label_;
-  /// Whether to normalize the loss by the total number of values present
-  /// (otherwise just by the batch size).
-  bool normalize_;
-
-  int softmax_axis_, outer_num_, inner_num_;
-  
- protected:
-   cl_kernel diff_kernel, scal_kernel, softmax_kernel;
-   cl_mem d_loss;
-   cl_kernel softmax_loss_fp_kernel;
-   cl_kernel softmax_loss_bp_kernel;
+template<typename Dtype>
+class SoftmaxWithLossLayer: public LossLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides LossParameter loss_param, with options:
+		 *  - ignore_label (optional)
+		 *    Specify a label value that should be ignored when computing the loss.
+		 *  - normalize (optional, default true)
+		 *    If true, the loss is normalized by the number of (nonignored) labels
+		 *    present; otherwise the loss is simply summed over spatial locations.
+		 */
+		explicit SoftmaxWithLossLayer(const LayerParameter& param)
+			: LossLayer<Dtype>(param) {
+		}
+		~SoftmaxWithLossLayer();
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "SoftmaxWithLoss";
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return -1;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+		virtual inline int MaxTopBlobs() const {
+			return 2;
+		}
+
+	protected:
+		/// @copydoc SoftmaxWithLossLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		/**
+		 * @brief Computes the softmax loss error gradient w.r.t. the predictions.
+		 *
+		 * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+		 * if propagate_down[1] is set.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
+		 *      @f$\ell_i@f$ in the overall Net loss
+		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
+		 *      other layer of the Net.)
+		 * @param propagate_down see Layer::Backward.
+		 *      propagate_down[1] must be false as we can't compute gradients with
+		 *      respect to the labels.
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the predictions @f$ x @f$; Backward computes diff
+		 *      @f$ \frac{\partial E}{\partial x} @f$
+		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+		 *      the labels -- ignored as we can't compute their error gradients
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		void ocl_setup();
+
+		/// The internal SoftmaxLayer used to map predictions to a distribution.
+		shared_ptr<Layer<Dtype> > softmax_layer_;
+		/// prob stores the output probability predictions from the SoftmaxLayer.
+		Blob<Dtype> prob_;
+		/// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
+		vector<Blob<Dtype>*> softmax_bottom_vec_;
+		/// top vector holder used in call to the underlying SoftmaxLayer::Forward
+		vector<Blob<Dtype>*> softmax_top_vec_;
+		/// Whether to ignore instances with a certain label.
+		bool has_ignore_label_;
+		/// The label indicating that an instance should be ignored.
+		int ignore_label_;
+		/// Whether to normalize the loss by the total number of values present
+		/// (otherwise just by the batch size).
+		bool normalize_;
+
+		int softmax_axis_, outer_num_, inner_num_;
+
+	protected:
+		cl_kernel diff_kernel, scal_kernel, softmax_kernel;
+		cl_mem d_loss;
+		cl_kernel softmax_loss_fp_kernel;
+		cl_kernel softmax_loss_bp_kernel;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 5665df1e..68e631a1 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -20,249 +20,268 @@ namespace caffe {
  *
  * TODO(dox): more thorough description.
  */
-template <typename Dtype>
+template<typename Dtype>
 class Net {
- public:
-  explicit Net(const NetParameter& param);
-  explicit Net(const string& param_file, Phase phase);
-  virtual ~Net() {}
+	public:
+		explicit Net(const NetParameter& param);
+		explicit Net(const string& param_file, Phase phase);
+		virtual ~Net() {
+		}
 
-  /// @brief Initialize a network with a NetParameter.
-  void Init(const NetParameter& param);
+		/// @brief Initialize a network with a NetParameter.
+		void Init(const NetParameter& param);
 
-  /**
-   * @brief Run Forward with the input Blob%s already fed separately.
-   *
-   * You can get the input blobs using input_blobs().
-   */
-  const vector<Blob<Dtype>*>& ForwardPrefilled(Dtype* loss = NULL);
+		/**
+		 * @brief Run Forward with the input Blob%s already fed separately.
+		 *
+		 * You can get the input blobs using input_blobs().
+		 */
+		const vector<Blob<Dtype>*>& ForwardPrefilled(Dtype* loss = NULL);
 
-  /**
-   * The From and To variants of Forward and Backward operate on the
-   * (topological) ordering by which the net is specified. For general DAG
-   * networks, note that (1) computing from one layer to another might entail
-   * extra computation on unrelated branches, and (2) computation starting in
-   * the middle may be incorrect if all of the layers of a fan-in are not
-   * included.
-   */
-  Dtype ForwardFromTo(int start, int end);
-  Dtype ForwardFrom(int start);
-  Dtype ForwardTo(int end);
-  /// @brief Run forward using a set of bottom blobs, and return the result.
-  const vector<Blob<Dtype>*>& Forward(const vector<Blob<Dtype>* > & bottom,
-      Dtype* loss = NULL);
-  /**
-   * @brief Run forward using a serialized BlobProtoVector and return the
-   *        result as a serialized BlobProtoVector
-   */
-  string Forward(const string& input_blob_protos, Dtype* loss = NULL);
+		/**
+		 * The From and To variants of Forward and Backward operate on the
+		 * (topological) ordering by which the net is specified. For general DAG
+		 * networks, note that (1) computing from one layer to another might entail
+		 * extra computation on unrelated branches, and (2) computation starting in
+		 * the middle may be incorrect if all of the layers of a fan-in are not
+		 * included.
+		 */
+		Dtype ForwardFromTo(int start, int end);
+		Dtype ForwardFrom(int start);
+		Dtype ForwardTo(int end);
+		/// @brief Run forward using a set of bottom blobs, and return the result.
+		const vector<Blob<Dtype>*>& Forward(const vector<Blob<Dtype>*> & bottom,
+			Dtype* loss = NULL);
+		/**
+		 * @brief Run forward using a serialized BlobProtoVector and return the
+		 *        result as a serialized BlobProtoVector
+		 */
+		string Forward(const string& input_blob_protos, Dtype* loss = NULL);
 
-  /**
-   * The network backward should take no input and output, since it solely
-   * computes the gradient w.r.t the parameters, and the data has already been
-   * provided during the forward pass.
-   */
-  void Backward();
-  void BackwardFromTo(int start, int end);
-  void BackwardFrom(int start);
-  void BackwardTo(int end);
+		/**
+		 * The network backward should take no input and output, since it solely
+		 * computes the gradient w.r.t the parameters, and the data has already been
+		 * provided during the forward pass.
+		 */
+		void Backward();
+		void BackwardFromTo(int start, int end);
+		void BackwardFrom(int start);
+		void BackwardTo(int end);
 
-  /**
-   * @brief Reshape all layers from bottom to top.
-   *
-   * This is useful to propagate changes to layer sizes without running
-   * a forward pass, e.g. to compute output feature size.
-   */
-  void Reshape();
+		/**
+		 * @brief Reshape all layers from bottom to top.
+		 *
+		 * This is useful to propagate changes to layer sizes without running
+		 * a forward pass, e.g. to compute output feature size.
+		 */
+		void Reshape();
 
-  Dtype ForwardBackward(const vector<Blob<Dtype>* > & bottom) {
-    Dtype loss;
-    Forward(bottom, &loss);
-    Backward();
-    return loss;
-  }
+		Dtype ForwardBackward(const vector<Blob<Dtype>*> & bottom) {
+			Dtype loss;
+			Forward(bottom, &loss);
+			Backward();
+			return loss;
+		}
 
-  /// @brief Updates the network weights based on the diff values computed.
-  void Update();
+		/// @brief Updates the network weights based on the diff values computed.
+		void Update();
 
-  /**
-   * @brief For an already initialized net, implicitly copies (i.e., using no
-   *        additional memory) the pre-trained layers from another Net.
-   */
-  void ShareTrainedLayersWith(const Net* other);
-  // For an already initialized net, CopyTrainedLayersFrom() copies the already
-  // trained layers from another net parameter instance.
-  /**
-   * @brief For an already initialized net, copies the pre-trained layers from
-   *        another Net.
-   */
-  void CopyTrainedLayersFrom(const NetParameter& param);
-  void CopyTrainedLayersFrom(const string trained_filename);
-  /// @brief Writes the net to a proto.
-  void ToProto(NetParameter* param, bool write_diff = false) const;
+		/**
+		 * @brief For an already initialized net, implicitly copies (i.e., using no
+		 *        additional memory) the pre-trained layers from another Net.
+		 */
+		void ShareTrainedLayersWith(const Net* other);
+		// For an already initialized net, CopyTrainedLayersFrom() copies the already
+		// trained layers from another net parameter instance.
+		/**
+		 * @brief For an already initialized net, copies the pre-trained layers from
+		 *        another Net.
+		 */
+		void CopyTrainedLayersFrom(const NetParameter& param);
+		void CopyTrainedLayersFrom(const string trained_filename);
+		/// @brief Writes the net to a proto.
+		void ToProto(NetParameter* param, bool write_diff = false) const;
 
-  /// @brief returns the network name.
-  inline const string& name() const { return name_; }
-  /// @brief returns the layer names
-  inline const vector<string>& layer_names() const { return layer_names_; }
-  /// @brief returns the blob names
-  inline const vector<string>& blob_names() const { return blob_names_; }
-  /// @brief returns the blobs
-  inline const vector<shared_ptr<Blob<Dtype> > >& blobs() const {
-    return blobs_;
-  }
-  /// @brief returns the layers
-  inline const vector<shared_ptr<Layer<Dtype> > >& layers() const {
-    return layers_;
-  }
-  /// @brief returns the phase: TRAIN or TEST
-  inline Phase phase() const { return phase_; }
-  /**
-   * @brief returns the bottom vecs for each layer -- usually you won't
-   *        need this unless you do per-layer checks such as gradients.
-   */
-  inline const vector<vector<Blob<Dtype>*> >& bottom_vecs() const {
-    return bottom_vecs_;
-  }
-  /**
-   * @brief returns the top vecs for each layer -- usually you won't
-   *        need this unless you do per-layer checks such as gradients.
-   */
-  inline const vector<vector<Blob<Dtype>*> >& top_vecs() const {
-    return top_vecs_;
-  }
-  inline const vector<vector<bool> >& bottom_need_backward() const {
-    return bottom_need_backward_;
-  }
-  inline const vector<Dtype>& blob_loss_weights() const {
-    return blob_loss_weights_;
-  }
-  inline const vector<bool>& layer_need_backward() const {
-    return layer_need_backward_;
-  }
-  /// @brief returns the parameters
-  inline const vector<shared_ptr<Blob<Dtype> > >& params() const {
-    return params_;
-  }
-  /// @brief returns the parameter learning rate multipliers
-  inline const vector<float>& params_lr() const { return params_lr_; }
-  inline const vector<float>& params_weight_decay() const {
-    return params_weight_decay_;
-  }
-  const map<string, int>& param_names_index() const {
-    return param_names_index_;
-  }
-  inline const vector<int>& param_owners() const { return param_owners_; }
-  /// @brief Input and output blob numbers
-  inline int num_inputs() const { return net_input_blobs_.size(); }
-  inline int num_outputs() const { return net_output_blobs_.size(); }
-  inline const vector<Blob<Dtype>*>& input_blobs() const {
-    return net_input_blobs_;
-  }
-  inline const vector<Blob<Dtype>*>& output_blobs() const {
-    return net_output_blobs_;
-  }
-  inline const vector<int>& input_blob_indices() const {
-    return net_input_blob_indices_;
-  }
-  inline const vector<int>& output_blob_indices() const {
-    return net_output_blob_indices_;
-  }
-  bool has_blob(const string& blob_name) const;
-  const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name) const;
-  bool has_layer(const string& layer_name) const;
-  const shared_ptr<Layer<Dtype> > layer_by_name(const string& layer_name) const;
+		/// @brief returns the network name.
+		inline const string& name() const {
+			return name_;
+		}
+		/// @brief returns the layer names
+		inline const vector<string>& layer_names() const {
+			return layer_names_;
+		}
+		/// @brief returns the blob names
+		inline const vector<string>& blob_names() const {
+			return blob_names_;
+		}
+		/// @brief returns the blobs
+		inline const vector<shared_ptr<Blob<Dtype> > >& blobs() const {
+			return blobs_;
+		}
+		/// @brief returns the layers
+		inline const vector<shared_ptr<Layer<Dtype> > >& layers() const {
+			return layers_;
+		}
+		/// @brief returns the phase: TRAIN or TEST
+		inline Phase phase() const {
+			return phase_;
+		}
+		/**
+		 * @brief returns the bottom vecs for each layer -- usually you won't
+		 *        need this unless you do per-layer checks such as gradients.
+		 */
+		inline const vector<vector<Blob<Dtype>*> >& bottom_vecs() const {
+			return bottom_vecs_;
+		}
+		/**
+		 * @brief returns the top vecs for each layer -- usually you won't
+		 *        need this unless you do per-layer checks such as gradients.
+		 */
+		inline const vector<vector<Blob<Dtype>*> >& top_vecs() const {
+			return top_vecs_;
+		}
+		inline const vector<vector<bool> >& bottom_need_backward() const {
+			return bottom_need_backward_;
+		}
+		inline const vector<Dtype>& blob_loss_weights() const {
+			return blob_loss_weights_;
+		}
+		inline const vector<bool>& layer_need_backward() const {
+			return layer_need_backward_;
+		}
+		/// @brief returns the parameters
+		inline const vector<shared_ptr<Blob<Dtype> > >& params() const {
+			return params_;
+		}
+		/// @brief returns the parameter learning rate multipliers
+		inline const vector<float>& params_lr() const {
+			return params_lr_;
+		}
+		inline const vector<float>& params_weight_decay() const {
+			return params_weight_decay_;
+		}
+		const map<string, int>& param_names_index() const {
+			return param_names_index_;
+		}
+		inline const vector<int>& param_owners() const {
+			return param_owners_;
+		}
+		/// @brief Input and output blob numbers
+		inline int num_inputs() const {
+			return net_input_blobs_.size();
+		}
+		inline int num_outputs() const {
+			return net_output_blobs_.size();
+		}
+		inline const vector<Blob<Dtype>*>& input_blobs() const {
+			return net_input_blobs_;
+		}
+		inline const vector<Blob<Dtype>*>& output_blobs() const {
+			return net_output_blobs_;
+		}
+		inline const vector<int>& input_blob_indices() const {
+			return net_input_blob_indices_;
+		}
+		inline const vector<int>& output_blob_indices() const {
+			return net_output_blob_indices_;
+		}
+		bool has_blob(const string& blob_name) const;
+		const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name) const;
+		bool has_layer(const string& layer_name) const;
+		const shared_ptr<Layer<Dtype> > layer_by_name(
+			const string& layer_name) const;
 
-  void set_debug_info(const bool value) { debug_info_ = value; }
+		void set_debug_info(const bool value) {
+			debug_info_ = value;
+		}
 
-  // Helpers for Init.
-  /**
-   * @brief Remove layers that the user specified should be excluded given the current
-   *        phase, level, and stage.
-   */
-  static void FilterNet(const NetParameter& param,
-      NetParameter* param_filtered);
-  /// @brief return whether NetState state meets NetStateRule rule
-  static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
-      const string& layer_name);
+		// Helpers for Init.
+		/**
+		 * @brief Remove layers that the user specified should be excluded given the current
+		 *        phase, level, and stage.
+		 */
+		static void FilterNet(const NetParameter& param,
+			NetParameter* param_filtered);
+		/// @brief return whether NetState state meets NetStateRule rule
+		static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
+			const string& layer_name);
 
- protected:
-  // Helpers for Init.
-  /// @brief Append a new input or top blob to the net.
-  void AppendTop(const NetParameter& param, const int layer_id,
-                 const int top_id, set<string>* available_blobs,
-                 map<string, int>* blob_name_to_idx);
-  /// @brief Append a new bottom blob to the net.
-  int AppendBottom(const NetParameter& param, const int layer_id,
-                   const int bottom_id, set<string>* available_blobs,
-                   map<string, int>* blob_name_to_idx);
-  /// @brief Append a new parameter blob to the net.
-  void AppendParam(const NetParameter& param, const int layer_id,
-                   const int param_id);
+	protected:
+		// Helpers for Init.
+		/// @brief Append a new input or top blob to the net.
+		void AppendTop(const NetParameter& param, const int layer_id,
+			const int top_id, set<string>* available_blobs,
+			map<string, int>* blob_name_to_idx);
+		/// @brief Append a new bottom blob to the net.
+		int AppendBottom(const NetParameter& param, const int layer_id,
+			const int bottom_id, set<string>* available_blobs,
+			map<string, int>* blob_name_to_idx);
+		/// @brief Append a new parameter blob to the net.
+		void AppendParam(const NetParameter& param, const int layer_id,
+			const int param_id);
 
-  /// @brief Helper for displaying debug info in Forward about input Blobs.
-  void InputDebugInfo(const int layer_id);
-  /// @brief Helper for displaying debug info in Forward.
-  void ForwardDebugInfo(const int layer_id);
-  /// @brief Helper for displaying debug info in Backward.
-  void BackwardDebugInfo(const int layer_id);
-  /// @brief Helper for displaying debug info in Update.
-  void UpdateDebugInfo(const int param_id);
+		/// @brief Helper for displaying debug info in Forward about input Blobs.
+		void InputDebugInfo(const int layer_id);
+		/// @brief Helper for displaying debug info in Forward.
+		void ForwardDebugInfo(const int layer_id);
+		/// @brief Helper for displaying debug info in Backward.
+		void BackwardDebugInfo(const int layer_id);
+		/// @brief Helper for displaying debug info in Update.
+		void UpdateDebugInfo(const int param_id);
 
-  /// @brief Get misc parameters, e.g. the LR multiplier and weight decay.
-  void GetLearningRateAndWeightDecay();
+		/// @brief Get misc parameters, e.g. the LR multiplier and weight decay.
+		void GetLearningRateAndWeightDecay();
 
-  /// @brief The network name
-  string name_;
-  /// @brief The phase: TRAIN or TEST
-  Phase phase_;
-  /// @brief Individual layers in the net
-  vector<shared_ptr<Layer<Dtype> > > layers_;
-  vector<string> layer_names_;
-  map<string, int> layer_names_index_;
-  vector<bool> layer_need_backward_;
-  /// @brief the blobs storing intermediate results between the layer.
-  vector<shared_ptr<Blob<Dtype> > > blobs_;
-  vector<string> blob_names_;
-  map<string, int> blob_names_index_;
-  vector<bool> blob_need_backward_;
-  /// bottom_vecs stores the vectors containing the input for each layer.
-  /// They don't actually host the blobs (blobs_ does), so we simply store
-  /// pointers.
-  vector<vector<Blob<Dtype>*> > bottom_vecs_;
-  vector<vector<int> > bottom_id_vecs_;
-  vector<vector<bool> > bottom_need_backward_;
-  /// top_vecs stores the vectors containing the output for each layer
-  vector<vector<Blob<Dtype>*> > top_vecs_;
-  vector<vector<int> > top_id_vecs_;
-  /// Vector of weight in the loss (or objective) function of each net blob,
-  /// indexed by blob_id.
-  vector<Dtype> blob_loss_weights_;
-  vector<vector<int> > param_id_vecs_;
-  vector<int> param_owners_;
-  vector<string> param_display_names_;
-  vector<pair<int, int> > param_layer_indices_;
-  map<string, int> param_names_index_;
-  /// blob indices for the input and the output of the net
-  vector<int> net_input_blob_indices_;
-  vector<int> net_output_blob_indices_;
-  vector<Blob<Dtype>*> net_input_blobs_;
-  vector<Blob<Dtype>*> net_output_blobs_;
-  /// The parameters in the network.
-  vector<shared_ptr<Blob<Dtype> > > params_;
-  /// the learning rate multipliers
-  vector<float> params_lr_;
-  /// the weight decay multipliers
-  vector<float> params_weight_decay_;
-  /// The bytes of memory used by this net
-  size_t memory_used_;
-  /// Whether to compute and display debug info for the net.
-  bool debug_info_;
+		/// @brief The network name
+		string name_;
+		/// @brief The phase: TRAIN or TEST
+		Phase phase_;
+		/// @brief Individual layers in the net
+		vector<shared_ptr<Layer<Dtype> > > layers_;
+		vector<string> layer_names_;
+		map<string, int> layer_names_index_;
+		vector<bool> layer_need_backward_;
+		/// @brief the blobs storing intermediate results between the layer.
+		vector<shared_ptr<Blob<Dtype> > > blobs_;
+		vector<string> blob_names_;
+		map<string, int> blob_names_index_;
+		vector<bool> blob_need_backward_;
+		/// bottom_vecs stores the vectors containing the input for each layer.
+		/// They don't actually host the blobs (blobs_ does), so we simply store
+		/// pointers.
+		vector<vector<Blob<Dtype>*> > bottom_vecs_;
+		vector<vector<int> > bottom_id_vecs_;
+		vector<vector<bool> > bottom_need_backward_;
+		/// top_vecs stores the vectors containing the output for each layer
+		vector<vector<Blob<Dtype>*> > top_vecs_;
+		vector<vector<int> > top_id_vecs_;
+		/// Vector of weight in the loss (or objective) function of each net blob,
+		/// indexed by blob_id.
+		vector<Dtype> blob_loss_weights_;
+		vector<vector<int> > param_id_vecs_;
+		vector<int> param_owners_;
+		vector<string> param_display_names_;
+		vector<pair<int, int> > param_layer_indices_;
+		map<string, int> param_names_index_;
+		/// blob indices for the input and the output of the net
+		vector<int> net_input_blob_indices_;
+		vector<int> net_output_blob_indices_;
+		vector<Blob<Dtype>*> net_input_blobs_;
+		vector<Blob<Dtype>*> net_output_blobs_;
+		/// The parameters in the network.
+		vector<shared_ptr<Blob<Dtype> > > params_;
+		/// the learning rate multipliers
+		vector<float> params_lr_;
+		/// the weight decay multipliers
+		vector<float> params_weight_decay_;
+		/// The bytes of memory used by this net
+		size_t memory_used_;
+		/// Whether to compute and display debug info for the net.
+		bool debug_info_;
 
-  DISABLE_COPY_AND_ASSIGN(Net);
+		DISABLE_COPY_AND_ASSIGN (Net);
 };
 
-
 }  // namespace caffe
 
 #endif  // CAFFE_NET_HPP_
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index cf6d645a..5606ff65 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -22,16 +22,21 @@ namespace caffe {
  *        each element of the output depends only on the corresponding input
  *        element.
  */
-template <typename Dtype>
-class NeuronLayer : public Layer<Dtype> {
- public:
-  explicit NeuronLayer(const LayerParameter& param)
-     : Layer<Dtype>(param) {}
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
+template<typename Dtype>
+class NeuronLayer: public Layer<Dtype> {
+	public:
+		explicit NeuronLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
 };
 
 /**
@@ -44,46 +49,53 @@ class NeuronLayer : public Layer<Dtype> {
  *   -# @f$ (N \times C \times H \times W) @f$
  *      the computed outputs @f$ y = |x| @f$
  */
-template <typename Dtype>
-class AbsValLayer : public NeuronLayer<Dtype> {
- public:
-  explicit AbsValLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "AbsVal"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  /// @copydoc AbsValLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the absolute value inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial x} =
-   *            \mathrm{sign}(x) \frac{\partial E}{\partial y}
-   *      @f$ if propagate_down[0]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class AbsValLayer: public NeuronLayer<Dtype> {
+	public:
+		explicit AbsValLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "AbsVal";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		/// @copydoc AbsValLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the absolute value inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial x} =
+		 *            \mathrm{sign}(x) \frac{\partial E}{\partial y}
+		 *      @f$ if propagate_down[0]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -103,41 +115,44 @@ class AbsValLayer : public NeuronLayer<Dtype> {
  *         \end{array} \right.
  *      @f$
  */
-template <typename Dtype>
-class BNLLLayer : public NeuronLayer<Dtype> {
- public:
-  explicit BNLLLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-
-  virtual inline const char* type() const { return "BNLL"; }
-
- protected:
-  /// @copydoc BNLLLayer
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the BNLL inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 2)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial x}
-   *      @f$ if propagate_down[0]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class BNLLLayer: public NeuronLayer<Dtype> {
+	public:
+		explicit BNLLLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+
+		virtual inline const char* type() const {
+			return "BNLL";
+		}
+
+	protected:
+		/// @copydoc BNLLLayer
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the BNLL inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 2)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial x}
+		 *      @f$ if propagate_down[0]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -151,64 +166,66 @@ class BNLLLayer : public NeuronLayer<Dtype> {
  *   -# @f$ (N \times C \times H \times W) @f$
  *      the computed outputs @f$ y = |x| @f$
  */
-template <typename Dtype>
-class DropoutLayer : public NeuronLayer<Dtype> {
- public:
-  /**
-   * @param param provides DropoutParameter dropout_param,
-   *     with DropoutLayer options:
-   *   - dropout_ratio (\b optional, default 0.5).
-   *     Sets the probability @f$ p @f$ that any given unit is dropped.
-   */
-  explicit DropoutLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Dropout"; }
-  virtual ~DropoutLayer();
-  void ocl_setup(int bottom_count);
-  cl_mem MaskMem;
-  cl_kernel ocl_Kernel_Fwd;
-  cl_kernel ocl_Kernel_Bwd;
-  cl_kernel rng_kernel;
-
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the computed outputs. At training time, we have @f$
-   *      y_{\mbox{train}} = \left\{
-   *         \begin{array}{ll}
-   *            \frac{x}{1 - p} & \mbox{if } u > p \\
+template<typename Dtype>
+class DropoutLayer: public NeuronLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides DropoutParameter dropout_param,
+		 *     with DropoutLayer options:
+		 *   - dropout_ratio (\b optional, default 0.5).
+		 *     Sets the probability @f$ p @f$ that any given unit is dropped.
+		 */
+		explicit DropoutLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Dropout";
+		}
+		virtual ~DropoutLayer();
+		void ocl_setup(int bottom_count);
+		cl_mem MaskMem;
+		cl_kernel ocl_Kernel_Fwd;
+		cl_kernel ocl_Kernel_Bwd;
+		cl_kernel rng_kernel;
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the computed outputs. At training time, we have @f$
+		 *      y_{\mbox{train}} = \left\{
+		 *         \begin{array}{ll}
+		 *            \frac{x}{1 - p} & \mbox{if } u > p \\
    *            0 & \mbox{otherwise}
-   *         \end{array} \right.
-   *      @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each
-   *      input at each iteration. At test time, we simply have
-   *      @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$.
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$
-  Blob<unsigned int> rand_vec_;
-  /// the probability @f$ p @f$ of dropping any input
-  Dtype threshold_;
-  /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$
-  Dtype scale_;
-  unsigned int uint_thres_;
+		 *         \end{array} \right.
+		 *      @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each
+		 *      input at each iteration. At test time, we simply have
+		 *      @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$.
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		/// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$
+		Blob<unsigned int> rand_vec_;
+		/// the probability @f$ p @f$ of dropping any input
+		Dtype threshold_;
+		/// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$
+		Dtype scale_;
+		unsigned int uint_thres_;
 };
 
 /**
@@ -216,63 +233,66 @@ class DropoutLayer : public NeuronLayer<Dtype> {
  *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
  *        and base @f$ \gamma @f$.
  */
-template <typename Dtype>
-class ExpLayer : public NeuronLayer<Dtype> {
- public:
-  /**
-   * @param param provides ExpParameter exp_param,
-   *     with ExpLayer options:
-   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
-   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
-   *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
-   *         the base @f$ \gamma @f$
-   */
-  explicit ExpLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Exp"; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the computed outputs @f$
-   *        y = \gamma ^ {\alpha x + \beta}
-   *      @f$
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the exp inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial x} =
-   *            \frac{\partial E}{\partial y} y \alpha \log_e(gamma)
-   *      @f$ if propagate_down[0]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  Dtype inner_scale_, outer_scale_;
+template<typename Dtype>
+class ExpLayer: public NeuronLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides ExpParameter exp_param,
+		 *     with ExpLayer options:
+		 *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+		 *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+		 *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
+		 *         the base @f$ \gamma @f$
+		 */
+		explicit ExpLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Exp";
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the computed outputs @f$
+		 *        y = \gamma ^ {\alpha x + \beta}
+		 *      @f$
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the exp inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial x} =
+		 *            \frac{\partial E}{\partial y} y \alpha \log_e(gamma)
+		 *      @f$ if propagate_down[0]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		Dtype inner_scale_, outer_scale_;
 };
 
 /**
@@ -280,65 +300,68 @@ class ExpLayer : public NeuronLayer<Dtype> {
  *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
  *        and base @f$ \gamma @f$.
  */
-template <typename Dtype>
-class LogLayer : public NeuronLayer<Dtype> {
- public:
-  /**
-   * @param param provides LogParameter log_param,
-   *     with LogLayer options:
-   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
-   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
-   *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
-   *         the base @f$ \gamma @f$
-   */
-  explicit LogLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Log"; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the computed outputs @f$
-   *        y = log_{\gamma}(\alpha x + \beta)
-   *      @f$
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the exp inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial x} =
-   *            \frac{\partial E}{\partial y} y \alpha \log_e(gamma)
-   *      @f$ if propagate_down[0]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  Dtype base_scale_;
-  Dtype input_scale_, input_shift_;
-  Dtype backward_num_scale_;
+template<typename Dtype>
+class LogLayer: public NeuronLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides LogParameter log_param,
+		 *     with LogLayer options:
+		 *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+		 *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+		 *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
+		 *         the base @f$ \gamma @f$
+		 */
+		explicit LogLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Log";
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the computed outputs @f$
+		 *        y = log_{\gamma}(\alpha x + \beta)
+		 *      @f$
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the exp inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial x} =
+		 *            \frac{\partial E}{\partial y} y \alpha \log_e(gamma)
+		 *      @f$ if propagate_down[0]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		Dtype base_scale_;
+		Dtype input_scale_, input_shift_;
+		Dtype backward_num_scale_;
 };
 
 /**
@@ -346,141 +369,146 @@ class LogLayer : public NeuronLayer<Dtype> {
  *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
  *        and power @f$ \gamma @f$.
  */
-template <typename Dtype>
-class PowerLayer : public NeuronLayer<Dtype> {
- public:
-  /**
-   * @param param provides PowerParameter power_param,
-   *     with PowerLayer options:
-   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
-   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
-   *   - power (\b optional, default 1) the power @f$ \gamma @f$
-   */
-  explicit PowerLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Power"; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the computed outputs @f$
-   *        y = (\alpha x + \beta) ^ \gamma
-   *      @f$
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the power inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial x} =
-   *            \frac{\partial E}{\partial y}
-   *            \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} =
-   *            \frac{\partial E}{\partial y}
-   *            \frac{\alpha \gamma y}{\alpha x + \beta}
-   *      @f$ if propagate_down[0]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  /// @brief @f$ \gamma @f$ from layer_param_.power_param()
-  Dtype power_;
-  /// @brief @f$ \alpha @f$ from layer_param_.power_param()
-  Dtype scale_;
-  /// @brief @f$ \beta @f$ from layer_param_.power_param()
-  Dtype shift_;
-  /// @brief Result of @f$ \alpha \gamma @f$
-  Dtype diff_scale_;
+template<typename Dtype>
+class PowerLayer: public NeuronLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides PowerParameter power_param,
+		 *     with PowerLayer options:
+		 *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+		 *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+		 *   - power (\b optional, default 1) the power @f$ \gamma @f$
+		 */
+		explicit PowerLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Power";
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the computed outputs @f$
+		 *        y = (\alpha x + \beta) ^ \gamma
+		 *      @f$
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the power inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial x} =
+		 *            \frac{\partial E}{\partial y}
+		 *            \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} =
+		 *            \frac{\partial E}{\partial y}
+		 *            \frac{\alpha \gamma y}{\alpha x + \beta}
+		 *      @f$ if propagate_down[0]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		/// @brief @f$ \gamma @f$ from layer_param_.power_param()
+		Dtype power_;
+		/// @brief @f$ \alpha @f$ from layer_param_.power_param()
+		Dtype scale_;
+		/// @brief @f$ \beta @f$ from layer_param_.power_param()
+		Dtype shift_;
+		/// @brief Result of @f$ \alpha \gamma @f$
+		Dtype diff_scale_;
 };
 
 /**
  * @brief Rectified Linear Unit non-linearity @f$ y = \max(0, x) @f$.
  *        The simple max is fast to compute, and the function does not saturate.
  */
-template <typename Dtype>
-class ReLULayer : public NeuronLayer<Dtype> {
- public:
-  /**
-   * @param param provides ReLUParameter relu_param,
-   *     with ReLULayer options:
-   *   - negative_slope (\b optional, default 0).
-   *     the value @f$ \nu @f$ by which negative values are multiplied.
-   */
-  explicit ReLULayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {
-    }
-  virtual inline const char* type() const { return "ReLU"; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the computed outputs @f$
-   *        y = \max(0, x)
-   *      @f$ by default.  If a non-zero negative_slope @f$ \nu @f$ is provided,
-   *      the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$.
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the ReLU inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial x} = \left\{
-   *        \begin{array}{lr}
-   *            0 & \mathrm{if} \; x \le 0 \\
+template<typename Dtype>
+class ReLULayer: public NeuronLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides ReLUParameter relu_param,
+		 *     with ReLULayer options:
+		 *   - negative_slope (\b optional, default 0).
+		 *     the value @f$ \nu @f$ by which negative values are multiplied.
+		 */
+		explicit ReLULayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+		virtual inline const char* type() const {
+			return "ReLU";
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the computed outputs @f$
+		 *        y = \max(0, x)
+		 *      @f$ by default.  If a non-zero negative_slope @f$ \nu @f$ is provided,
+		 *      the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$.
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the ReLU inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial x} = \left\{
+		 *        \begin{array}{lr}
+		 *            0 & \mathrm{if} \; x \le 0 \\
    *            \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0
-   *        \end{array} \right.
-   *      @f$ if propagate_down[0], by default.
-   *      If a non-zero negative_slope @f$ \nu @f$ is provided,
-   *      the computed gradients are @f$
-   *        \frac{\partial E}{\partial x} = \left\{
-   *        \begin{array}{lr}
-   *            \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\
+		 *        \end{array} \right.
+		 *      @f$ if propagate_down[0], by default.
+		 *      If a non-zero negative_slope @f$ \nu @f$ is provided,
+		 *      the computed gradients are @f$
+		 *        \frac{\partial E}{\partial x} = \left\{
+		 *        \begin{array}{lr}
+		 *            \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\
    *            \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0
-   *        \end{array} \right.
-   *      @f$.
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		 *        \end{array} \right.
+		 *      @f$.
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -489,25 +517,25 @@ class ReLULayer : public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNReLULayer : public ReLULayer<Dtype> {
- public:
-  explicit CuDNNReLULayer(const LayerParameter& param)
-      : ReLULayer<Dtype>(param), handles_setup_(false) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual ~CuDNNReLULayer();
-
- protected:
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool handles_setup_;
-  cudnnHandle_t             handle_;
-  cudnnTensorDescriptor_t bottom_desc_;
-  cudnnTensorDescriptor_t top_desc_;
+	public:
+	explicit CuDNNReLULayer(const LayerParameter& param)
+	: ReLULayer<Dtype>(param), handles_setup_(false) {}
+	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual ~CuDNNReLULayer();
+
+	protected:
+	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+	bool handles_setup_;
+	cudnnHandle_t handle_;
+	cudnnTensorDescriptor_t bottom_desc_;
+	cudnnTensorDescriptor_t top_desc_;
 };
 #endif
 
@@ -519,51 +547,54 @@ class CuDNNReLULayer : public ReLULayer<Dtype> {
  * Note that the gradient vanishes as the values move away from 0.
  * The ReLULayer is often a better choice for this reason.
  */
-template <typename Dtype>
-class SigmoidLayer : public NeuronLayer<Dtype> {
- public:
-  explicit SigmoidLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-
-  virtual inline const char* type() const { return "Sigmoid"; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the computed outputs @f$
-   *        y = (1 + \exp(-x))^{-1}
-   *      @f$
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the sigmoid inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial x}
-   *            = \frac{\partial E}{\partial y} y (1 - y)
-   *      @f$ if propagate_down[0]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class SigmoidLayer: public NeuronLayer<Dtype> {
+	public:
+		explicit SigmoidLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+
+		virtual inline const char* type() const {
+			return "Sigmoid";
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the computed outputs @f$
+		 *        y = (1 + \exp(-x))^{-1}
+		 *      @f$
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the sigmoid inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial x}
+		 *            = \frac{\partial E}{\partial y} y (1 - y)
+		 *      @f$ if propagate_down[0]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -572,25 +603,25 @@ class SigmoidLayer : public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
- public:
-  explicit CuDNNSigmoidLayer(const LayerParameter& param)
-      : SigmoidLayer<Dtype>(param), handles_setup_(false) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual ~CuDNNSigmoidLayer();
-
- protected:
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool handles_setup_;
-  cudnnHandle_t             handle_;
-  cudnnTensorDescriptor_t bottom_desc_;
-  cudnnTensorDescriptor_t top_desc_;
+	public:
+	explicit CuDNNSigmoidLayer(const LayerParameter& param)
+	: SigmoidLayer<Dtype>(param), handles_setup_(false) {}
+	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual ~CuDNNSigmoidLayer();
+
+	protected:
+	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+	bool handles_setup_;
+	cudnnHandle_t handle_;
+	cudnnTensorDescriptor_t bottom_desc_;
+	cudnnTensorDescriptor_t top_desc_;
 };
 #endif
 
@@ -602,53 +633,56 @@ class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
  * Note that the gradient vanishes as the values move away from 0.
  * The ReLULayer is often a better choice for this reason.
  */
-template <typename Dtype>
-class TanHLayer : public NeuronLayer<Dtype> {
- public:
-  explicit TanHLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-
-  virtual inline const char* type() const { return "TanH"; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the computed outputs @f$
-   *        y = \frac{\exp(2x) - 1}{\exp(2x) + 1}
-   *      @f$
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the sigmoid inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$; Backward fills their diff with
-   *      gradients @f$
-   *        \frac{\partial E}{\partial x}
-   *            = \frac{\partial E}{\partial y}
-   *              \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right)
-   *            = \frac{\partial E}{\partial y} (1 - y^2)
-   *      @f$ if propagate_down[0]
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class TanHLayer: public NeuronLayer<Dtype> {
+	public:
+		explicit TanHLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+
+		virtual inline const char* type() const {
+			return "TanH";
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the computed outputs @f$
+		 *        y = \frac{\exp(2x) - 1}{\exp(2x) + 1}
+		 *      @f$
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the sigmoid inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$; Backward fills their diff with
+		 *      gradients @f$
+		 *        \frac{\partial E}{\partial x}
+		 *            = \frac{\partial E}{\partial y}
+		 *              \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right)
+		 *            = \frac{\partial E}{\partial y} (1 - y^2)
+		 *      @f$ if propagate_down[0]
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -657,25 +691,25 @@ class TanHLayer : public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNTanHLayer : public TanHLayer<Dtype> {
- public:
-  explicit CuDNNTanHLayer(const LayerParameter& param)
-      : TanHLayer<Dtype>(param), handles_setup_(false) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual ~CuDNNTanHLayer();
-
- protected:
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool handles_setup_;
-  cudnnHandle_t             handle_;
-  cudnnTensorDescriptor_t bottom_desc_;
-  cudnnTensorDescriptor_t top_desc_;
+	public:
+	explicit CuDNNTanHLayer(const LayerParameter& param)
+	: TanHLayer<Dtype>(param), handles_setup_(false) {}
+	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual ~CuDNNTanHLayer();
+
+	protected:
+	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+	bool handles_setup_;
+	cudnnHandle_t handle_;
+	cudnnTensorDescriptor_t bottom_desc_;
+	cudnnTensorDescriptor_t top_desc_;
 };
 #endif
 
@@ -683,48 +717,51 @@ class CuDNNTanHLayer : public TanHLayer<Dtype> {
  * @brief Tests whether the input exceeds a threshold: outputs 1 for inputs
  *        above threshold; 0 otherwise.
  */
-template <typename Dtype>
-class ThresholdLayer : public NeuronLayer<Dtype> {
- public:
-  /**
-   * @param param provides ThresholdParameter threshold_param,
-   *     with ThresholdLayer options:
-   *   - threshold (\b optional, default 0).
-   *     the threshold value @f$ t @f$ to which the input values are compared.
-   */
-  explicit ThresholdLayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Threshold"; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times H \times W) @f$
-   *      the computed outputs @f$
-   *       y = \left\{
-   *       \begin{array}{lr}
-   *         0 & \mathrm{if} \; x \le t \\
+template<typename Dtype>
+class ThresholdLayer: public NeuronLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides ThresholdParameter threshold_param,
+		 *     with ThresholdLayer options:
+		 *   - threshold (\b optional, default 0).
+		 *     the threshold value @f$ t @f$ to which the input values are compared.
+		 */
+		explicit ThresholdLayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Threshold";
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times H \times W) @f$
+		 *      the computed outputs @f$
+		 *       y = \left\{
+		 *       \begin{array}{lr}
+		 *         0 & \mathrm{if} \; x \le t \\
    *         1 & \mathrm{if} \; x > t
-   *       \end{array} \right.
-   *      @f$
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  /// @brief Not implemented (non-differentiable function)
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-    NOT_IMPLEMENTED;
-  }
-
-  Dtype threshold_;
+		 *       \end{array} \right.
+		 *      @f$
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		/// @brief Not implemented (non-differentiable function)
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+			NOT_IMPLEMENTED;
+		}
+
+		Dtype threshold_;
 };
 
 /**
@@ -735,81 +772,84 @@ class ThresholdLayer : public NeuronLayer<Dtype> {
  *        channels. The number of axes of input blob should be greater than or
  *        equal to 2. The 1st axis (0-based) is seen as channels.
  */
-template <typename Dtype>
-class PReLULayer : public NeuronLayer<Dtype> {
- public:
-  /**
-   * @param param provides PReLUParameter prelu_param,
-   *     with PReLULayer options:
-   *   - filler (\b optional, FillerParameter,
-   *     default {'type': constant 'value':0.25}).
-   *   - channel_shared (\b optional, default false).
-   *     negative slopes are shared across channels.
-   */
-  explicit PReLULayer(const LayerParameter& param)
-      : NeuronLayer<Dtype>(param) {}
-
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "PReLU"; }
-
- protected:
-  /**
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times ...) @f$
-   *      the inputs @f$ x @f$
-   * @param top output Blob vector (length 1)
-   *   -# @f$ (N \times C \times ...) @f$
-   *      the computed outputs for each channel @f$i@f$ @f$
-   *        y_i = \max(0, x_i) + a_i \min(0, x_i)
-   *      @f$.
-   */
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  /**
-   * @brief Computes the error gradient w.r.t. the PReLU inputs.
-   *
-   * @param top output Blob vector (length 1), providing the error gradient with
-   *      respect to the outputs
-   *   -# @f$ (N \times C \times ...) @f$
-   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-   *      with respect to computed outputs @f$ y @f$
-   * @param propagate_down see Layer::Backward.
-   * @param bottom input Blob vector (length 1)
-   *   -# @f$ (N \times C \times ...) @f$
-   *      the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their
-   *      diff with gradients @f$
-   *        \frac{\partial E}{\partial x_i} = \left\{
-   *        \begin{array}{lr}
-   *            a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
+template<typename Dtype>
+class PReLULayer: public NeuronLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides PReLUParameter prelu_param,
+		 *     with PReLULayer options:
+		 *   - filler (\b optional, FillerParameter,
+		 *     default {'type': constant 'value':0.25}).
+		 *   - channel_shared (\b optional, default false).
+		 *     negative slopes are shared across channels.
+		 */
+		explicit PReLULayer(const LayerParameter& param)
+			: NeuronLayer<Dtype>(param) {
+		}
+
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "PReLU";
+		}
+
+	protected:
+		/**
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times ...) @f$
+		 *      the inputs @f$ x @f$
+		 * @param top output Blob vector (length 1)
+		 *   -# @f$ (N \times C \times ...) @f$
+		 *      the computed outputs for each channel @f$i@f$ @f$
+		 *        y_i = \max(0, x_i) + a_i \min(0, x_i)
+		 *      @f$.
+		 */
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		/**
+		 * @brief Computes the error gradient w.r.t. the PReLU inputs.
+		 *
+		 * @param top output Blob vector (length 1), providing the error gradient with
+		 *      respect to the outputs
+		 *   -# @f$ (N \times C \times ...) @f$
+		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+		 *      with respect to computed outputs @f$ y @f$
+		 * @param propagate_down see Layer::Backward.
+		 * @param bottom input Blob vector (length 1)
+		 *   -# @f$ (N \times C \times ...) @f$
+		 *      the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their
+		 *      diff with gradients @f$
+		 *        \frac{\partial E}{\partial x_i} = \left\{
+		 *        \begin{array}{lr}
+		 *            a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
    *            \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i > 0
-   *        \end{array} \right.
-   *      @f$.
-   *      If param_propagate_down_[0] is true, it fills the diff with gradients
-   *      @f$
-   *        \frac{\partial E}{\partial a_i} = \left\{
-   *        \begin{array}{lr}
-   *            \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
+		 *        \end{array} \right.
+		 *      @f$.
+		 *      If param_propagate_down_[0] is true, it fills the diff with gradients
+		 *      @f$
+		 *        \frac{\partial E}{\partial a_i} = \left\{
+		 *        \begin{array}{lr}
+		 *            \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
    *            0 & \mathrm{if} \; x_i > 0
-   *        \end{array} \right.
-   *      @f$.
-   */
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool channel_shared_;
-  Blob<Dtype> multiplier_;  // dot multiplier for backward computation of params
-  Blob<Dtype> backward_buff_;  // temporary buffer for backward computation
-  Blob<Dtype> bottom_memory_;  // memory for in-place computation
+		 *        \end{array} \right.
+		 *      @f$.
+		 */
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		bool channel_shared_;
+		Blob<Dtype> multiplier_; // dot multiplier for backward computation of params
+		Blob<Dtype> backward_buff_;  // temporary buffer for backward computation
+		Blob<Dtype> bottom_memory_;  // memory for in-place computation
 };
 
 }  // namespace caffe
diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp
index 19cf18c9..653f5e36 100644
--- a/include/caffe/python_layer.hpp
+++ b/include/caffe/python_layer.hpp
@@ -10,56 +10,59 @@ namespace bp = boost::python;
 
 namespace caffe {
 
-template <typename Dtype>
-class PythonLayer : public Layer<Dtype> {
- public:
-  PythonLayer(PyObject* self, const LayerParameter& param)
-      : Layer<Dtype>(param), self_(bp::handle<>(bp::borrowed(self))) { }
+template<typename Dtype>
+class PythonLayer: public Layer<Dtype> {
+	public:
+		PythonLayer(PyObject* self, const LayerParameter& param)
+			: Layer<Dtype>(param), self_(bp::handle<>(bp::borrowed(self))) {
+		}
 
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-    try {
-      self_.attr("setup")(bottom, top);
-    } catch (bp::error_already_set) {
-      PyErr_Print();
-      throw;
-    }
-  }
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+			try {
+				self_.attr("setup")(bottom, top);
+			} catch (bp::error_already_set) {
+				PyErr_Print();
+				throw;
+			}
+		}
 
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-    try {
-      self_.attr("reshape")(bottom, top);
-    } catch (bp::error_already_set) {
-      PyErr_Print();
-      throw;
-    }
-  }
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+			try {
+				self_.attr("reshape")(bottom, top);
+			} catch (bp::error_already_set) {
+				PyErr_Print();
+				throw;
+			}
+		}
 
-  virtual inline const char* type() const { return "Python"; }
+		virtual inline const char* type() const {
+			return "Python";
+		}
 
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-    try {
-      self_.attr("forward")(bottom, top);
-    } catch (bp::error_already_set) {
-      PyErr_Print();
-      throw;
-    }
-  }
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-    try {
-      self_.attr("backward")(top, propagate_down, bottom);
-    } catch (bp::error_already_set) {
-      PyErr_Print();
-      throw;
-    }
-  }
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top) {
+			try {
+				self_.attr("forward")(bottom, top);
+			} catch (bp::error_already_set) {
+				PyErr_Print();
+				throw;
+			}
+		}
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+			try {
+				self_.attr("backward")(top, propagate_down, bottom);
+			} catch (bp::error_already_set) {
+				PyErr_Print();
+				throw;
+			}
+		}
 
- private:
-  bp::object self_;
+	private:
+		bp::object self_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 79285a4a..688fb99f 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -14,151 +14,169 @@ namespace caffe {
  * Requires implementation of ApplyUpdate to compute a parameter update
  * given the current state of the Net parameters.
  */
-template <typename Dtype>
+template<typename Dtype>
 class Solver {
- public:
-  explicit Solver(const SolverParameter& param);
-  explicit Solver(const string& param_file);
-  void Init(const SolverParameter& param);
-  void InitTrainNet();
-  void InitTestNets();
-  // The main entry of the solver function. In default, iter will be zero. Pass
-  // in a non-zero iter number to resume training for a pre-trained net.
-  virtual void Solve(const char* resume_file = NULL);
-  inline void Solve(const string resume_file) { Solve(resume_file.c_str()); }
-  void Step(int iters);
-  // The Restore function implements how one should restore the solver to a
-  // previously snapshotted state. You should implement the RestoreSolverState()
-  // function that restores the state from a SolverState protocol buffer.
-  void Restore(const char* resume_file);
-  virtual ~Solver() {}
-  inline shared_ptr<Net<Dtype> > net() { return net_; }
-  inline const vector<shared_ptr<Net<Dtype> > >& test_nets() {
-    return test_nets_;
-  }
-  int iter() { return iter_; }
-
- protected:
-  // Make and apply the update value for the current iteration.
-  virtual void ApplyUpdate() = 0;
-  // The Solver::Snapshot function implements the basic snapshotting utility
-  // that stores the learned net. You should implement the SnapshotSolverState()
-  // function that produces a SolverState protocol buffer that needs to be
-  // written to disk together with the learned net.
-  void Snapshot();
-  // The test routine
-  void TestAll();
-  void Test(const int test_net_id = 0);
-  virtual void SnapshotSolverState(SolverState* state) = 0;
-  virtual void RestoreSolverState(const SolverState& state) = 0;
-
-  void DisplayOutputBlobs(const int net_id);
-
-  SolverParameter param_;
-  int iter_;
-  int current_step_;
-  shared_ptr<Net<Dtype> > net_;
-  vector<shared_ptr<Net<Dtype> > > test_nets_;
- 
- void ocl_setup();
- protected:
- cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
-
-  DISABLE_COPY_AND_ASSIGN(Solver);
+	public:
+		explicit Solver(const SolverParameter& param);
+		explicit Solver(const string& param_file);
+		void Init(const SolverParameter& param);
+		void InitTrainNet();
+		void InitTestNets();
+		// The main entry of the solver function. In default, iter will be zero. Pass
+		// in a non-zero iter number to resume training for a pre-trained net.
+		virtual void Solve(const char* resume_file = NULL);
+		inline void Solve(const string resume_file) {
+			Solve(resume_file.c_str());
+		}
+		void Step(int iters);
+		// The Restore function implements how one should restore the solver to a
+		// previously snapshotted state. You should implement the RestoreSolverState()
+		// function that restores the state from a SolverState protocol buffer.
+		void Restore(const char* resume_file);
+		virtual ~Solver() {
+		}
+		inline shared_ptr<Net<Dtype> > net() {
+			return net_;
+		}
+		inline const vector<shared_ptr<Net<Dtype> > >& test_nets() {
+			return test_nets_;
+		}
+		int iter() {
+			return iter_;
+		}
+
+	protected:
+		// Make and apply the update value for the current iteration.
+		virtual void ApplyUpdate() = 0;
+		// The Solver::Snapshot function implements the basic snapshotting utility
+		// that stores the learned net. You should implement the SnapshotSolverState()
+		// function that produces a SolverState protocol buffer that needs to be
+		// written to disk together with the learned net.
+		void Snapshot();
+		// The test routine
+		void TestAll();
+		void Test(const int test_net_id = 0);
+		virtual void SnapshotSolverState(SolverState* state) = 0;
+		virtual void RestoreSolverState(const SolverState& state) = 0;
+
+		void DisplayOutputBlobs(const int net_id);
+
+		SolverParameter param_;
+		int iter_;
+		int current_step_;
+		shared_ptr<Net<Dtype> > net_;
+		vector<shared_ptr<Net<Dtype> > > test_nets_;
+
+		void ocl_setup();
+		protected:
+		cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
+
+		DISABLE_COPY_AND_ASSIGN (Solver);
 };
 
-
 /**
  * @brief Optimizes the parameters of a Net using
  *        stochastic gradient descent (SGD) with momentum.
  */
-template <typename Dtype>
-class SGDSolver : public Solver<Dtype> {
- public:
-  explicit SGDSolver(const SolverParameter& param)
-      : Solver<Dtype>(param) { PreSolve(); }
-  explicit SGDSolver(const string& param_file)
-      : Solver<Dtype>(param_file) { PreSolve(); }
-
-  const vector<shared_ptr<Blob<Dtype> > >& history() { return history_; }
-
- protected:
-  void PreSolve();
-  Dtype GetLearningRate();
-  virtual void ApplyUpdate();
-  virtual void Normalize(int param_id);
-  virtual void Regularize(int param_id);
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
-  virtual void ClipGradients();
-  virtual void SnapshotSolverState(SolverState * state);
-  virtual void RestoreSolverState(const SolverState& state);
-  // history maintains the historical momentum data.
-  // update maintains update related data and is not needed in snapshots.
-  // temp maintains other information that might be needed in computation
-  //   of gradients/updates and is not needed in snapshots
-  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
-
- void ocl_setup();
- protected:
- cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
-
-  DISABLE_COPY_AND_ASSIGN(SGDSolver);
+template<typename Dtype>
+class SGDSolver: public Solver<Dtype> {
+	public:
+		explicit SGDSolver(const SolverParameter& param)
+			: Solver<Dtype>(param) {
+			PreSolve();
+		}
+		explicit SGDSolver(const string& param_file)
+			: Solver<Dtype>(param_file) {
+			PreSolve();
+		}
+
+		const vector<shared_ptr<Blob<Dtype> > >& history() {
+			return history_;
+		}
+
+	protected:
+		void PreSolve();
+		Dtype GetLearningRate();
+		virtual void ApplyUpdate();
+		virtual void Normalize(int param_id);
+		virtual void Regularize(int param_id);
+		virtual void ComputeUpdateValue(int param_id, Dtype rate);
+		virtual void ClipGradients();
+		virtual void SnapshotSolverState(SolverState * state);
+		virtual void RestoreSolverState(const SolverState& state);
+		// history maintains the historical momentum data.
+		// update maintains update related data and is not needed in snapshots.
+		// temp maintains other information that might be needed in computation
+		//   of gradients/updates and is not needed in snapshots
+		vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
+
+		void ocl_setup();
+		protected:
+		cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
+
+		DISABLE_COPY_AND_ASSIGN (SGDSolver);
 };
 
-template <typename Dtype>
-class NesterovSolver : public SGDSolver<Dtype> {
- public:
-  explicit NesterovSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) {}
-  explicit NesterovSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) {}
+template<typename Dtype>
+class NesterovSolver: public SGDSolver<Dtype> {
+	public:
+		explicit NesterovSolver(const SolverParameter& param)
+			: SGDSolver<Dtype>(param) {
+		}
+		explicit NesterovSolver(const string& param_file)
+			: SGDSolver<Dtype>(param_file) {
+		}
 
- protected:
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+	protected:
+		virtual void ComputeUpdateValue(int param_id, Dtype rate);
 
- void ocl_setup();
- protected:
- cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
+		void ocl_setup();
+		protected:
+		cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
 
-  DISABLE_COPY_AND_ASSIGN(NesterovSolver);
+		DISABLE_COPY_AND_ASSIGN (NesterovSolver);
 };
 
-template <typename Dtype>
-class AdaGradSolver : public SGDSolver<Dtype> {
- public:
-  explicit AdaGradSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
-  explicit AdaGradSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
-
- protected:
-  virtual void ComputeUpdateValue(int param_id, Dtype rate);
-  void constructor_sanity_check() {
-    CHECK_EQ(0, this->param_.momentum())
-        << "Momentum cannot be used with AdaGrad.";
-  }
-
- void ocl_setup();
- protected:
- cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
-  DISABLE_COPY_AND_ASSIGN(AdaGradSolver);
+template<typename Dtype>
+class AdaGradSolver: public SGDSolver<Dtype> {
+	public:
+		explicit AdaGradSolver(const SolverParameter& param)
+			: SGDSolver<Dtype>(param) {
+			constructor_sanity_check();
+		}
+		explicit AdaGradSolver(const string& param_file)
+			: SGDSolver<Dtype>(param_file) {
+			constructor_sanity_check();
+		}
+
+	protected:
+		virtual void ComputeUpdateValue(int param_id, Dtype rate);
+		void constructor_sanity_check() {
+			CHECK_EQ(0, this->param_.momentum())
+				<< "Momentum cannot be used with AdaGrad.";
+		}
+
+		void ocl_setup();
+		protected:
+		cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
+		DISABLE_COPY_AND_ASSIGN (AdaGradSolver);
 };
 
-template <typename Dtype>
+template<typename Dtype>
 Solver<Dtype>* GetSolver(const SolverParameter& param) {
-  SolverParameter_SolverType type = param.solver_type();
-
-  switch (type) {
-  case SolverParameter_SolverType_SGD:
-      return new SGDSolver<Dtype>(param);
-  case SolverParameter_SolverType_NESTEROV:
-      return new NesterovSolver<Dtype>(param);
-  case SolverParameter_SolverType_ADAGRAD:
-      return new AdaGradSolver<Dtype>(param);
-  default:
-      LOG(FATAL) << "Unknown SolverType: " << type;
-  }
-  return (Solver<Dtype>*) NULL;
+	SolverParameter_SolverType type = param.solver_type();
+
+	switch (type) {
+		case SolverParameter_SolverType_SGD:
+			return new SGDSolver<Dtype>(param);
+		case SolverParameter_SolverType_NESTEROV:
+			return new NesterovSolver<Dtype>(param);
+		case SolverParameter_SolverType_ADAGRAD:
+			return new AdaGradSolver<Dtype>(param);
+		default:
+			LOG(FATAL) << "Unknown SolverType: " << type;
+	}
+	return (Solver<Dtype>*) NULL;
 }
 
 }  // namespace caffe
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 0fe6546d..0b053a48 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -49,15 +49,14 @@ namespace caffe {
 // does not seem to create a memory bottleneck here.
 
 inline void CaffeMallocHost(void** ptr, size_t size) {
-  *ptr = malloc(size);
-  CHECK(*ptr) << "host allocation of size " << size << " failed";
+	*ptr = malloc(size);
+	CHECK(*ptr) << "host allocation of size " << size << " failed";
 }
 
 inline void CaffeFreeHost(void* ptr) {
-  free(ptr);
+	free(ptr);
 }
 
-
 /**
  * @brief Manages memory allocation and synchronization between the host (CPU)
  *        and device (GPU).
@@ -65,47 +64,56 @@ inline void CaffeFreeHost(void* ptr) {
  * TODO(dox): more thorough description.
  */
 class SyncedMemory {
- public:
-  SyncedMemory()
-      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-        own_cpu_data_(false), data_layer_(false) {
-        ocl_setup();
-        }
-  explicit SyncedMemory(size_t size)
-      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
-        own_cpu_data_(false), data_layer_(false) {
-        ocl_setup();
-        }
+	public:
+		SyncedMemory()
+			: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
+				own_cpu_data_(false), data_layer_(false) {
+			ocl_setup();
+		}
+		explicit SyncedMemory(size_t size)
+			: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
+				own_cpu_data_(false), data_layer_(false) {
+			ocl_setup();
+		}
 
-  ~SyncedMemory();
-  const void* cpu_data();
-  void set_cpu_data(void* data);
-  const void* gpu_data();
-  const void* gpu_cache_data();
-  void* mutable_cpu_data();
-  void* mutable_gpu_data();
-  enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
-  SyncedHead head() { return head_; }
-  size_t size() { return size_; }
-  void set_data_layer(){ data_layer_ = true; }
- private:
-   void ocl_setup();
- protected:
-   cl_kernel oclmem_kernel;
+		~SyncedMemory();
+		const void* cpu_data();
+		void set_cpu_data(void* data);
+		const void* gpu_data();
+		const void* gpu_cache_data();
+		void* mutable_cpu_data();
+		void* mutable_gpu_data();
+		enum SyncedHead {
+			UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED
+		};
+		SyncedHead head() {
+			return head_;
+		}
+		size_t size() {
+			return size_;
+		}
+		void set_data_layer() {
+			data_layer_ = true;
+		}
+	private:
+		void ocl_setup();
+		protected:
+		cl_kernel oclmem_kernel;
 
- private:
-  void to_cpu();
-  void to_gpu();
-  void* cpu_ptr_;
-  void* gpu_ptr_;
-  void* gpu_cache_ptr_;
-  size_t size_;
-  SyncedHead head_;
-  bool own_cpu_data_;
-  bool data_layer_;
-  DISABLE_COPY_AND_ASSIGN(SyncedMemory);
-};  // class SyncedMemory
+	private:
+		void to_cpu();
+		void to_gpu();
+		void* cpu_ptr_;
+		void* gpu_ptr_;
+		void* gpu_cache_ptr_;
+		size_t size_;
+		SyncedHead head_;
+		bool own_cpu_data_;
+		bool data_layer_;
+		DISABLE_COPY_AND_ASSIGN (SyncedMemory);
+};
+// class SyncedMemory
 
-}  // namespace caffe
+}// namespace caffe
 
 #endif  // CAFFE_SYNCEDMEM_HPP_
diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp
index b4f8f284..179e31ca 100644
--- a/include/caffe/test/test_caffe_main.hpp
+++ b/include/caffe/test/test_caffe_main.hpp
@@ -15,61 +15,62 @@ using std::cout;
 using std::endl;
 
 #ifdef CMAKE_BUILD
-  #include "caffe_config.h"
+#include "caffe_config.h"
 #else
-  #define OPENCL_TEST_DEVICE -1
-  #define CMAKE_SOURCE_DIR "src/"
-  #define EXAMPLES_SOURCE_DIR "examples/"
-  #define CMAKE_EXT ""
+#define OPENCL_TEST_DEVICE -1
+#define CMAKE_SOURCE_DIR "src/"
+#define EXAMPLES_SOURCE_DIR "examples/"
+#define CMAKE_EXT ""
 #endif
 
 int main(int argc, char** argv);
 
 namespace caffe {
 
-template <typename TypeParam>
-class MultiDeviceTest : public ::testing::Test {
- public:
-  typedef typename TypeParam::Dtype Dtype;
- protected:
-  MultiDeviceTest() {
-    Caffe::set_mode(TypeParam::device);
-  }
-  virtual ~MultiDeviceTest() {}
+template<typename TypeParam>
+class MultiDeviceTest: public ::testing::Test {
+	public:
+		typedef typename TypeParam::Dtype Dtype;
+		protected:
+		MultiDeviceTest() {
+			Caffe::set_mode(TypeParam::device);
+		}
+		virtual ~MultiDeviceTest() {
+		}
 };
 
 typedef ::testing::Types<float, double> TestDtypes;
 
-template <typename TypeParam>
+template<typename TypeParam>
 struct CPUDevice {
-  typedef TypeParam Dtype;
-  static const Caffe::Brew device = Caffe::CPU;
+		typedef TypeParam Dtype;
+		static const Caffe::Brew device = Caffe::CPU;
 };
 
-template <typename Dtype>
-class CPUDeviceTest : public MultiDeviceTest<CPUDevice<Dtype> > {
+template<typename Dtype>
+class CPUDeviceTest: public MultiDeviceTest<CPUDevice<Dtype> > {
 };
 
 #ifdef CPU_ONLY
 
 typedef ::testing::Types<CPUDevice<float>,
-                         CPUDevice<double> > TestDtypesAndDevices;
+CPUDevice<double> > TestDtypesAndDevices;
 
 #else
 
-template <typename TypeParam>
+template<typename TypeParam>
 struct GPUDevice {
-  typedef TypeParam Dtype;
-  static const Caffe::Brew device = Caffe::GPU;
+		typedef TypeParam Dtype;
+		static const Caffe::Brew device = Caffe::GPU;
 };
 
-template <typename Dtype>
-class GPUDeviceTest : public MultiDeviceTest<GPUDevice<Dtype> > {
+template<typename Dtype>
+class GPUDeviceTest: public MultiDeviceTest<GPUDevice<Dtype> > {
 };
 
 typedef ::testing::Types<CPUDevice<float>, CPUDevice<double>,
-                         GPUDevice<float>, GPUDevice<double> >
-                         TestDtypesAndDevices;
+	GPUDevice<float>, GPUDevice<double> >
+TestDtypesAndDevices;
 
 #endif
 
diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp
index cc5dcbad..07fe69cf 100644
--- a/include/caffe/test/test_gradient_check_util.hpp
+++ b/include/caffe/test/test_gradient_check_util.hpp
@@ -15,244 +15,244 @@ namespace caffe {
 
 // The gradient checker adds a L2 normalization loss function on top of the
 // top blobs, and checks the gradient.
-template <typename Dtype>
+template<typename Dtype>
 class GradientChecker {
- public:
-  // kink and kink_range specify an ignored nonsmooth region of the form
-  // kink - kink_range <= |feature value| <= kink + kink_range,
-  // which accounts for all nonsmoothness in use by caffe
-  GradientChecker(const Dtype stepsize, const Dtype threshold,
-      const unsigned int seed = 1701, const Dtype kink = 0.,
-      const Dtype kink_range = -1)
-      : stepsize_(stepsize), threshold_(threshold), seed_(seed),
-        kink_(kink), kink_range_(kink_range) {}
-  // Checks the gradient of a layer, with provided bottom layers and top
-  // layers.
-  // Note that after the gradient check, we do not guarantee that the data
-  // stored in the layer parameters and the blobs are unchanged.
-  void CheckGradient(Layer<Dtype>* layer, const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top, int check_bottom = -1) {
-      layer->SetUp(bottom, top);
-      CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1);
-  }
-  void CheckGradientExhaustive(Layer<Dtype>* layer,
-      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
-      int check_bottom = -1);
+	public:
+		// kink and kink_range specify an ignored nonsmooth region of the form
+		// kink - kink_range <= |feature value| <= kink + kink_range,
+		// which accounts for all nonsmoothness in use by caffe
+		GradientChecker(const Dtype stepsize, const Dtype threshold,
+			const unsigned int seed = 1701, const Dtype kink = 0.,
+			const Dtype kink_range = -1)
+			: stepsize_(stepsize), threshold_(threshold), seed_(seed),
+				kink_(kink), kink_range_(kink_range) {
+		}
+		// Checks the gradient of a layer, with provided bottom layers and top
+		// layers.
+		// Note that after the gradient check, we do not guarantee that the data
+		// stored in the layer parameters and the blobs are unchanged.
+		void CheckGradient(Layer<Dtype>* layer, const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top, int check_bottom = -1) {
+			layer->SetUp(bottom, top);
+			CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1);
+		}
+		void CheckGradientExhaustive(Layer<Dtype>* layer,
+			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
+			int check_bottom = -1);
 
-  // CheckGradientEltwise can be used to test layers that perform element-wise
-  // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when
-  // i != j.
-  void CheckGradientEltwise(Layer<Dtype>* layer,
-      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+		// CheckGradientEltwise can be used to test layers that perform element-wise
+		// computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when
+		// i != j.
+		void CheckGradientEltwise(Layer<Dtype>* layer,
+			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
 
-  void CheckGradientSingle(Layer<Dtype>* layer,
-      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
-      int check_bottom, int top_id, int top_data_id, bool element_wise = false);
+		void CheckGradientSingle(Layer<Dtype>* layer,
+			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
+			int check_bottom, int top_id, int top_data_id, bool element_wise = false);
 
-  // Checks the gradient of a network. This network should not have any data
-  // layers or loss layers, since the function does not explicitly deal with
-  // such cases yet. All input blobs and parameter blobs are going to be
-  // checked, layer-by-layer to avoid numerical problems to accumulate.
-  void CheckGradientNet(const Net<Dtype>& net,
-      const vector<Blob<Dtype>*>& input);
+		// Checks the gradient of a network. This network should not have any data
+		// layers or loss layers, since the function does not explicitly deal with
+		// such cases yet. All input blobs and parameter blobs are going to be
+		// checked, layer-by-layer to avoid numerical problems to accumulate.
+		void CheckGradientNet(const Net<Dtype>& net,
+			const vector<Blob<Dtype>*>& input);
 
- protected:
-  Dtype GetObjAndGradient(const Layer<Dtype>& layer,
-      const vector<Blob<Dtype>*>& top, int top_id = -1, int top_data_id = -1);
-  Dtype stepsize_;
-  Dtype threshold_;
-  unsigned int seed_;
-  Dtype kink_;
-  Dtype kink_range_;
+	protected:
+		Dtype GetObjAndGradient(const Layer<Dtype>& layer,
+			const vector<Blob<Dtype>*>& top, int top_id = -1, int top_data_id = -1);
+		Dtype stepsize_;
+		Dtype threshold_;
+		unsigned int seed_;
+		Dtype kink_;
+		Dtype kink_range_;
 };
 
-
-template <typename Dtype>
+template<typename Dtype>
 void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
-    int check_bottom, int top_id, int top_data_id, bool element_wise) {
-  if (element_wise) {
-    CHECK_EQ(0, layer->blobs().size());
-    CHECK_LE(0, top_id);
-    CHECK_LE(0, top_data_id);
-    const int top_count = top[top_id]->count();
-    for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) {
-      CHECK_EQ(top_count, bottom[blob_id]->count());
-    }
-  }
-  // First, figure out what blobs we need to check against, and zero init
-  // parameter blobs.
-  vector<Blob<Dtype>*> blobs_to_check;
-  vector<bool> propagate_down(bottom.size(), check_bottom < 0);
-  for (int i = 0; i < layer->blobs().size(); ++i) {
-    Blob<Dtype>* blob = layer->blobs()[i].get();
-    caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
-    blobs_to_check.push_back(blob);
-  }
-  if (check_bottom < 0) {
-    for (int i = 0; i < bottom.size(); ++i) {
-      blobs_to_check.push_back(bottom[i]);
-    }
-  } else {
-    CHECK_LT(check_bottom, bottom.size());
-    blobs_to_check.push_back(bottom[check_bottom]);
-    propagate_down[check_bottom] = true;
-  }
-  // Compute the gradient analytically using Backward
-  Caffe::set_random_seed(seed_);
-  // Ignore the loss from the layer (it's just the weighted sum of the losses
-  // from the top blobs, whose gradients we may want to test individually).
-  layer->Forward(bottom, top);
-  // Get additional loss from the objective
-  GetObjAndGradient(*layer, top, top_id, top_data_id);
-  layer->Backward(top, propagate_down, bottom);
-  // Store computed gradients for all checked blobs
-  vector<shared_ptr<Blob<Dtype> > >
-      computed_gradient_blobs(blobs_to_check.size());
-  for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
-    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
-    computed_gradient_blobs[blob_id].reset(new Blob<Dtype>());
-    computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob);
-    const int count = blobs_to_check[blob_id]->count();
-    const Dtype* diff = blobs_to_check[blob_id]->cpu_diff();
-    Dtype* computed_gradients =
-        computed_gradient_blobs[blob_id]->mutable_cpu_data();
-    caffe_copy(count, diff, computed_gradients);
-  }
-  // Compute derivative of top w.r.t. each bottom and parameter input using
-  // finite differencing.
-  // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
-  for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
-    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
-    const Dtype* computed_gradients =
-        computed_gradient_blobs[blob_id]->cpu_data();
-    // LOG(ERROR) << "Blob " << blob_id << ": checking "
-    //     << current_blob->count() << " parameters.";
-    for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
-      // For an element-wise layer, we only need to do finite differencing to
-      // compute the derivative of top[top_id][top_data_id] w.r.t.
-      // bottom[blob_id][i] only for i == top_data_id.  For any other
-      // i != top_data_id, we know the derivative is 0 by definition, and simply
-      // check that that's true.
-      Dtype estimated_gradient = 0;
-      Dtype positive_objective = 0;
-      Dtype negative_objective = 0;
-      if (!element_wise || (feat_id == top_data_id)) {
-        // Do finite differencing.
-        // Compute loss with stepsize_ added to input.
-        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
-        Caffe::set_random_seed(seed_);
-        layer->Forward(bottom, top);
-        positive_objective =
-            GetObjAndGradient(*layer, top, top_id, top_data_id);
-        // Compute loss with stepsize_ subtracted from input.
-        current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
-        Caffe::set_random_seed(seed_);
-        layer->Forward(bottom, top);
-        negative_objective =
-            GetObjAndGradient(*layer, top, top_id, top_data_id);
-        // Recover original input value.
-        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
-        estimated_gradient = (positive_objective - negative_objective) /
-            stepsize_ / 2.;
-      }
-      Dtype computed_gradient = computed_gradients[feat_id];
-      Dtype feature = current_blob->cpu_data()[feat_id];
-      // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
-      //     << current_blob->cpu_diff()[feat_id];
-      if (kink_ - kink_range_ > fabs(feature)
-          || fabs(feature) > kink_ + kink_range_) {
-        // We check relative accuracy, but for too small values, we threshold
-        // the scale factor by 1.
-        Dtype scale = std::max(
-            std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.);
-        EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale)
-          << "debug: (top_id, top_data_id, blob_id, feat_id)="
-          << top_id << "," << top_data_id << "," << blob_id << "," << feat_id
-          << "; feat = " << feature
-          << "; objective+ = " << positive_objective
-          << "; objective- = " << negative_objective;
-      }
-      // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
-      // LOG(ERROR) << "computed gradient: " << computed_gradient
-      //    << " estimated_gradient: " << estimated_gradient;
-    }
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
+	int check_bottom, int top_id, int top_data_id, bool element_wise) {
+	if (element_wise) {
+		CHECK_EQ(0, layer->blobs().size());
+		CHECK_LE(0, top_id);
+		CHECK_LE(0, top_data_id);
+		const int top_count = top[top_id]->count();
+		for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) {
+			CHECK_EQ(top_count, bottom[blob_id]->count());
+		}
+	}
+	// First, figure out what blobs we need to check against, and zero init
+	// parameter blobs.
+	vector<Blob<Dtype>*> blobs_to_check;
+	vector<bool> propagate_down(bottom.size(), check_bottom < 0);
+	for (int i = 0; i < layer->blobs().size(); ++i) {
+		Blob<Dtype>* blob = layer->blobs()[i].get();
+		caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
+		blobs_to_check.push_back(blob);
+	}
+	if (check_bottom < 0) {
+		for (int i = 0; i < bottom.size(); ++i) {
+			blobs_to_check.push_back(bottom[i]);
+		}
+	} else {
+		CHECK_LT(check_bottom, bottom.size());
+		blobs_to_check.push_back(bottom[check_bottom]);
+		propagate_down[check_bottom] = true;
+	}
+	// Compute the gradient analytically using Backward
+	Caffe::set_random_seed(seed_);
+	// Ignore the loss from the layer (it's just the weighted sum of the losses
+	// from the top blobs, whose gradients we may want to test individually).
+	layer->Forward(bottom, top);
+	// Get additional loss from the objective
+	GetObjAndGradient(*layer, top, top_id, top_data_id);
+	layer->Backward(top, propagate_down, bottom);
+	// Store computed gradients for all checked blobs
+	vector < shared_ptr<Blob<Dtype> > >
+		computed_gradient_blobs(blobs_to_check.size());
+	for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
+		Blob<Dtype>* current_blob = blobs_to_check[blob_id];
+		computed_gradient_blobs[blob_id].reset(new Blob<Dtype>());
+		computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob);
+		const int count = blobs_to_check[blob_id]->count();
+		const Dtype* diff = blobs_to_check[blob_id]->cpu_diff();
+		Dtype* computed_gradients =
+			computed_gradient_blobs[blob_id]->mutable_cpu_data();
+		caffe_copy(count, diff, computed_gradients);
+	}
+	// Compute derivative of top w.r.t. each bottom and parameter input using
+	// finite differencing.
+	// LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
+	for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
+		Blob<Dtype>* current_blob = blobs_to_check[blob_id];
+		const Dtype* computed_gradients =
+			computed_gradient_blobs[blob_id]->cpu_data();
+		// LOG(ERROR) << "Blob " << blob_id << ": checking "
+		//     << current_blob->count() << " parameters.";
+		for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
+			// For an element-wise layer, we only need to do finite differencing to
+			// compute the derivative of top[top_id][top_data_id] w.r.t.
+			// bottom[blob_id][i] only for i == top_data_id.  For any other
+			// i != top_data_id, we know the derivative is 0 by definition, and simply
+			// check that that's true.
+			Dtype estimated_gradient = 0;
+			Dtype positive_objective = 0;
+			Dtype negative_objective = 0;
+			if (!element_wise || (feat_id == top_data_id)) {
+				// Do finite differencing.
+				// Compute loss with stepsize_ added to input.
+				current_blob->mutable_cpu_data()[feat_id] += stepsize_;
+				Caffe::set_random_seed(seed_);
+				layer->Forward(bottom, top);
+				positive_objective =
+					GetObjAndGradient(*layer, top, top_id, top_data_id);
+				// Compute loss with stepsize_ subtracted from input.
+				current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
+				Caffe::set_random_seed(seed_);
+				layer->Forward(bottom, top);
+				negative_objective =
+					GetObjAndGradient(*layer, top, top_id, top_data_id);
+				// Recover original input value.
+				current_blob->mutable_cpu_data()[feat_id] += stepsize_;
+				estimated_gradient = (positive_objective - negative_objective) /
+					stepsize_ / 2.;
+			}
+			Dtype computed_gradient = computed_gradients[feat_id];
+			Dtype feature = current_blob->cpu_data()[feat_id];
+			// LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
+			//     << current_blob->cpu_diff()[feat_id];
+			if (kink_ - kink_range_ > fabs(feature)
+				|| fabs(feature) > kink_ + kink_range_) {
+				// We check relative accuracy, but for too small values, we threshold
+				// the scale factor by 1.
+				Dtype scale = std::max(
+					std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.);
+				EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale)
+					<< "debug: (top_id, top_data_id, blob_id, feat_id)="
+					<< top_id << "," << top_data_id << "," << blob_id << "," << feat_id
+					<< "; feat = " << feature
+					<< "; objective+ = " << positive_objective
+					<< "; objective- = " << negative_objective;
+			}
+			// LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
+			// LOG(ERROR) << "computed gradient: " << computed_gradient
+			//    << " estimated_gradient: " << estimated_gradient;
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void GradientChecker<Dtype>::CheckGradientExhaustive(Layer<Dtype>* layer,
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
-    int check_bottom) {
-  layer->SetUp(bottom, top);
-  CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob.";
-  // LOG(ERROR) << "Exhaustive Mode.";
-  for (int i = 0; i < top.size(); ++i) {
-    // LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count();
-    for (int j = 0; j < top[i]->count(); ++j) {
-      // LOG(ERROR) << "Exhaustive: blob " << i << " data " << j;
-      CheckGradientSingle(layer, bottom, top, check_bottom, i, j);
-    }
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
+	int check_bottom) {
+	layer->SetUp(bottom, top);
+	CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob.";
+	// LOG(ERROR) << "Exhaustive Mode.";
+	for (int i = 0; i < top.size(); ++i) {
+		// LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count();
+		for (int j = 0; j < top[i]->count(); ++j) {
+			// LOG(ERROR) << "Exhaustive: blob " << i << " data " << j;
+			CheckGradientSingle(layer, bottom, top, check_bottom, i, j);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void GradientChecker<Dtype>::CheckGradientEltwise(Layer<Dtype>* layer,
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  layer->SetUp(bottom, top);
-  CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob.";
-  const int check_bottom = -1;
-  const bool element_wise = true;
-  for (int i = 0; i < top.size(); ++i) {
-    for (int j = 0; j < top[i]->count(); ++j) {
-      CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise);
-    }
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	layer->SetUp(bottom, top);
+	CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob.";
+	const int check_bottom = -1;
+	const bool element_wise = true;
+	for (int i = 0; i < top.size(); ++i) {
+		for (int j = 0; j < top[i]->count(); ++j) {
+			CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void GradientChecker<Dtype>::CheckGradientNet(
-    const Net<Dtype>& net, const vector<Blob<Dtype>*>& input) {
-  const vector<shared_ptr<Layer<Dtype> > >& layers = net.layers();
-  vector<vector<Blob<Dtype>*> >& bottom_vecs = net.bottom_vecs();
-  vector<vector<Blob<Dtype>*> >& top_vecs = net.top_vecs();
-  for (int i = 0; i < layers.size(); ++i) {
-    net.Forward(input);
-    LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name();
-    CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]);
-  }
+	const Net<Dtype>& net, const vector<Blob<Dtype>*>& input) {
+	const vector<shared_ptr<Layer<Dtype> > >& layers = net.layers();
+	vector < vector<Blob<Dtype>*> > &bottom_vecs = net.bottom_vecs();
+	vector < vector<Blob<Dtype>*> > &top_vecs = net.top_vecs();
+	for (int i = 0; i < layers.size(); ++i) {
+		net.Forward(input);
+		LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name();
+		CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype GradientChecker<Dtype>::GetObjAndGradient(const Layer<Dtype>& layer,
-    const vector<Blob<Dtype>*>& top, int top_id, int top_data_id) {
-  Dtype loss = 0;
-  if (top_id < 0) {
-    // the loss will be half of the sum of squares of all outputs
-    for (int i = 0; i < top.size(); ++i) {
-      Blob<Dtype>* top_blob = top[i];
-      const Dtype* top_blob_data = top_blob->cpu_data();
-      Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
-      int count = top_blob->count();
-      for (int j = 0; j < count; ++j) {
-        loss += top_blob_data[j] * top_blob_data[j];
-      }
-      // set the diff: simply the data.
-      caffe_copy(top_blob->count(), top_blob_data, top_blob_diff);
-    }
-    loss /= 2.;
-  } else {
-    // the loss will be the top_data_id-th element in the top_id-th blob.
-    for (int i = 0; i < top.size(); ++i) {
-      Blob<Dtype>* top_blob = top[i];
-      Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
-      caffe_set(top_blob->count(), Dtype(0), top_blob_diff);
-    }
-    const Dtype loss_weight = 2;
-    loss = top[top_id]->cpu_data()[top_data_id] * loss_weight;
-    top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight;
-  }
-  return loss;
+	const vector<Blob<Dtype>*>& top, int top_id, int top_data_id) {
+	Dtype loss = 0;
+	if (top_id < 0) {
+		// the loss will be half of the sum of squares of all outputs
+		for (int i = 0; i < top.size(); ++i) {
+			Blob<Dtype>* top_blob = top[i];
+			const Dtype* top_blob_data = top_blob->cpu_data();
+			Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
+			int count = top_blob->count();
+			for (int j = 0; j < count; ++j) {
+				loss += top_blob_data[j] * top_blob_data[j];
+			}
+			// set the diff: simply the data.
+			caffe_copy(top_blob->count(), top_blob_data, top_blob_diff);
+		}
+		loss /= 2.;
+	} else {
+		// the loss will be the top_data_id-th element in the top_id-th blob.
+		for (int i = 0; i < top.size(); ++i) {
+			Blob<Dtype>* top_blob = top[i];
+			Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
+			caffe_set(top_blob->count(), Dtype(0), top_blob_diff);
+		}
+		const Dtype loss_weight = 2;
+		loss = top[top_id]->cpu_data()[top_data_id] * loss_weight;
+		top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight;
+	}
+	return loss;
 }
 
 }  // namespace caffe
diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp
index 890f31bf..f5818f6f 100644
--- a/include/caffe/util/benchmark.hpp
+++ b/include/caffe/util/benchmark.hpp
@@ -8,43 +8,50 @@
 namespace caffe {
 
 class Timer {
- public:
-  Timer();
-  virtual ~Timer();
-  virtual void Start();
-  virtual void Stop();
-  virtual float MilliSeconds();
-  virtual float MicroSeconds();
-  virtual float Seconds();
-
-  inline bool initted() { return initted_; }
-  inline bool running() { return running_; }
-  inline bool has_run_at_least_once() { return has_run_at_least_once_; }
-
- protected:
-  void Init();
-
-  bool initted_;
-  bool running_;
-  bool has_run_at_least_once_;
-#ifndef CPU_ONLY
-  //cudaEvent_t start_gpu_;
-  //cudaEvent_t stop_gpu_;
+	public:
+		Timer();
+		virtual ~Timer();
+		virtual void Start();
+		virtual void Stop();
+		virtual float MilliSeconds();
+		virtual float MicroSeconds();
+		virtual float Seconds();
+
+		inline bool initted() {
+			return initted_;
+		}
+		inline bool running() {
+			return running_;
+		}
+		inline bool has_run_at_least_once() {
+			return has_run_at_least_once_;
+		}
+
+	protected:
+		void Init();
+
+		bool initted_;
+		bool running_;
+		bool has_run_at_least_once_;
+		#ifndef CPU_ONLY
+		//cudaEvent_t start_gpu_;
+		//cudaEvent_t stop_gpu_;
 #endif
-  boost::posix_time::ptime start_cpu_;
-  boost::posix_time::ptime stop_cpu_;
-  float elapsed_milliseconds_;
-  float elapsed_microseconds_;
+		boost::posix_time::ptime start_cpu_;
+		boost::posix_time::ptime stop_cpu_;
+		float elapsed_milliseconds_;
+		float elapsed_microseconds_;
 };
 
-class CPUTimer : public Timer {
- public:
-  explicit CPUTimer();
-  virtual ~CPUTimer() {}
-  virtual void Start();
-  virtual void Stop();
-  virtual float MilliSeconds();
-  virtual float MicroSeconds();
+class CPUTimer: public Timer {
+	public:
+		explicit CPUTimer();
+		virtual ~CPUTimer() {
+		}
+		virtual void Start();
+		virtual void Stop();
+		virtual float MilliSeconds();
+		virtual float MicroSeconds();
 };
 
 }  // namespace caffe
diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp
index b531dd5f..4acca743 100644
--- a/include/caffe/util/cudnn.hpp
+++ b/include/caffe/util/cudnn.hpp
@@ -15,116 +15,116 @@
   } while (0)
 
 inline const char* cudnnGetErrorString(cudnnStatus_t status) {
-  switch (status) {
-    case CUDNN_STATUS_SUCCESS:
-      return "CUDNN_STATUS_SUCCESS";
-    case CUDNN_STATUS_NOT_INITIALIZED:
-      return "CUDNN_STATUS_NOT_INITIALIZED";
-    case CUDNN_STATUS_ALLOC_FAILED:
-      return "CUDNN_STATUS_ALLOC_FAILED";
-    case CUDNN_STATUS_BAD_PARAM:
-      return "CUDNN_STATUS_BAD_PARAM";
-    case CUDNN_STATUS_INTERNAL_ERROR:
-      return "CUDNN_STATUS_INTERNAL_ERROR";
-    case CUDNN_STATUS_INVALID_VALUE:
-      return "CUDNN_STATUS_INVALID_VALUE";
-    case CUDNN_STATUS_ARCH_MISMATCH:
-      return "CUDNN_STATUS_ARCH_MISMATCH";
-    case CUDNN_STATUS_MAPPING_ERROR:
-      return "CUDNN_STATUS_MAPPING_ERROR";
-    case CUDNN_STATUS_EXECUTION_FAILED:
-      return "CUDNN_STATUS_EXECUTION_FAILED";
-    case CUDNN_STATUS_NOT_SUPPORTED:
-      return "CUDNN_STATUS_NOT_SUPPORTED";
-    case CUDNN_STATUS_LICENSE_ERROR:
-      return "CUDNN_STATUS_LICENSE_ERROR";
-  }
-  return "Unknown cudnn status";
+	switch (status) {
+		case CUDNN_STATUS_SUCCESS:
+		return "CUDNN_STATUS_SUCCESS";
+		case CUDNN_STATUS_NOT_INITIALIZED:
+		return "CUDNN_STATUS_NOT_INITIALIZED";
+		case CUDNN_STATUS_ALLOC_FAILED:
+		return "CUDNN_STATUS_ALLOC_FAILED";
+		case CUDNN_STATUS_BAD_PARAM:
+		return "CUDNN_STATUS_BAD_PARAM";
+		case CUDNN_STATUS_INTERNAL_ERROR:
+		return "CUDNN_STATUS_INTERNAL_ERROR";
+		case CUDNN_STATUS_INVALID_VALUE:
+		return "CUDNN_STATUS_INVALID_VALUE";
+		case CUDNN_STATUS_ARCH_MISMATCH:
+		return "CUDNN_STATUS_ARCH_MISMATCH";
+		case CUDNN_STATUS_MAPPING_ERROR:
+		return "CUDNN_STATUS_MAPPING_ERROR";
+		case CUDNN_STATUS_EXECUTION_FAILED:
+		return "CUDNN_STATUS_EXECUTION_FAILED";
+		case CUDNN_STATUS_NOT_SUPPORTED:
+		return "CUDNN_STATUS_NOT_SUPPORTED";
+		case CUDNN_STATUS_LICENSE_ERROR:
+		return "CUDNN_STATUS_LICENSE_ERROR";
+	}
+	return "Unknown cudnn status";
 }
 
 namespace caffe {
 
-namespace cudnn {
-
-template <typename Dtype> class dataType;
-template<> class dataType<float>  {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
-  static float oneval, zeroval;
-  static const void *one, *zero;
-};
-template<> class dataType<double> {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
-  static double oneval, zeroval;
-  static const void *one, *zero;
-};
-
-template <typename Dtype>
-inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) {
-  CUDNN_CHECK(cudnnCreateTensorDescriptor(desc));
-}
-
-template <typename Dtype>
-inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
-    int n, int c, int h, int w,
-    int stride_n, int stride_c, int stride_h, int stride_w) {
-  CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType<Dtype>::type,
-        n, c, h, w, stride_n, stride_c, stride_h, stride_w));
-}
-
-template <typename Dtype>
-inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
-    int n, int c, int h, int w) {
-  const int stride_w = 1;
-  const int stride_h = w * stride_w;
-  const int stride_c = h * stride_h;
-  const int stride_n = c * stride_c;
-  setTensor4dDesc<Dtype>(desc, n, c, h, w,
-                         stride_n, stride_c, stride_h, stride_w);
-}
-
-template <typename Dtype>
-inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
-    int n, int c, int h, int w) {
-  CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
-  CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
-      n, c, h, w));
-}
-
-template <typename Dtype>
-inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) {
-  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv));
-}
-
-template <typename Dtype>
-inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv,
-    cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter,
-    int pad_h, int pad_w, int stride_h, int stride_w) {
-  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
-      pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
-}
-
-template <typename Dtype>
-inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
-    PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode,
-    int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) {
-  switch (poolmethod) {
-  case PoolingParameter_PoolMethod_MAX:
-    *mode = CUDNN_POOLING_MAX;
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
-  CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
-  CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w,
-        pad_h, pad_w, stride_h, stride_w));
-}
-
-}  // namespace cudnn
+	namespace cudnn {
+
+		template <typename Dtype> class dataType;
+		template<> class dataType<float> {
+			public:
+			static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
+			static float oneval, zeroval;
+			static const void *one, *zero;
+		};
+		template<> class dataType<double> {
+			public:
+			static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
+			static double oneval, zeroval;
+			static const void *one, *zero;
+		};
+
+		template <typename Dtype>
+		inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) {
+			CUDNN_CHECK(cudnnCreateTensorDescriptor(desc));
+		}
+
+		template <typename Dtype>
+		inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
+			int n, int c, int h, int w,
+			int stride_n, int stride_c, int stride_h, int stride_w) {
+			CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType<Dtype>::type,
+					n, c, h, w, stride_n, stride_c, stride_h, stride_w));
+		}
+
+		template <typename Dtype>
+		inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
+			int n, int c, int h, int w) {
+			const int stride_w = 1;
+			const int stride_h = w * stride_w;
+			const int stride_c = h * stride_h;
+			const int stride_n = c * stride_c;
+			setTensor4dDesc<Dtype>(desc, n, c, h, w,
+				stride_n, stride_c, stride_h, stride_w);
+		}
+
+		template <typename Dtype>
+		inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
+			int n, int c, int h, int w) {
+			CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
+			CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
+					n, c, h, w));
+		}
+
+		template <typename Dtype>
+		inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) {
+			CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv));
+		}
+
+		template <typename Dtype>
+		inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv,
+			cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter,
+			int pad_h, int pad_w, int stride_h, int stride_w) {
+			CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
+					pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
+		}
+
+		template <typename Dtype>
+		inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
+			PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode,
+			int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) {
+			switch (poolmethod) {
+				case PoolingParameter_PoolMethod_MAX:
+				*mode = CUDNN_POOLING_MAX;
+				break;
+				case PoolingParameter_PoolMethod_AVE:
+				*mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+				break;
+				default:
+				LOG(FATAL) << "Unknown pooling method.";
+			}
+			CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
+			CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w,
+					pad_h, pad_w, stride_h, stride_w));
+		}
+
+	}  // namespace cudnn
 
 }  // namespace caffe
 
diff --git a/include/caffe/util/db.hpp b/include/caffe/util/db.hpp
index 59ec3d39..a65e3acf 100644
--- a/include/caffe/util/db.hpp
+++ b/include/caffe/util/db.hpp
@@ -6,43 +6,52 @@
 #include "caffe/common.hpp"
 #include "caffe/proto/caffe.pb.h"
 
-namespace caffe { namespace db {
+namespace caffe {
+namespace db {
 
-enum Mode { READ, WRITE, NEW };
+enum Mode {
+	READ, WRITE, NEW
+};
 
 class Cursor {
- public:
-  Cursor() { }
-  virtual ~Cursor() { }
-  virtual void SeekToFirst() = 0;
-  virtual void Next() = 0;
-  virtual string key() = 0;
-  virtual string value() = 0;
-  virtual bool valid() = 0;
-
-  DISABLE_COPY_AND_ASSIGN(Cursor);
+	public:
+		Cursor() {
+		}
+		virtual ~Cursor() {
+		}
+		virtual void SeekToFirst() = 0;
+		virtual void Next() = 0;
+		virtual string key() = 0;
+		virtual string value() = 0;
+		virtual bool valid() = 0;
+
+		DISABLE_COPY_AND_ASSIGN (Cursor);
 };
 
 class Transaction {
- public:
-  Transaction() { }
-  virtual ~Transaction() { }
-  virtual void Put(const string& key, const string& value) = 0;
-  virtual void Commit() = 0;
-
-  DISABLE_COPY_AND_ASSIGN(Transaction);
+	public:
+		Transaction() {
+		}
+		virtual ~Transaction() {
+		}
+		virtual void Put(const string& key, const string& value) = 0;
+		virtual void Commit() = 0;
+
+		DISABLE_COPY_AND_ASSIGN (Transaction);
 };
 
 class DB {
- public:
-  DB() { }
-  virtual ~DB() { }
-  virtual void Open(const string& source, Mode mode) = 0;
-  virtual void Close() = 0;
-  virtual Cursor* NewCursor() = 0;
-  virtual Transaction* NewTransaction() = 0;
-
-  DISABLE_COPY_AND_ASSIGN(DB);
+	public:
+		DB() {
+		}
+		virtual ~DB() {
+		}
+		virtual void Open(const string& source, Mode mode) = 0;
+		virtual void Close() = 0;
+		virtual Cursor* NewCursor() = 0;
+		virtual Transaction* NewTransaction() = 0;
+
+		DISABLE_COPY_AND_ASSIGN (DB);
 };
 
 DB* GetDB(DataParameter::DB backend);
diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp
index 10623554..d3716de7 100644
--- a/include/caffe/util/db_leveldb.hpp
+++ b/include/caffe/util/db_leveldb.hpp
@@ -8,65 +8,86 @@
 
 #include "caffe/util/db.hpp"
 
-namespace caffe { namespace db {
+namespace caffe {
+namespace db {
 
-class LevelDBCursor : public Cursor {
- public:
-  explicit LevelDBCursor(leveldb::Iterator* iter)
-    : iter_(iter) { SeekToFirst(); }
-  ~LevelDBCursor() { delete iter_; }
-  virtual void SeekToFirst() { iter_->SeekToFirst(); }
-  virtual void Next() { iter_->Next(); }
-  virtual string key() { return iter_->key().ToString(); }
-  virtual string value() { return iter_->value().ToString(); }
-  virtual bool valid() { return iter_->Valid(); }
+class LevelDBCursor: public Cursor {
+	public:
+		explicit LevelDBCursor(leveldb::Iterator* iter)
+			: iter_(iter) {
+			SeekToFirst();
+		}
+		~LevelDBCursor() {
+			delete iter_;
+		}
+		virtual void SeekToFirst() {
+			iter_->SeekToFirst();
+		}
+		virtual void Next() {
+			iter_->Next();
+		}
+		virtual string key() {
+			return iter_->key().ToString();
+		}
+		virtual string value() {
+			return iter_->value().ToString();
+		}
+		virtual bool valid() {
+			return iter_->Valid();
+		}
 
- private:
-  leveldb::Iterator* iter_;
+	private:
+		leveldb::Iterator* iter_;
 };
 
-class LevelDBTransaction : public Transaction {
- public:
-  explicit LevelDBTransaction(leveldb::DB* db) : db_(db) { CHECK_NOTNULL(db_); }
-  virtual void Put(const string& key, const string& value) {
-    batch_.Put(key, value);
-  }
-  virtual void Commit() {
-    leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_);
-    CHECK(status.ok()) << "Failed to write batch to leveldb "
-                       << std::endl << status.ToString();
-  }
+class LevelDBTransaction: public Transaction {
+	public:
+		explicit LevelDBTransaction(leveldb::DB* db)
+			: db_(db) {
+			CHECK_NOTNULL(db_);
+		}
+		virtual void Put(const string& key, const string& value) {
+			batch_.Put(key, value);
+		}
+		virtual void Commit() {
+			leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_);
+			CHECK(status.ok()) << "Failed to write batch to leveldb "
+				<< std::endl << status.ToString();
+		}
 
- private:
-  leveldb::DB* db_;
-  leveldb::WriteBatch batch_;
+	private:
+		leveldb::DB* db_;
+		leveldb::WriteBatch batch_;
 
-  DISABLE_COPY_AND_ASSIGN(LevelDBTransaction);
+		DISABLE_COPY_AND_ASSIGN (LevelDBTransaction);
 };
 
-class LevelDB : public DB {
- public:
-  LevelDB() : db_(NULL) { }
-  virtual ~LevelDB() { Close(); }
-  virtual void Open(const string& source, Mode mode);
-  virtual void Close() {
-    if (db_ != NULL) {
-      delete db_;
-      db_ = NULL;
-    }
-  }
-  virtual LevelDBCursor* NewCursor() {
-    return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
-  }
-  virtual LevelDBTransaction* NewTransaction() {
-    return new LevelDBTransaction(db_);
-  }
+class LevelDB: public DB {
+	public:
+		LevelDB()
+			: db_(NULL) {
+		}
+		virtual ~LevelDB() {
+			Close();
+		}
+		virtual void Open(const string& source, Mode mode);
+		virtual void Close() {
+			if (db_ != NULL) {
+				delete db_;
+				db_ = NULL;
+			}
+		}
+		virtual LevelDBCursor* NewCursor() {
+			return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
+		}
+		virtual LevelDBTransaction* NewTransaction() {
+			return new LevelDBTransaction(db_);
+		}
 
- private:
-  leveldb::DB* db_;
+	private:
+		leveldb::DB* db_;
 };
 
-
 }  // namespace db
 }  // namespace caffe
 
diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp
index cc7c90af..06424c94 100644
--- a/include/caffe/util/db_lmdb.hpp
+++ b/include/caffe/util/db_lmdb.hpp
@@ -7,82 +7,97 @@
 
 #include "caffe/util/db.hpp"
 
-namespace caffe { namespace db {
+namespace caffe {
+namespace db {
 
 inline void MDB_CHECK(int mdb_status) {
-  CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
+	CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
 }
 
-class LMDBCursor : public Cursor {
- public:
-  explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor)
-    : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) {
-    SeekToFirst();
-  }
-  virtual ~LMDBCursor() {
-    mdb_cursor_close(mdb_cursor_);
-    mdb_txn_abort(mdb_txn_);
-  }
-  virtual void SeekToFirst() { Seek(MDB_FIRST); }
-  virtual void Next() { Seek(MDB_NEXT); }
-  virtual string key() {
-    return string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
-  }
-  virtual string value() {
-    return string(static_cast<const char*>(mdb_value_.mv_data),
-        mdb_value_.mv_size);
-  }
-  virtual bool valid() { return valid_; }
+class LMDBCursor: public Cursor {
+	public:
+		explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor)
+			: mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) {
+			SeekToFirst();
+		}
+		virtual ~LMDBCursor() {
+			mdb_cursor_close(mdb_cursor_);
+			mdb_txn_abort(mdb_txn_);
+		}
+		virtual void SeekToFirst() {
+			Seek (MDB_FIRST);
+		}
+		virtual void Next() {
+			Seek (MDB_NEXT);
+		}
+		virtual string key() {
+			return string(static_cast<const char*>(mdb_key_.mv_data),
+				mdb_key_.mv_size);
+		}
+		virtual string value() {
+			return string(static_cast<const char*>(mdb_value_.mv_data),
+				mdb_value_.mv_size);
+		}
+		virtual bool valid() {
+			return valid_;
+		}
 
- private:
-  void Seek(MDB_cursor_op op) {
-    int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
-    if (mdb_status == MDB_NOTFOUND) {
-      valid_ = false;
-    } else {
-      MDB_CHECK(mdb_status);
-      valid_ = true;
-    }
-  }
+	private:
+		void Seek(MDB_cursor_op op) {
+			int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
+			if (mdb_status == MDB_NOTFOUND) {
+				valid_ = false;
+			} else {
+				MDB_CHECK(mdb_status);
+				valid_ = true;
+			}
+		}
 
-  MDB_txn* mdb_txn_;
-  MDB_cursor* mdb_cursor_;
-  MDB_val mdb_key_, mdb_value_;
-  bool valid_;
+		MDB_txn* mdb_txn_;
+		MDB_cursor* mdb_cursor_;
+		MDB_val mdb_key_, mdb_value_;
+		bool valid_;
 };
 
-class LMDBTransaction : public Transaction {
- public:
-  explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn)
-    : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { }
-  virtual void Put(const string& key, const string& value);
-  virtual void Commit() { MDB_CHECK(mdb_txn_commit(mdb_txn_)); }
+class LMDBTransaction: public Transaction {
+	public:
+		explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn)
+			: mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) {
+		}
+		virtual void Put(const string& key, const string& value);
+		virtual void Commit() {
+			MDB_CHECK(mdb_txn_commit(mdb_txn_));
+		}
 
- private:
-  MDB_dbi* mdb_dbi_;
-  MDB_txn* mdb_txn_;
+	private:
+		MDB_dbi* mdb_dbi_;
+		MDB_txn* mdb_txn_;
 
-  DISABLE_COPY_AND_ASSIGN(LMDBTransaction);
+		DISABLE_COPY_AND_ASSIGN (LMDBTransaction);
 };
 
-class LMDB : public DB {
- public:
-  LMDB() : mdb_env_(NULL) { }
-  virtual ~LMDB() { Close(); }
-  virtual void Open(const string& source, Mode mode);
-  virtual void Close() {
-    if (mdb_env_ != NULL) {
-      mdb_dbi_close(mdb_env_, mdb_dbi_);
-      mdb_env_close(mdb_env_);
-      mdb_env_ = NULL;
-    }
-  }
-  virtual LMDBCursor* NewCursor();
-  virtual LMDBTransaction* NewTransaction();
+class LMDB: public DB {
+	public:
+		LMDB()
+			: mdb_env_(NULL) {
+		}
+		virtual ~LMDB() {
+			Close();
+		}
+		virtual void Open(const string& source, Mode mode);
+		virtual void Close() {
+			if (mdb_env_ != NULL) {
+				mdb_dbi_close(mdb_env_, mdb_dbi_);
+				mdb_env_close(mdb_env_);
+				mdb_env_ = NULL;
+			}
+		}
+		virtual LMDBCursor* NewCursor();
+		virtual LMDBTransaction* NewTransaction();
 
- private:
-  MDB_env* mdb_env_;
-  MDB_dbi mdb_dbi_;
+	private:
+		MDB_env* mdb_env_;
+		MDB_dbi mdb_dbi_;
 };
 
 }  // namespace db
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index ba9c4aca..fda13567 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -29,79 +29,84 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_col);
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, Dtype* data_col);
 
-template <typename Dtype>
+template<typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_im);
+	const int height, const int width, const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, Dtype* data_im);
 
-template <typename Dtype>
+template<typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset,
-    const int height, const int width, const int channels,
-    const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    Dtype* data_im, const int img_offset);
+	const int height, const int width, const int channels,
+	const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	Dtype* data_im, const int img_offset);
 
-template <typename Dtype>
+template<typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    Dtype* data_col, const int col_offset);
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	Dtype* data_col, const int col_offset);
 
-template <typename Dtype>
+template<typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_col);
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, Dtype* data_col);
 
-template <typename Dtype>
+template<typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_im);
-
-template <typename Dtype>
-void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, const int col_offset);
-
-template <typename Dtype>
-void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, const int col_offset);
-
-template <typename Dtype>
-void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, const int col_offset, int optnum);
-
-template <typename Dtype>
-void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, Dtype* data_im, const int img_offset);
-
-template <typename Dtype>
-void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im, const int img_offset, int optnum);
-
-template <typename Dtype>
+	const int height, const int width, const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, Dtype* data_im);
+
+template<typename Dtype>
+void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_col, const int col_offset);
+
+template<typename Dtype>
+void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_col, const int col_offset);
+
+template<typename Dtype>
+void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_col, const int col_offset, int optnum);
+
+template<typename Dtype>
+void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
+	const int channels,
+	const int height, const int width, const int psize, const int pad,
+	const int stride, Dtype* data_im, const int img_offset);
+
+template<typename Dtype>
+void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_im, const int img_offset, int optnum);
+
+template<typename Dtype>
 void col2im_gpu_ocl(cl_mem data_col, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im, cl_kernel Kernel);
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_im, cl_kernel Kernel);
 
-template <typename Dtype>
+template<typename Dtype>
 void im2col_gpu_ocl(cl_mem data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, cl_kernel Kernel);
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_col, cl_kernel Kernel);
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_IM2COL_HPP_
diff --git a/include/caffe/util/insert_splits.hpp b/include/caffe/util/insert_splits.hpp
index 446abb81..4c0d0106 100644
--- a/include/caffe/util/insert_splits.hpp
+++ b/include/caffe/util/insert_splits.hpp
@@ -12,14 +12,14 @@ namespace caffe {
 void InsertSplits(const NetParameter& param, NetParameter* param_split);
 
 void ConfigureSplitLayer(const string& layer_name, const string& blob_name,
-    const int blob_idx, const int split_count, const float loss_weight,
-    LayerParameter* split_layer_param);
+	const int blob_idx, const int split_count, const float loss_weight,
+	LayerParameter* split_layer_param);
 
 string SplitLayerName(const string& layer_name, const string& blob_name,
-    const int blob_idx);
+	const int blob_idx);
 
 string SplitBlobName(const string& layer_name, const string& blob_name,
-    const int blob_idx, const int split_idx);
+	const int blob_idx, const int split_idx);
 
 }  // namespace caffe
 
diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp
index 3a62c3c9..faef67e3 100644
--- a/include/caffe/util/io.hpp
+++ b/include/caffe/util/io.hpp
@@ -19,119 +19,118 @@ namespace caffe {
 using ::google::protobuf::Message;
 
 inline void MakeTempFilename(string* temp_filename) {
-  temp_filename->clear();
-  *temp_filename = "/tmp/caffe_test.XXXXXX";
-  char* temp_filename_cstr = new char[temp_filename->size() + 1];
-  // NOLINT_NEXT_LINE(runtime/printf)
-  strcpy(temp_filename_cstr, temp_filename->c_str());
-  int fd = mkstemp(temp_filename_cstr);
-  CHECK_GE(fd, 0) << "Failed to open a temporary file at: " << *temp_filename;
-  close(fd);
-  *temp_filename = temp_filename_cstr;
-  delete[] temp_filename_cstr;
+	temp_filename->clear();
+	*temp_filename = "/tmp/caffe_test.XXXXXX";
+	char* temp_filename_cstr = new char[temp_filename->size() + 1];
+	// NOLINT_NEXT_LINE(runtime/printf)
+	strcpy(temp_filename_cstr, temp_filename->c_str());
+	int fd = mkstemp(temp_filename_cstr);
+	CHECK_GE(fd, 0) << "Failed to open a temporary file at: " << *temp_filename;
+	close(fd);
+	*temp_filename = temp_filename_cstr;
+	delete[] temp_filename_cstr;
 }
 
 inline void MakeTempDir(string* temp_dirname) {
-  temp_dirname->clear();
-  *temp_dirname = "/tmp/caffe_test.XXXXXX";
-  char* temp_dirname_cstr = new char[temp_dirname->size() + 1];
-  // NOLINT_NEXT_LINE(runtime/printf)
-  strcpy(temp_dirname_cstr, temp_dirname->c_str());
-  char* mkdtemp_result = mkdtemp(temp_dirname_cstr);
-  CHECK(mkdtemp_result != NULL)
-      << "Failed to create a temporary directory at: " << *temp_dirname;
-  *temp_dirname = temp_dirname_cstr;
-  delete[] temp_dirname_cstr;
+	temp_dirname->clear();
+	*temp_dirname = "/tmp/caffe_test.XXXXXX";
+	char* temp_dirname_cstr = new char[temp_dirname->size() + 1];
+	// NOLINT_NEXT_LINE(runtime/printf)
+	strcpy(temp_dirname_cstr, temp_dirname->c_str());
+	char* mkdtemp_result = mkdtemp(temp_dirname_cstr);
+	CHECK(mkdtemp_result != NULL)
+		<< "Failed to create a temporary directory at: " << *temp_dirname;
+	*temp_dirname = temp_dirname_cstr;
+	delete[] temp_dirname_cstr;
 }
 
 bool ReadProtoFromTextFile(const char* filename, Message* proto);
 
 inline bool ReadProtoFromTextFile(const string& filename, Message* proto) {
-  return ReadProtoFromTextFile(filename.c_str(), proto);
+	return ReadProtoFromTextFile(filename.c_str(), proto);
 }
 
 inline void ReadProtoFromTextFileOrDie(const char* filename, Message* proto) {
-  CHECK(ReadProtoFromTextFile(filename, proto));
+	CHECK(ReadProtoFromTextFile(filename, proto));
 }
 
 inline void ReadProtoFromTextFileOrDie(const string& filename, Message* proto) {
-  ReadProtoFromTextFileOrDie(filename.c_str(), proto);
+	ReadProtoFromTextFileOrDie(filename.c_str(), proto);
 }
 
 void WriteProtoToTextFile(const Message& proto, const char* filename);
 inline void WriteProtoToTextFile(const Message& proto, const string& filename) {
-  WriteProtoToTextFile(proto, filename.c_str());
+	WriteProtoToTextFile(proto, filename.c_str());
 }
 
 bool ReadProtoFromBinaryFile(const char* filename, Message* proto);
 
 inline bool ReadProtoFromBinaryFile(const string& filename, Message* proto) {
-  return ReadProtoFromBinaryFile(filename.c_str(), proto);
+	return ReadProtoFromBinaryFile(filename.c_str(), proto);
 }
 
 inline void ReadProtoFromBinaryFileOrDie(const char* filename, Message* proto) {
-  CHECK(ReadProtoFromBinaryFile(filename, proto));
+	CHECK(ReadProtoFromBinaryFile(filename, proto));
 }
 
 inline void ReadProtoFromBinaryFileOrDie(const string& filename,
-                                         Message* proto) {
-  ReadProtoFromBinaryFileOrDie(filename.c_str(), proto);
+	Message* proto) {
+	ReadProtoFromBinaryFileOrDie(filename.c_str(), proto);
 }
 
-
 void WriteProtoToBinaryFile(const Message& proto, const char* filename);
 inline void WriteProtoToBinaryFile(
-    const Message& proto, const string& filename) {
-  WriteProtoToBinaryFile(proto, filename.c_str());
+	const Message& proto, const string& filename) {
+	WriteProtoToBinaryFile(proto, filename.c_str());
 }
 
 bool ReadFileToDatum(const string& filename, const int label, Datum* datum);
 
 inline bool ReadFileToDatum(const string& filename, Datum* datum) {
-  return ReadFileToDatum(filename, -1, datum);
+	return ReadFileToDatum(filename, -1, datum);
 }
 
 bool ReadImageToDatum(const string& filename, const int label,
-    const int height, const int width, const bool is_color,
-    const std::string & encoding, Datum* datum);
+	const int height, const int width, const bool is_color,
+	const std::string & encoding, Datum* datum);
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-    const int height, const int width, const bool is_color, Datum* datum) {
-  return ReadImageToDatum(filename, label, height, width, is_color,
-                          "", datum);
+	const int height, const int width, const bool is_color, Datum* datum) {
+	return ReadImageToDatum(filename, label, height, width, is_color,
+		"", datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-    const int height, const int width, Datum* datum) {
-  return ReadImageToDatum(filename, label, height, width, true, datum);
+	const int height, const int width, Datum* datum) {
+	return ReadImageToDatum(filename, label, height, width, true, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-    const bool is_color, Datum* datum) {
-  return ReadImageToDatum(filename, label, 0, 0, is_color, datum);
+	const bool is_color, Datum* datum) {
+	return ReadImageToDatum(filename, label, 0, 0, is_color, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-    Datum* datum) {
-  return ReadImageToDatum(filename, label, 0, 0, true, datum);
+	Datum* datum) {
+	return ReadImageToDatum(filename, label, 0, 0, true, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-    const std::string & encoding, Datum* datum) {
-  return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum);
+	const std::string & encoding, Datum* datum) {
+	return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum);
 }
 
 bool DecodeDatumNative(Datum* datum);
 bool DecodeDatum(Datum* datum, bool is_color);
 
 cv::Mat ReadImageToCVMat(const string& filename,
-    const int height, const int width, const bool is_color);
+	const int height, const int width, const bool is_color);
 
 cv::Mat ReadImageToCVMat(const string& filename,
-    const int height, const int width);
+	const int height, const int width);
 
 cv::Mat ReadImageToCVMat(const string& filename,
-    const bool is_color);
+	const bool is_color);
 
 cv::Mat ReadImageToCVMat(const string& filename);
 
@@ -140,19 +139,19 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color);
 
 void CVMatToDatum(const cv::Mat& cv_img, Datum* datum);
 
-template <typename Dtype>
+template<typename Dtype>
 void hdf5_load_nd_dataset_helper(
-    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob);
+	hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+	Blob<Dtype>* blob);
 
-template <typename Dtype>
+template<typename Dtype>
 void hdf5_load_nd_dataset(
-    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob);
+	hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+	Blob<Dtype>* blob);
 
-template <typename Dtype>
+template<typename Dtype>
 void hdf5_save_nd_dataset(
-    const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob);
+	const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob);
 
 }  // namespace caffe
 
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index b32760aa..0a7fd67f 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -40,156 +40,157 @@ namespace caffe {
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
-template <typename Dtype>
+template<typename Dtype>
 void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-    Dtype* C);
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+	Dtype* C);
 
 // Decaf gpu gemm provides an interface that is almost the same as the cpu
 // gemm function - following the c convention and calling the fortran-order
 // gpu code under the hood.
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-    Dtype* C);
-
-template <typename Dtype>
-cl_event caffe_gpu_gemm( cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta,
-    Dtype* C, const int offC);
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+	Dtype* C);
+
+template<typename Dtype>
+cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
+	const int offB, const Dtype beta,
+	Dtype* C, const int offC);
 /*This is Yuan Gao's sgemm_ex*/
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-    Dtype* C, const int offset1, const int offset2, const int offset3);
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+	Dtype* C, const int offset1, const int offset2, const int offset3);
 
-
-template <typename Dtype>
+template<typename Dtype>
 cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, const int offB, const Dtype beta,
-    Dtype* C, const int offC);
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
+	const int offB, const Dtype beta,
+	Dtype* C, const int offC);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
-    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
-    Dtype* y);
+	const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+	Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda,
-    const Dtype * x, size_t offx, const Dtype beta, int incx,
-    Dtype* y, size_t offy, int incy);
+	const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda,
+	const Dtype * x, size_t offx, const Dtype beta, int incx,
+	Dtype* y, size_t offy, int incy);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
-    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
-    Dtype* y);
-
+	const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+	Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_axpy(const int N, const Dtype alpha, const Dtype* X,
-    Dtype* Y);
+	Dtype* Y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X,
-    Dtype* Y);
+	Dtype* Y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
-    const Dtype beta, Dtype* Y);
+	const Dtype beta, Dtype* Y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X,
-    const Dtype beta, Dtype* Y);
+	const Dtype beta, Dtype* Y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_copy(const int N, const Dtype *X, Dtype *Y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_set(const int N, const Dtype alpha, Dtype *X);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X);
 
 inline void caffe_memset(const size_t N, const int alpha, void* X) {
-  memset(X, alpha, N);  // NOLINT(caffe/alt_fn)
+	memset(X, alpha, N);  // NOLINT(caffe/alt_fn)
 }
 
 inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
 #ifndef CPU_ONLY
-  ocl_memset((int*)X, (alpha<<24)|(alpha<<16)|(alpha<<8)|alpha, N);
+	ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N);
 #else
-  NO_GPU;
+	NO_GPU;
 #endif
 }
 
 void caffe_gpu_memcpy(const size_t N, const void *X, void *Y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
-template <typename Dtype>
-void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, Dtype *X);
+template<typename Dtype>
+void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha,
+	Dtype *X);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_scal(const int N, const Dtype alpha, Dtype *X);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_sqr(const int N, const Dtype* a, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
 //CUDA version, need to be deleted
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
-void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, const Dtype* b, Dtype* y);
+template<typename Dtype>
+void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a,
+	const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
 //CUDA version, need to be deleted
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
-
 unsigned int caffe_rng_rand();
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype caffe_nextafter(const Dtype b);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
 
 // caffe_gpu_rng_uniform with two arguments generates integers in the range
@@ -201,54 +202,54 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r);
 // specification of curandGenerateUniform.  With a = 0, b = 1, just calls
 // curandGenerateUniform; with other limits will shift and scale the outputs
 // appropriately after calling curandGenerateUniform.
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
-                        Dtype* r);
+	Dtype* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
-                            Dtype* r);
+	Dtype* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, int* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_exp(const int n, const Dtype* a, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x,
-                                    const Dtype* y);
+	const Dtype* y);
 
 // Returns the sum of the absolute values of the elements of vector x
-template <typename Dtype>
+template<typename Dtype>
 Dtype caffe_cpu_asum(const int n, const Dtype* x);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
 
 // the branchless, type-safe version from
 // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c
 template<typename Dtype>
 inline char caffe_sign(Dtype val) {
-  return (Dtype(0) < val) - (val < Dtype(0));
+	return (Dtype(0) < val) - (val < Dtype(0));
 }
 
 // The following two macros are modifications of DEFINE_VSL_UNARY_FUNC
@@ -272,7 +273,6 @@ inline char caffe_sign(Dtype val) {
   template <> \
   void caffe_cpu_##name<double>(const int n, const double* x, double* y)
 
-
 #define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \
 template<typename Dtype> \
 void name##_kernel(const int n, const Dtype* x, Dtype* y) { \
@@ -301,53 +301,51 @@ void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y);
 
 DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_log(const int n, const Dtype* a, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_exp(const int n, const Dtype* a, Dtype* y);
 
-
-template <typename Dtype>
+template<typename Dtype>
 void caffe_abs(const int n, const Dtype* a, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_log(const int n, const Dtype* a, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx,
-    const Dtype* y, const int incy);
+	const Dtype* y, const int incy);
 }  // namespace caffe
 
-
 #endif  // CAFFE_UTIL_MATH_FUNCTIONS_H_
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
index 3355b665..e0d4d489 100644
--- a/include/caffe/util/mkl_alternate.hpp
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -81,16 +81,16 @@ DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
 // in standard blas. We will simply use a two-step (inefficient, of course) way
 // to mimic that.
 inline void cblas_saxpby(const int N, const float alpha, const float* X,
-                         const int incX, const float beta, float* Y,
-                         const int incY) {
-  cblas_sscal(N, beta, Y, incY);
-  cblas_saxpy(N, alpha, X, incX, Y, incY);
+	const int incX, const float beta, float* Y,
+	const int incY) {
+	cblas_sscal(N, beta, Y, incY);
+	cblas_saxpy(N, alpha, X, incX, Y, incY);
 }
 inline void cblas_daxpby(const int N, const double alpha, const double* X,
-                         const int incX, const double beta, double* Y,
-                         const int incY) {
-  cblas_dscal(N, beta, Y, incY);
-  cblas_daxpy(N, alpha, X, incX, Y, incY);
+	const int incX, const double beta, double* Y,
+	const int incY) {
+	cblas_dscal(N, beta, Y, incY);
+	cblas_daxpy(N, alpha, X, incX, Y, incY);
 }
 
 #endif  // USE_MKL
diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
index 2e56101e..1bd7c8d4 100644
--- a/include/caffe/util/ocl_util.hpp
+++ b/include/caffe/util/ocl_util.hpp
@@ -29,10 +29,11 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ocl_memset(Dtype* buffer, const Dtype value, const int count);
 
-void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count);
+void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
+	const int count);
 
 void eventCallback(cl_event event, cl_int event_status, void * user_data);
 }  // namespace caffe
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index a15b68ff..c4149789 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -31,237 +31,312 @@ namespace caffe {
 
 typedef unsigned int uint32_t;
 
-template <typename dtype> inline std::string get_dtype_suffix()
+template<typename dtype> inline std::string get_dtype_suffix()
 {
-    dtype x;
-    const char type = typeid(x).name()[0];
-    std::string suffix;
-    switch(type){
-        case 'i': suffix = "_int"; break;
-        case 'd': suffix = "_double"; break;
-        case 'f':
-        default: suffix = "_float";
-    }
-    return suffix;
+	dtype x;
+	const char type = typeid(x).name()[0];
+	std::string suffix;
+	switch (type) {
+		case 'i':
+			suffix = "_int";
+			break;
+		case 'd':
+			suffix = "_double";
+			break;
+		case 'f':
+			default:
+			suffix = "_float";
+	}
+	return suffix;
 }
 
-template <typename Dtype>
-void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num);
+template<typename Dtype>
+void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_,
+	const int M_, const int packing_num);
 
-template <typename Dtype>
+template<typename Dtype>
 void opttrans(const Dtype* data_im, const int im_offset, const int channels,
-    const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum);
+	const int height, const int width, Dtype* data_opt, const int opt_offset,
+	const int optnum);
 
-template <typename Dtype>
-void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data);
+template<typename Dtype>
+void get_max_gpu(cl_kernel Kernel, const int num, const int dim,
+	const Dtype* bottom_data, Dtype* scale_data);
 
-template <typename Dtype>
+template<typename Dtype>
 void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out);
 
-template <typename Dtype>
-void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* scale, Dtype* data);
+template<typename Dtype>
+void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim,
+	const Dtype* scale, Dtype* data);
 
-template <typename Dtype>
-Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* prob_data, const Dtype* label, cl_mem d_loss);
+template<typename Dtype>
+Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim,
+	const Dtype* prob_data, const Dtype* label, cl_mem d_loss);
 
-template <typename Dtype>
+template<typename Dtype>
 void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data);
 
-template <typename Dtype>
-void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, const Dtype* label);
-
-template <typename Dtype>
-void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data);
-
-template <typename Dtype>
-void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask);
-
-template <typename Dtype>
-void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff);
-
-template <typename Dtype>
-void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff);
-
-template <typename Dtype>
- void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff);
-template <typename Dtype>
+template<typename Dtype>
+void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data,
+	const Dtype* label);
+
+template<typename Dtype>
+void max_pool_fp_gpu(cl_kernel Kernel, const int count,
+	const Dtype* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	Dtype* top_data);
+
+template<typename Dtype>
+void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum,
+	const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
+	Dtype* top_mask);
+
+template<typename Dtype>
+void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
+	const int* const mask, const Dtype* const top_mask, const int num,
+	const int channels, const int height, const int width,
+	const int pooled_height, const int pooled_width, const int kernel_h,
+	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+	const int pad_w, Dtype* const bottom_diff);
+
+template<typename Dtype>
+void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
+	const int num, const int channels, const int height, const int width,
+	const int pooled_height, const int pooled_width, const int kernel_h,
+	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+	const int pad_w, Dtype* const bottom_diff);
+
+template<typename Dtype>
+void StoPoolBackward(const int nthreads, const Dtype* const rand_idx,
+	const Dtype* const top_diff, const int num, const int channels,
+	const int height, const int width, const int pooled_height,
+	const int pooled_width, const int kernel_h, const int kernel_w,
+	const int stride_h, const int stride_w, Dtype* const bottom_diff);
+template<typename Dtype>
 void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data);
 
-template <typename Dtype>
-void SigmoidBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff);
+template<typename Dtype>
+void SigmoidBackward(const int count, const Dtype* top_diff,
+	const Dtype* top_data, Dtype* bottom_diff);
 
-template <typename Dtype>
+template<typename Dtype>
 void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data);
 
-template <typename Dtype>
-void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff);
-
-template <typename Dtype>
-void ThresholdForward(const int count, const Dtype threshold, const Dtype* bottom_data, Dtype* top_data);
-
-template <typename Dtype>
-void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data);
-
-template <typename Dtype>
-void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data);
-
-template <typename Dtype>
-void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data);
-
-template <typename Dtype>
-void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data);
-
-template <typename Dtype>
-void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff );
-
-template <typename Dtype>
-void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff);
-
-
-template <typename Dtype>
-void PReLUForward(const int count, const int channels, const int dim, const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, const int div_factor);
-
-template <typename Dtype> 
-void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor);
-
-template <typename Dtype> 
-void PReLUParamBackward(const int count, const Dtype* top_diff, const int offset_out, const Dtype* bottom_data,const int offset_in, Dtype* bottom_diff);
-
-template <typename Dtype>
-void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope);
-
-template <typename Dtype>
-void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
-
-template <typename Dtype>
-void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y);
-
-template <typename Dtype>
-void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype *top_data);
-
-template <typename Dtype>
-void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff);
-
-template <typename Dtype>
-void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
-
-template <typename Dtype>
-void caffe_gpu_abs_ocl(const int N,  const Dtype* X, Dtype * Y );
-
-template <typename Dtype>
-void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y );
-
-template <typename Dtype>
+template<typename Dtype>
+void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
+	Dtype* bottom_diff);
+
+template<typename Dtype>
+void ThresholdForward(const int count, const Dtype threshold,
+	const Dtype* bottom_data, Dtype* top_data);
+
+template<typename Dtype>
+void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
+	const Dtype* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	const int pad_, Dtype* top_data);
+
+template<typename Dtype>
+void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
+	const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	const int pad_h_, const int pad_w_, Dtype* top_data);
+
+template<typename Dtype>
+void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	Dtype* idx_data, Dtype* top_data);
+
+template<typename Dtype>
+void StoPoolForwardTest(const int count, const Dtype* bottom_data,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	Dtype* top_data);
+
+template<typename Dtype>
+void max_pool_bp_gpu(cl_kernel Kernel, const int count,
+	const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_size_,
+	const int stride_, Dtype* bottom_diff);
+
+template<typename Dtype>
+void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
+	const int clnum, const int channels_, const int intheight_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_size_,
+	const int stride_, const int pad_, Dtype* bottom_diff);
+
+template<typename Dtype>
+void PReLUForward(const int count, const int channels, const int dim,
+	const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
+	const int div_factor);
+
+template<typename Dtype>
+void PReLUBackward(const int count, const int channels, const int dim,
+	const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
+	const Dtype* slope_data, const int div_factor);
+
+template<typename Dtype>
+void PReLUParamBackward(const int count, const Dtype* top_diff,
+	const int offset_out, const Dtype* bottom_data, const int offset_in,
+	Dtype* bottom_diff);
+
+template<typename Dtype>
+void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
+	Dtype negative_slope);
+
+template<typename Dtype>
+void ReLUBackward(const int count, const Dtype* top_diff,
+	const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
+
+template<typename Dtype>
+void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template<typename Dtype>
+void DropoutForward(const int count, const Dtype* bottom_data,
+	const int* MaskMem, const Dtype scale_, Dtype *top_data);
+
+template<typename Dtype>
+void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
+	const float threshold_, const Dtype scale_, Dtype* bottom_diff);
+
+template<typename Dtype>
+void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
+	Dtype threshold);
+
+template<typename Dtype>
+void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y);
+
+template<typename Dtype>
+void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y);
+
+template<typename Dtype>
 void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, const Dtype* data, Dtype* out);
+	const int spatial_dim, const Dtype* data, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_channel_subtract(const int count,
-    const int num, const int channels,
-    const int spatial_dim, const Dtype* channel_max, Dtype* data);
+	const int num, const int channels,
+	const int spatial_dim, const Dtype* channel_max, Dtype* data);
 
-template <typename Dtype>
-void kernel_powx(const int count, const Dtype* data, const Dtype alpha, Dtype* out);
+template<typename Dtype>
+void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
+	Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_log(const int count, const Dtype* data, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_add_scalar(const int count, const Dtype data, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_exp(const int count, const Dtype* data, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, const Dtype* data, Dtype* channel_sum);
+	const int spatial_dim, const Dtype* data, Dtype* channel_sum);
 
-template <typename Dtype>
-void kernel_channel_div(const int count, const int num, const int channels, const int spatial_dim, const Dtype* channel_sum, Dtype* data);
+template<typename Dtype>
+void kernel_channel_div(const int count, const int num, const int channels,
+	const int spatial_dim, const Dtype* channel_sum, Dtype* data);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
-    Dtype* channel_dot);
+	const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+	Dtype* channel_dot);
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxLossForwardGPU(const int nthreads,
-          const Dtype* prob_data, const Dtype* label, Dtype* loss,
-          const int num, const int dim, const int spatial_dim,
-          const bool has_ignore_label_, const int ignore_label_,
-          Dtype* counts);
+	const Dtype* prob_data, const Dtype* label, Dtype* loss,
+	const int num, const int dim, const int spatial_dim,
+	const bool has_ignore_label_, const int ignore_label_,
+	Dtype* counts);
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
-          const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
-          const int spatial_dim, const bool has_ignore_label_,
-          const int ignore_label_, Dtype* counts);
+	const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+	const int spatial_dim, const bool has_ignore_label_,
+	const int ignore_label_, Dtype* counts);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data);
 
-template <typename Dtype>
-void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype alpha_over_size,
-    const Dtype k, Dtype* const scale);
+template<typename Dtype>
+void LRNFillScale(cl_kernel LFSkernel, const int nthreads,
+	const Dtype* const in,
+	const int num, const int channels, const int height,
+	const int width, const int size, const Dtype alpha_over_size,
+	const Dtype k, Dtype* const scale);
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in,
-     Dtype* scale, Dtype negative_beta, Dtype* out);
+	Dtype* scale, Dtype negative_beta, Dtype* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
-    const Dtype* const bottom_data, const Dtype* const top_data,
-    const Dtype* const scale, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype negative_beta,
-    const Dtype cache_ratio, Dtype* const bottom_diff);
-template <typename Dtype>
-void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y);
-
-template <typename Dtype>
-void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y);
-
-template <typename Dtype>
-void  BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
-
-template <typename Dtype>
-void  BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff);
-
-template <typename Dtype>
-void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int  concat_size,
-        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data);
-
-template <typename Dtype>
+	const Dtype* const bottom_data, const Dtype* const top_data,
+	const Dtype* const scale, const Dtype* const top_diff,
+	const int num, const int channels, const int height,
+	const int width, const int size, const Dtype negative_beta,
+	const Dtype cache_ratio, Dtype* const bottom_diff);
+template<typename Dtype>
+void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);
+
+template<typename Dtype>
+void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template<typename Dtype>
+void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
+
+template<typename Dtype>
+void BNLLBackward(const int count, const Dtype* top_diff,
+	const Dtype* bottom_data, Dtype *bottom_diff);
+
+template<typename Dtype>
+void Concat(const int nthreads, const Dtype* in_data, const bool forward,
+	const int num_concats, const int concat_size,
+	const int top_concat_axis, const int bottom_concat_axis,
+	const int offset_concat_axis, Dtype *out_data);
+
+template<typename Dtype>
 void CLLBackward(const int count, const int channels,
-    const Dtype margin, const bool legacy_version, const Dtype alpha,
-    const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
-    Dtype *bottom_diff);
+	const Dtype margin, const bool legacy_version, const Dtype alpha,
+	const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
+	Dtype *bottom_diff);
 
-template <typename Dtype>
+template<typename Dtype>
 void MaxForward(const int nthreads, const Dtype* bottom_data_a,
-    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
-    int* mask);
+	const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+	int* mask);
 
-template <typename Dtype>
+template<typename Dtype>
 void MaxBackward(const int nthreads, const Dtype* top_diff,
-    const int blob_idx, const int* mask, Dtype* bottom_diff);
+	const int blob_idx, const int* mask, Dtype* bottom_diff);
 }
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
-  // namespace caffe
+// namespace caffe
diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp
index 8f1cf0d1..b59d9a67 100644
--- a/include/caffe/util/rng.hpp
+++ b/include/caffe/util/rng.hpp
@@ -14,29 +14,30 @@ namespace caffe {
 typedef boost::mt19937 rng_t;
 
 inline rng_t* caffe_rng() {
-  return static_cast<caffe::rng_t*>(Caffe::rng_stream().generator());
+	return static_cast<caffe::rng_t*>(Caffe::rng_stream().generator());
 }
 
 // Fisher–Yates algorithm
-template <class RandomAccessIterator, class RandomGenerator>
+template<class RandomAccessIterator, class RandomGenerator>
 inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end,
-                    RandomGenerator* gen) {
-  typedef typename std::iterator_traits<RandomAccessIterator>::difference_type
-      difference_type;
-  typedef typename boost::uniform_int<difference_type> dist_type;
-
-  difference_type length = std::distance(begin, end);
-  if (length <= 0) return;
-
-  for (difference_type i = length - 1; i > 0; --i) {
-    dist_type dist(0, i);
-    std::iter_swap(begin + i, begin + dist(*gen));
-  }
+	RandomGenerator* gen) {
+	typedef typename std::iterator_traits<RandomAccessIterator>::difference_type
+	difference_type;
+	typedef typename boost::uniform_int<difference_type> dist_type;
+
+	difference_type length = std::distance(begin, end);
+	if (length <= 0)
+		return;
+
+	for (difference_type i = length - 1; i > 0; --i) {
+		dist_type dist(0, i);
+		std::iter_swap(begin + i, begin + dist(*gen));
+	}
 }
 
-template <class RandomAccessIterator>
+template<class RandomAccessIterator>
 inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end) {
-  shuffle(begin, end, caffe_rng());
+	shuffle(begin, end, caffe_rng());
 }
 }  // namespace caffe
 
diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp
index c1f21a0d..d140e029 100644
--- a/include/caffe/util/upgrade_proto.hpp
+++ b/include/caffe/util/upgrade_proto.hpp
@@ -23,11 +23,11 @@ bool UpgradeV0Net(const NetParameter& v0_net_param, NetParameter* net_param);
 // taking its top blob as input.
 // Error if any of these above layers are not-conv layers.
 void UpgradeV0PaddingLayers(const NetParameter& param,
-                            NetParameter* param_upgraded_pad);
+	NetParameter* param_upgraded_pad);
 
 // Upgrade a single V0LayerConnection to the V1LayerParameter format.
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-                             V1LayerParameter* layer_param);
+	V1LayerParameter* layer_param);
 
 V1LayerParameter_LayerType UpgradeV0LayerType(const string& type);
 
@@ -46,7 +46,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param);
 bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param);
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-                             LayerParameter* layer_param);
+	LayerParameter* layer_param);
 
 const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type);
 
@@ -55,9 +55,9 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param);
 
 // Read parameters from a file into a NetParameter proto message.
 void ReadNetParamsFromTextFileOrDie(const string& param_file,
-                                    NetParameter* param);
+	NetParameter* param);
 void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-                                      NetParameter* param);
+	NetParameter* param);
 
 }  // namespace caffe
 
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 3ee5a779..9b718bd8 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -20,135 +20,150 @@ namespace caffe {
  * @brief Abstract base class that factors out the BLAS code common to
  *        ConvolutionLayer and DeconvolutionLayer.
  */
-template <typename Dtype>
-class BaseConvolutionLayer : public Layer<Dtype> {
- public:
-  explicit BaseConvolutionLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual  ~BaseConvolutionLayer();
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline int MinBottomBlobs() const { return 1; }
-  virtual inline int MinTopBlobs() const { return 1; }
-  virtual inline bool EqualNumBottomTopBlobs() const { return true; }
-
- protected:
-  // Helper functions that abstract away the column buffer and gemm arguments.
-  // The last argument in forward_cpu_gemm is so that we can skip the im2col if
-  // we just called weight_cpu_gemm with the same input.
-  void forward_cpu_gemm(const Dtype* input, const Dtype* weights,
-      Dtype* output, bool skip_im2col = false);
-  void forward_cpu_bias(Dtype* output, const Dtype* bias);
-  void backward_cpu_gemm(const Dtype* input, const Dtype* weights,
-      Dtype* output);
-  void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
-      weights);
-  void backward_cpu_bias(Dtype* bias, const Dtype* input);
-//opencl related setup
-  void ocl_setup();
+template<typename Dtype>
+class BaseConvolutionLayer: public Layer<Dtype> {
+	public:
+		explicit BaseConvolutionLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual ~BaseConvolutionLayer();
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline int MinBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+		virtual inline bool EqualNumBottomTopBlobs() const {
+			return true;
+		}
+
+	protected:
+		// Helper functions that abstract away the column buffer and gemm arguments.
+		// The last argument in forward_cpu_gemm is so that we can skip the im2col if
+		// we just called weight_cpu_gemm with the same input.
+		void forward_cpu_gemm(const Dtype* input, const Dtype* weights,
+			Dtype* output, bool skip_im2col = false);
+		void forward_cpu_bias(Dtype* output, const Dtype* bias);
+		void backward_cpu_gemm(const Dtype* input, const Dtype* weights,
+			Dtype* output);
+		void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
+			weights);
+		void backward_cpu_bias(Dtype* bias, const Dtype* input);
+		//opencl related setup
+		void ocl_setup();
 
 #ifndef CPU_ONLY
-  void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
-      Dtype* output, bool skip_im2col = false);
-  void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights,
-      Dtype* output, bool skip_im2col = false);
-  void forward_gpu_bias(Dtype* output, const Dtype* bias);
-  void forward_gpu_bias_opt(Dtype* output, const Dtype* bias);
-  void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
-      Dtype* col_output);
-  void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights,
-      Dtype* col_output);
-  void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
-      weights);
-  void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype*
-      weights);
-  void backward_gpu_bias(Dtype* bias, const Dtype* input);
-#endif
-
-  // reverse_dimensions should return true iff we are implementing deconv, so
-  // that conv helpers know which dimensions are which.
-  virtual bool reverse_dimensions() = 0;
-  // Compute height_out_ and width_out_ from other parameters.
-  virtual void compute_output_shape() = 0;
-
-  int kernel_h_, kernel_w_;
-  int stride_h_, stride_w_;
-  int num_;
-  int channels_;
-  int pad_h_, pad_w_;
-  int height_, width_;
-  int group_;
-  int num_output_;
-  int height_out_, width_out_;
-  bool bias_term_;
-  bool is_1x1_;
-
- private:
-  // wrap im2col/col2im so we don't have to remember the (long) argument lists
-  inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
-    im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
-  }
-  inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
-    col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
-  }
+		void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
+			Dtype* output, bool skip_im2col = false);
+		void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights,
+			Dtype* output, bool skip_im2col = false);
+		void forward_gpu_bias(Dtype* output, const Dtype* bias);
+		void forward_gpu_bias_opt(Dtype* output, const Dtype* bias);
+		void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
+			Dtype* col_output);
+		void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights,
+			Dtype* col_output);
+		void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
+			weights);
+		void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype*
+			weights);
+		void backward_gpu_bias(Dtype* bias, const Dtype* input);
+		#endif
+
+		// reverse_dimensions should return true iff we are implementing deconv, so
+		// that conv helpers know which dimensions are which.
+		virtual bool reverse_dimensions() = 0;
+		// Compute height_out_ and width_out_ from other parameters.
+		virtual void compute_output_shape() = 0;
+
+		int kernel_h_, kernel_w_;
+		int stride_h_, stride_w_;
+		int num_;
+		int channels_;
+		int pad_h_, pad_w_;
+		int height_, width_;
+		int group_;
+		int num_output_;
+		int height_out_, width_out_;
+		bool bias_term_;
+		bool is_1x1_;
+
+	private:
+		// wrap im2col/col2im so we don't have to remember the (long) argument lists
+		inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
+			im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
+				kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
+		}
+		inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
+			col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
+				kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
+		}
 #ifndef CPU_ONLY
-  inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
-     im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
-           kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff, 0);
-  }
-  inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
-    col2im_gpu(col_buff, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data, bottom_offset_);
-  }
- protected:
-  inline void conv_im2col_gpu_opt(const Dtype* data) {
-     im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, conv_in_width_,
-           kernel_w_, pad_w_, stride_h_,(Dtype*)transMem, 0, opt_num2);
-  }
-  inline void conv_col2im_gpu_opt( Dtype* data) {
-    col2im_gpu_opt((Dtype*)transMem, 0,  conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
-}
- private:
-  inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
-    transform_gpu((Dtype*)temp_buffer, top_data, top_offset_, N_, M_*opt_num2, opt_num2);
-}
- inline void conv_transpose_gpu(const Dtype* data){
-    opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
-}
-protected:
-  inline void gpu_memset(Dtype* data, Dtype value, int count) {
-    ocl_memset(data, value, count);
-}
+		inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
+			im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_,
+				conv_in_width_,
+				kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff,
+				0);
+		}
+		inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
+			col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_,
+				conv_in_width_,
+				kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data,
+				bottom_offset_);
+		}
+	protected:
+		inline void conv_im2col_gpu_opt(const Dtype* data) {
+			im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_,
+				conv_in_width_,
+				kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, opt_num2);
+		}
+		inline void conv_col2im_gpu_opt(Dtype* data) {
+			col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_,
+				conv_in_width_,
+				kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
+		}
+	private:
+		inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
+			transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_,
+				M_ * opt_num2, opt_num2);
+		}
+		inline void conv_transpose_gpu(const Dtype* data) {
+			opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
+				opt_num2);
+		}
+	protected:
+		inline void gpu_memset(Dtype* data, Dtype value, int count) {
+			ocl_memset(data, value, count);
+		}
 #endif
 
-private:
-  int conv_out_channels_;
-  int conv_in_channels_;
-  int conv_out_spatial_dim_;
-  int conv_in_height_;
-  int conv_in_width_;
-  int kernel_dim_;
+	private:
+		int conv_out_channels_;
+		int conv_in_channels_;
+		int conv_out_spatial_dim_;
+		int conv_in_height_;
+		int conv_in_width_;
+		int kernel_dim_;
 
-  Blob<Dtype> col_buffer_;
-  Blob<Dtype> bias_multiplier_;
+		Blob<Dtype> col_buffer_;
+		Blob<Dtype> bias_multiplier_;
 
 //opencl related data structures
-protected:
-  int opt_num2;
-  int M_, N_, K_;
-  int weight_offset_;
-  int col_offset_;
-  int output_offset_;
-  int top_offset_, top_offset_opt, bottom_offset_;
-public:
-  static cl_mem subTopMem, transMem;
-  static size_t subtop_mem_size, trans_mem_size;
+	protected:
+		int opt_num2;
+		int M_, N_, K_;
+		int weight_offset_;
+		int col_offset_;
+		int output_offset_;
+		int top_offset_, top_offset_opt, bottom_offset_;
+		public:
+		static cl_mem subTopMem, transMem;
+		static size_t subtop_mem_size, trans_mem_size;
 };
 
 /**
@@ -167,62 +182,67 @@ class BaseConvolutionLayer : public Layer<Dtype> {
  *   be filtered. col2im restores the output spatial structure by rolling up
  *   the output channel N' columns of the output matrix.
  */
-template <typename Dtype>
-class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
- public:
-  /**
-   * @param param provides ConvolutionParameter convolution_param,
-   *    with ConvolutionLayer options:
-   *  - num_output. The number of filters.
-   *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
-   *  kernel_size for square filters or kernel_h and kernel_w for rectangular
-   *  filters.
-   *  - stride / stride_h / stride_w (\b optional, default 1). The filter
-   *  stride, given by stride_size for equal dimensions or stride_h and stride_w
-   *  for different strides. By default the convolution is dense with stride 1.
-   *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
-   *  convolution, given by pad for equal dimensions or pad_h and pad_w for
-   *  different padding. Input padding is computed implicitly instead of
-   *  actually padding.
-   *  - group (\b optional, default 1). The number of filter groups. Group
-   *  convolution is a method for reducing parameterization by selectively
-   *  connecting input and output channels. The input and output channel dimensions must be divisible
-   *  by the number of groups. For group @f$ \geq 1 @f$, the
-   *  convolutional filters' input and output channels are separated s.t. each
-   *  group takes 1 / group of the input channels and makes 1 / group of the
-   *  output channels. Concretely 4 input channels, 8 output channels, and
-   *  2 groups separate input channels 1-2 and output channels 1-4 into the
-   *  first group and input channels 3-4 and output channels 5-8 into the second
-   *  group.
-   *  - bias_term (\b optional, default true). Whether to have a bias.
-   *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
-   *    kernels + stream parallelism) engines.
-   */
-  explicit ConvolutionLayer(const LayerParameter& param)
-      : BaseConvolutionLayer<Dtype>(param) {}
-
-  virtual inline const char* type() const { return "Convolution"; }
-
-protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual inline bool reverse_dimensions() { return false; }
-  virtual void compute_output_shape();
-  
-  virtual void Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+template<typename Dtype>
+class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
+	public:
+		/**
+		 * @param param provides ConvolutionParameter convolution_param,
+		 *    with ConvolutionLayer options:
+		 *  - num_output. The number of filters.
+		 *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
+		 *  kernel_size for square filters or kernel_h and kernel_w for rectangular
+		 *  filters.
+		 *  - stride / stride_h / stride_w (\b optional, default 1). The filter
+		 *  stride, given by stride_size for equal dimensions or stride_h and stride_w
+		 *  for different strides. By default the convolution is dense with stride 1.
+		 *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
+		 *  convolution, given by pad for equal dimensions or pad_h and pad_w for
+		 *  different padding. Input padding is computed implicitly instead of
+		 *  actually padding.
+		 *  - group (\b optional, default 1). The number of filter groups. Group
+		 *  convolution is a method for reducing parameterization by selectively
+		 *  connecting input and output channels. The input and output channel dimensions must be divisible
+		 *  by the number of groups. For group @f$ \geq 1 @f$, the
+		 *  convolutional filters' input and output channels are separated s.t. each
+		 *  group takes 1 / group of the input channels and makes 1 / group of the
+		 *  output channels. Concretely 4 input channels, 8 output channels, and
+		 *  2 groups separate input channels 1-2 and output channels 1-4 into the
+		 *  first group and input channels 3-4 and output channels 5-8 into the second
+		 *  group.
+		 *  - bias_term (\b optional, default true). Whether to have a bias.
+		 *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
+		 *    kernels + stream parallelism) engines.
+		 */
+		explicit ConvolutionLayer(const LayerParameter& param)
+			: BaseConvolutionLayer<Dtype>(param) {
+		}
+
+		virtual inline const char* type() const {
+			return "Convolution";
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual inline bool reverse_dimensions() {
+			return false;
+		}
+		virtual void compute_output_shape();
+
+		virtual void Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -239,25 +259,30 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
  *   padding is removed from the output rather than added to the input, and
  *   stride results in upsampling rather than downsampling).
  */
-template <typename Dtype>
-class DeconvolutionLayer : public BaseConvolutionLayer<Dtype> {
- public:
-  explicit DeconvolutionLayer(const LayerParameter& param)
-      : BaseConvolutionLayer<Dtype>(param) {}
-
-  virtual inline const char* type() const { return "Deconvolution"; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual inline bool reverse_dimensions() { return true; }
-  virtual void compute_output_shape();
+template<typename Dtype>
+class DeconvolutionLayer: public BaseConvolutionLayer<Dtype> {
+	public:
+		explicit DeconvolutionLayer(const LayerParameter& param)
+			: BaseConvolutionLayer<Dtype>(param) {
+		}
+
+		virtual inline const char* type() const {
+			return "Deconvolution";
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual inline bool reverse_dimensions() {
+			return true;
+		}
+		virtual void compute_output_shape();
 };
 
 #ifdef USE_CUDNN
@@ -274,34 +299,34 @@ class DeconvolutionLayer : public BaseConvolutionLayer<Dtype> {
  * input and filter regimes the CUDNN engine is faster than the CAFFE engine,
  * but for fully-convolutional models and large inputs the CAFFE engine can be
  * faster as long as it fits in memory.
-*/
+ */
 template <typename Dtype>
 class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
- public:
-  explicit CuDNNConvolutionLayer(const LayerParameter& param)
-      : ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual ~CuDNNConvolutionLayer();
-
- protected:
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool handles_setup_;
-  cudnnHandle_t* handle_;
-  cudaStream_t*  stream_;
-  vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
-  cudnnTensorDescriptor_t    bias_desc_;
-  cudnnFilterDescriptor_t      filter_desc_;
-  vector<cudnnConvolutionDescriptor_t> conv_descs_;
-  int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
-  size_t workspaceSizeInBytes;
-  void *workspace;
+	public:
+	explicit CuDNNConvolutionLayer(const LayerParameter& param)
+	: ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
+	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual ~CuDNNConvolutionLayer();
+
+	protected:
+	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+	bool handles_setup_;
+	cudnnHandle_t* handle_;
+	cudaStream_t* stream_;
+	vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
+	cudnnTensorDescriptor_t bias_desc_;
+	cudnnFilterDescriptor_t filter_desc_;
+	vector<cudnnConvolutionDescriptor_t> conv_descs_;
+	int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
+	size_t workspaceSizeInBytes;
+	void *workspace;
 };
 #endif
 
@@ -312,163 +337,183 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class Im2colLayer : public Layer<Dtype> {
- public:
-  explicit Im2colLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Im2col"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int kernel_h_, kernel_w_;
-  int stride_h_, stride_w_;
-  int channels_;
-  int height_, width_;
-  int pad_h_, pad_w_;
+template<typename Dtype>
+class Im2colLayer: public Layer<Dtype> {
+	public:
+		explicit Im2colLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Im2col";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int kernel_h_, kernel_w_;
+		int stride_h_, stride_w_;
+		int channels_;
+		int height_, width_;
+		int pad_h_, pad_w_;
 };
 
 // Forward declare PoolingLayer and SplitLayer for use in LRNLayer.
-template <typename Dtype> class PoolingLayer;
-template <typename Dtype> class SplitLayer;
+template<typename Dtype> class PoolingLayer;
+template<typename Dtype> class SplitLayer;
 
 /**
  * @brief Normalize the input in a local region across or within feature maps.
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class LRNLayer : public Layer<Dtype> {
- public:
-  explicit LRNLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "LRN"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int size_;
-  int pre_pad_;
-  Dtype alpha_;
-  Dtype beta_;
-  Dtype k_;
-  int num_;
-  int channels_;
-  int height_;
-  int width_;
-
-  // Fields used for normalization ACROSS_CHANNELS
-  // scale_ stores the intermediate summing results
-  Blob<Dtype> scale_;
-
-  // Fields used for normalization WITHIN_CHANNEL
-  shared_ptr<SplitLayer<Dtype> > split_layer_;
-  vector<Blob<Dtype>*> split_top_vec_;
-  shared_ptr<PowerLayer<Dtype> > square_layer_;
-  Blob<Dtype> square_input_;
-  Blob<Dtype> square_output_;
-  vector<Blob<Dtype>*> square_bottom_vec_;
-  vector<Blob<Dtype>*> square_top_vec_;
-  shared_ptr<PoolingLayer<Dtype> > pool_layer_;
-  Blob<Dtype> pool_output_;
-  vector<Blob<Dtype>*> pool_top_vec_;
-  shared_ptr<PowerLayer<Dtype> > power_layer_;
-  Blob<Dtype> power_output_;
-  vector<Blob<Dtype>*> power_top_vec_;
-  shared_ptr<EltwiseLayer<Dtype> > product_layer_;
-  Blob<Dtype> product_input_;
-  vector<Blob<Dtype>*> product_bottom_vec_;
-
-  cl_kernel LFSkernel, LCDkernel, LCOkernel;
+template<typename Dtype>
+class LRNLayer: public Layer<Dtype> {
+	public:
+		explicit LRNLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "LRN";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int size_;
+		int pre_pad_;
+		Dtype alpha_;
+		Dtype beta_;
+		Dtype k_;
+		int num_;
+		int channels_;
+		int height_;
+		int width_;
+
+		// Fields used for normalization ACROSS_CHANNELS
+		// scale_ stores the intermediate summing results
+		Blob<Dtype> scale_;
+
+		// Fields used for normalization WITHIN_CHANNEL
+		shared_ptr<SplitLayer<Dtype> > split_layer_;
+		vector<Blob<Dtype>*> split_top_vec_;
+		shared_ptr<PowerLayer<Dtype> > square_layer_;
+		Blob<Dtype> square_input_;
+		Blob<Dtype> square_output_;
+		vector<Blob<Dtype>*> square_bottom_vec_;
+		vector<Blob<Dtype>*> square_top_vec_;
+		shared_ptr<PoolingLayer<Dtype> > pool_layer_;
+		Blob<Dtype> pool_output_;
+		vector<Blob<Dtype>*> pool_top_vec_;
+		shared_ptr<PowerLayer<Dtype> > power_layer_;
+		Blob<Dtype> power_output_;
+		vector<Blob<Dtype>*> power_top_vec_;
+		shared_ptr<EltwiseLayer<Dtype> > product_layer_;
+		Blob<Dtype> product_input_;
+		vector<Blob<Dtype>*> product_bottom_vec_;
+
+		cl_kernel LFSkernel, LCDkernel, LCOkernel;
 };
 
-
 /*n
  * @brief Pools the input image by taking the max, average, etc. within regions.
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template <typename Dtype>
-class PoolingLayer : public Layer<Dtype> {
- public:
-  explicit PoolingLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "Pooling"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int MinTopBlobs() const { return 1; }
-  // MAX POOL layers can output an extra top blob for the mask;
-  // others can only output the pooled inputs.
-  virtual inline int MaxTopBlobs() const {
-    return (this->layer_param_.pooling_param().pool() ==
-            PoolingParameter_PoolMethod_MAX) ? 2 : 1;
-  }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int kernel_h_, kernel_w_;
-  int stride_h_, stride_w_;
-  int pad_h_, pad_w_;
-  int channels_;
-  int height_, width_;
-  int pooled_height_, pooled_width_;
-  bool global_pooling_;
-  Blob<Dtype> rand_idx_;
-  Blob<int> max_idx_;
+template<typename Dtype>
+class PoolingLayer: public Layer<Dtype> {
+	public:
+		explicit PoolingLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "Pooling";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+		// MAX POOL layers can output an extra top blob for the mask;
+		// others can only output the pooled inputs.
+		virtual inline int MaxTopBlobs() const {
+			return (this->layer_param_.pooling_param().pool() ==
+				PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int kernel_h_, kernel_w_;
+		int stride_h_, stride_w_;
+		int pad_h_, pad_w_;
+		int channels_;
+		int height_, width_;
+		int pooled_height_, pooled_width_;
+		bool global_pooling_;
+		Blob<Dtype> rand_idx_;
+		Blob<int> max_idx_;
 
 };
 
@@ -476,32 +521,32 @@ class PoolingLayer : public Layer<Dtype> {
 /*
  * @brief cuDNN implementation of PoolingLayer.
  *        Fallback to PoolingLayer for CPU mode.
-*/
+ */
 template <typename Dtype>
 class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
- public:
-  explicit CuDNNPoolingLayer(const LayerParameter& param)
-      : PoolingLayer<Dtype>(param), handles_setup_(false) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual ~CuDNNPoolingLayer();
-  // Currently, cuDNN does not support the extra top blob.
-  virtual inline int MinTopBlobs() const { return -1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  bool handles_setup_;
-  cudnnHandle_t             handle_;
-  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
-  cudnnPoolingDescriptor_t  pooling_desc_;
-  cudnnPoolingMode_t        mode_;
+	public:
+	explicit CuDNNPoolingLayer(const LayerParameter& param)
+	: PoolingLayer<Dtype>(param), handles_setup_(false) {}
+	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual ~CuDNNPoolingLayer();
+	// Currently, cuDNN does not support the extra top blob.
+	virtual inline int MinTopBlobs() const {return -1;}
+	virtual inline int ExactNumTopBlobs() const {return 1;}
+
+	protected:
+	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top);
+	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+	bool handles_setup_;
+	cudnnHandle_t handle_;
+	cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+	cudnnPoolingDescriptor_t pooling_desc_;
+	cudnnPoolingMode_t mode_;
 };
 #endif
 
@@ -511,64 +556,71 @@ class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
  *        so that the result vector of different sized
  *        images are of the same size.
  */
-template <typename Dtype>
-class SPPLayer : public Layer<Dtype> {
- public:
-  explicit SPPLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "SPP"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int MinTopBlobs() const { return 1; }
-  // MAX POOL layers can output an extra top blob for the mask;
-  // others can only output the pooled inputs.
-  virtual inline int MaxTopBlobs() const {
-    return (this->layer_param_.pooling_param().pool() ==
-            PoolingParameter_PoolMethod_MAX) ? 2 : 1;
-  }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  // calculates the kernel and stride dimensions for the pooling layer,
-  // returns a correctly configured LayerParameter for a PoolingLayer
-  virtual LayerParameter GetPoolingParam(const int pyramid_level,
-      const int bottom_h, const int bottom_w, const SPPParameter spp_param);
-
-  int pyramid_height_;
-  int bottom_h_, bottom_w_;
-  int channels_;
-  int kernel_h_, kernel_w_;
-  int pad_h_, pad_w_;
-
-  /// the internal Split layer that feeds the pooling layers
-  shared_ptr<SplitLayer<Dtype> > split_layer_;
-  /// top vector holder used in call to the underlying SplitLayer::Forward
-  vector<Blob<Dtype>*> split_top_vec_;
-  /// bottom vector holder used in call to the underlying PoolingLayer::Forward
-  vector<vector<Blob<Dtype>*>*> pooling_bottom_vecs_;
-  /// the internal Pooling layers of different kernel sizes
-  vector<shared_ptr<PoolingLayer<Dtype> > > pooling_layers_;
-  /// top vector holders used in call to the underlying PoolingLayer::Forward
-  vector<vector<Blob<Dtype>*>*> pooling_top_vecs_;
-  /// pooling_outputs stores the outputs of the PoolingLayers
-  vector<Blob<Dtype>*> pooling_outputs_;
-  /// the internal Flatten layers that the Pooling layers feed into
-  vector<FlattenLayer<Dtype>*> flatten_layers_;
-  /// top vector holders used in call to the underlying FlattenLayer::Forward
-  vector<vector<Blob<Dtype>*>*> flatten_top_vecs_;
-  /// flatten_outputs stores the outputs of the FlattenLayers
-  vector<Blob<Dtype>*> flatten_outputs_;
-  /// bottom vector holder used in call to the underlying ConcatLayer::Forward
-  vector<Blob<Dtype>*> concat_bottom_vec_;
-  /// the internal Concat layers that the Flatten layers feed into
-  shared_ptr<ConcatLayer<Dtype> > concat_layer_;
+template<typename Dtype>
+class SPPLayer: public Layer<Dtype> {
+	public:
+		explicit SPPLayer(const LayerParameter& param)
+			: Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "SPP";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int MinTopBlobs() const {
+			return 1;
+		}
+		// MAX POOL layers can output an extra top blob for the mask;
+		// others can only output the pooled inputs.
+		virtual inline int MaxTopBlobs() const {
+			return (this->layer_param_.pooling_param().pool() ==
+				PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+			const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		// calculates the kernel and stride dimensions for the pooling layer,
+		// returns a correctly configured LayerParameter for a PoolingLayer
+		virtual LayerParameter GetPoolingParam(const int pyramid_level,
+			const int bottom_h, const int bottom_w, const SPPParameter spp_param);
+
+		int pyramid_height_;
+		int bottom_h_, bottom_w_;
+		int channels_;
+		int kernel_h_, kernel_w_;
+		int pad_h_, pad_w_;
+
+		/// the internal Split layer that feeds the pooling layers
+		shared_ptr<SplitLayer<Dtype> > split_layer_;
+		/// top vector holder used in call to the underlying SplitLayer::Forward
+		vector<Blob<Dtype>*> split_top_vec_;
+		/// bottom vector holder used in call to the underlying PoolingLayer::Forward
+		vector<vector<Blob<Dtype>*>*> pooling_bottom_vecs_;
+		/// the internal Pooling layers of different kernel sizes
+		vector<shared_ptr<PoolingLayer<Dtype> > > pooling_layers_;
+		/// top vector holders used in call to the underlying PoolingLayer::Forward
+		vector<vector<Blob<Dtype>*>*> pooling_top_vecs_;
+		/// pooling_outputs stores the outputs of the PoolingLayers
+		vector<Blob<Dtype>*> pooling_outputs_;
+		/// the internal Flatten layers that the Pooling layers feed into
+		vector<FlattenLayer<Dtype>*> flatten_layers_;
+		/// top vector holders used in call to the underlying FlattenLayer::Forward
+		vector<vector<Blob<Dtype>*>*> flatten_top_vecs_;
+		/// flatten_outputs stores the outputs of the FlattenLayers
+		vector<Blob<Dtype>*> flatten_outputs_;
+		/// bottom vector holder used in call to the underlying ConcatLayer::Forward
+		vector<Blob<Dtype>*> concat_bottom_vec_;
+		/// the internal Concat layers that the Flatten layers feed into
+		shared_ptr<ConcatLayer<Dtype> > concat_layer_;
 };
 
 }  // namespace caffe
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 4cec89ae..e7d129bb 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -8,494 +8,510 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::Reshape(const int num, const int channels, const int height,
-    const int width) {
-  vector<int> shape(4);
-  shape[0] = num;
-  shape[1] = channels;
-  shape[2] = height;
-  shape[3] = width;
-  Reshape(shape);
+	const int width) {
+	vector<int> shape(4);
+	shape[0] = num;
+	shape[1] = channels;
+	shape[2] = height;
+	shape[3] = width;
+	Reshape(shape);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::Reshape(const vector<int>& shape) {
-  CHECK_LE(shape.size(), kMaxBlobAxes);
-  count_ = 1;
-  shape_.resize(shape.size());
-  for (int i = 0; i < shape.size(); ++i) {
-    CHECK_GE(shape[i], 0);
-    CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
-    count_ *= shape[i];
-    shape_[i] = shape[i];
-  }
-  if (count_ > capacity_) {
-    capacity_ = count_;
-    data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
-    diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
-  }
-}
-
-template <typename Dtype>
+	CHECK_LE(shape.size(), kMaxBlobAxes);
+	count_ = 1;
+	shape_.resize(shape.size());
+	for (int i = 0; i < shape.size(); ++i) {
+		CHECK_GE(shape[i], 0);
+		CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
+		count_ *= shape[i];
+		shape_[i] = shape[i];
+	}
+	if (count_ > capacity_) {
+		capacity_ = count_;
+		data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
+		diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
+	}
+}
+
+template<typename Dtype>
 void Blob<Dtype>::Reshape(const BlobShape& shape) {
-  CHECK_LE(shape.dim_size(), kMaxBlobAxes);
-  vector<int> shape_vec(shape.dim_size());
-  for (int i = 0; i < shape.dim_size(); ++i) {
-    shape_vec[i] = shape.dim(i);
-  }
-  Reshape(shape_vec);
+	CHECK_LE(shape.dim_size(), kMaxBlobAxes);
+	vector<int> shape_vec(shape.dim_size());
+	for (int i = 0; i < shape.dim_size(); ++i) {
+		shape_vec[i] = shape.dim(i);
+	}
+	Reshape(shape_vec);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::ReshapeLike(const Blob<Dtype>& other) {
-  Reshape(other.shape());
+	Reshape(other.shape());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Blob<Dtype>::Blob(const int num, const int channels, const int height,
-    const int width)
-  // capacity_ must be initialized before calling Reshape
-  : capacity_(0) {
-  Reshape(num, channels, height, width);
+	const int width)
+	// capacity_ must be initialized before calling Reshape
+	: capacity_(0) {
+	Reshape(num, channels, height, width);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Blob<Dtype>::Blob(const vector<int>& shape)
-  // capacity_ must be initialized before calling Reshape
-  : capacity_(0) {
-  Reshape(shape);
+	// capacity_ must be initialized before calling Reshape
+	: capacity_(0) {
+	Reshape(shape);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 const Dtype* Blob<Dtype>::cpu_data() const {
-  CHECK(data_);
-  return (const Dtype*)data_->cpu_data();
+	CHECK (data_);
+	return (const Dtype*) data_->cpu_data();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::set_cpu_data(Dtype* data) {
-  CHECK(data);
-  data_->set_cpu_data(data);
+	CHECK(data);
+	data_->set_cpu_data(data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 const Dtype* Blob<Dtype>::gpu_data() const {
-  CHECK(data_);
-  return (const Dtype*)data_->gpu_data();
+	CHECK (data_);
+	return (const Dtype*) data_->gpu_data();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 const Dtype* Blob<Dtype>::gpu_cache_data() const {
-  CHECK(data_);
-  return (const Dtype*)data_->gpu_cache_data();
+	CHECK (data_);
+	return (const Dtype*) data_->gpu_cache_data();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 const Dtype* Blob<Dtype>::cpu_diff() const {
-  CHECK(diff_);
-  return (const Dtype*)diff_->cpu_data();
+	CHECK (diff_);
+	return (const Dtype*) diff_->cpu_data();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 const Dtype* Blob<Dtype>::gpu_diff() const {
-  CHECK(diff_);
-  return (const Dtype*)diff_->gpu_data();
+	CHECK (diff_);
+	return (const Dtype*) diff_->gpu_data();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype* Blob<Dtype>::mutable_cpu_data() {
-  CHECK(data_);
-  return static_cast<Dtype*>(data_->mutable_cpu_data());
+	CHECK (data_);
+	return static_cast<Dtype*>(data_->mutable_cpu_data());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype* Blob<Dtype>::mutable_gpu_data() {
-  CHECK(data_);
-  return static_cast<Dtype*>(data_->mutable_gpu_data());
+	CHECK (data_);
+	return static_cast<Dtype*>(data_->mutable_gpu_data());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype* Blob<Dtype>::mutable_cpu_diff() {
-  CHECK(diff_);
-  return static_cast<Dtype*>(diff_->mutable_cpu_data());
+	CHECK (diff_);
+	return static_cast<Dtype*>(diff_->mutable_cpu_data());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype* Blob<Dtype>::mutable_gpu_diff() {
-  CHECK(diff_);
-  return static_cast<Dtype*>(diff_->mutable_gpu_data());
+	CHECK (diff_);
+	return static_cast<Dtype*>(diff_->mutable_gpu_data());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::ShareData(const Blob& other) {
-  CHECK_EQ(count_, other.count());
-  data_ = other.data();
+	CHECK_EQ(count_, other.count());
+	data_ = other.data();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::ShareDiff(const Blob& other) {
-  CHECK_EQ(count_, other.count());
-  diff_ = other.diff();
+	CHECK_EQ(count_, other.count());
+	diff_ = other.diff();
 }
 
 // The "update" method is used for parameter blobs in a Net, which are stored
 // as Blob<float> or Blob<double> -- hence we do not define it for
 // Blob<int> or Blob<unsigned int>.
-template <> void Blob<unsigned int>::Update() { NOT_IMPLEMENTED; }
-template <> void Blob<int>::Update() { NOT_IMPLEMENTED; }
+template<> void Blob<unsigned int>::Update() {
+	NOT_IMPLEMENTED;
+}
+template<> void Blob<int>::Update() {
+	NOT_IMPLEMENTED;
+}
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::Update() {
-  // We will perform update based on where the data is located.
-  switch (data_->head()) {
-  case SyncedMemory::HEAD_AT_CPU:
-    // perform computation on CPU
-    caffe_axpy<Dtype>(count_, Dtype(-1),
-        static_cast<const Dtype*>(diff_->cpu_data()),
-        static_cast<Dtype*>(data_->mutable_cpu_data()));
-    break;
-  case SyncedMemory::HEAD_AT_GPU:
-  case SyncedMemory::SYNCED:
-#ifndef CPU_ONLY
-    // perform computation on GPU
-    caffe_gpu_axpy<Dtype>(count_, Dtype(-1),
-        static_cast<const Dtype*>(diff_->gpu_data()),
-        static_cast<Dtype*>(data_->mutable_gpu_data()));
+	// We will perform update based on where the data is located.
+	switch (data_->head()) {
+		case SyncedMemory::HEAD_AT_CPU:
+			// perform computation on CPU
+			caffe_axpy < Dtype > (count_, Dtype(-1),
+				static_cast<const Dtype*>(diff_->cpu_data()),
+				static_cast<Dtype*>(data_->mutable_cpu_data()));
+			break;
+		case SyncedMemory::HEAD_AT_GPU:
+			case SyncedMemory::SYNCED:
+			#ifndef CPU_ONLY
+			// perform computation on GPU
+			caffe_gpu_axpy < Dtype > (count_, Dtype(-1),
+				static_cast<const Dtype*>(diff_->gpu_data()),
+				static_cast<Dtype*>(data_->mutable_gpu_data()));
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-    break;
-  default:
-    LOG(FATAL) << "Syncedmem not initialized.";
-  }
+			break;
+		default:
+			LOG(FATAL) << "Syncedmem not initialized.";
+	}
 }
 
-template <> unsigned int Blob<unsigned int>::asum_data() const {
-  NOT_IMPLEMENTED;
-  return 0;
+template<> unsigned int Blob<unsigned int>::asum_data() const {
+	NOT_IMPLEMENTED;
+	return 0;
 }
 
-template <> int Blob<int>::asum_data() const {
-  NOT_IMPLEMENTED;
-  return 0;
+template<> int Blob<int>::asum_data() const {
+	NOT_IMPLEMENTED;
+	return 0;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype Blob<Dtype>::asum_data() const {
-  if (!data_) { return 0; }
-  switch (data_->head()) {
-  case SyncedMemory::HEAD_AT_CPU:
-    return caffe_cpu_asum(count_, cpu_data());
-  case SyncedMemory::HEAD_AT_GPU:
-  case SyncedMemory::SYNCED:
-#ifndef CPU_ONLY
-  {
-    Dtype asum;
-    caffe_gpu_asum(count_, gpu_data(), &asum);
-    return asum;
-  }
+	if (!data_) {
+		return 0;
+	}
+	switch (data_->head()) {
+		case SyncedMemory::HEAD_AT_CPU:
+			return caffe_cpu_asum(count_, cpu_data());
+		case SyncedMemory::HEAD_AT_GPU:
+			case SyncedMemory::SYNCED:
+			#ifndef CPU_ONLY
+		{
+			Dtype asum;
+			caffe_gpu_asum(count_, gpu_data(), &asum);
+			return asum;
+		}
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-  case SyncedMemory::UNINITIALIZED:
-    return 0;
-  default:
-    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
-  }
-  return 0;
+		case SyncedMemory::UNINITIALIZED:
+			return 0;
+		default:
+			LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+	}
+	return 0;
 }
 
-template <> unsigned int Blob<unsigned int>::asum_diff() const {
-  NOT_IMPLEMENTED;
-  return 0;
+template<> unsigned int Blob<unsigned int>::asum_diff() const {
+	NOT_IMPLEMENTED;
+	return 0;
 }
 
-template <> int Blob<int>::asum_diff() const {
-  NOT_IMPLEMENTED;
-  return 0;
+template<> int Blob<int>::asum_diff() const {
+	NOT_IMPLEMENTED;
+	return 0;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype Blob<Dtype>::asum_diff() const {
-  if (!diff_) { return 0; }
-  switch (diff_->head()) {
-  case SyncedMemory::HEAD_AT_CPU:
-    return caffe_cpu_asum(count_, cpu_diff());
-  case SyncedMemory::HEAD_AT_GPU:
-  case SyncedMemory::SYNCED:
-#ifndef CPU_ONLY
-  {
-    Dtype asum;
-    caffe_gpu_asum(count_, gpu_diff(), &asum);
-    return asum;
-  }
+	if (!diff_) {
+		return 0;
+	}
+	switch (diff_->head()) {
+		case SyncedMemory::HEAD_AT_CPU:
+			return caffe_cpu_asum(count_, cpu_diff());
+		case SyncedMemory::HEAD_AT_GPU:
+			case SyncedMemory::SYNCED:
+			#ifndef CPU_ONLY
+		{
+			Dtype asum;
+			caffe_gpu_asum(count_, gpu_diff(), &asum);
+			return asum;
+		}
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-  case SyncedMemory::UNINITIALIZED:
-    return 0;
-  default:
-    LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
-  }
-  return 0;
+		case SyncedMemory::UNINITIALIZED:
+			return 0;
+		default:
+			LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
+	}
+	return 0;
 }
 
-template <> unsigned int Blob<unsigned int>::sumsq_data() const {
-  NOT_IMPLEMENTED;
-  return 0;
+template<> unsigned int Blob<unsigned int>::sumsq_data() const {
+	NOT_IMPLEMENTED;
+	return 0;
 }
 
-template <> int Blob<int>::sumsq_data() const {
-  NOT_IMPLEMENTED;
-  return 0;
+template<> int Blob<int>::sumsq_data() const {
+	NOT_IMPLEMENTED;
+	return 0;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype Blob<Dtype>::sumsq_data() const {
-  Dtype sumsq;
-  const Dtype* data;
-  if (!data_) { return 0; }
-  switch (data_->head()) {
-  case SyncedMemory::HEAD_AT_CPU:
-    data = cpu_data();
-    sumsq = caffe_cpu_dot(count_, data, data);
-    break;
-  case SyncedMemory::HEAD_AT_GPU:
-  case SyncedMemory::SYNCED:
-#ifndef CPU_ONLY
-    data = gpu_data();
-    caffe_gpu_dot(count_, data, data, &sumsq);
+	Dtype sumsq;
+	const Dtype* data;
+	if (!data_) {
+		return 0;
+	}
+	switch (data_->head()) {
+		case SyncedMemory::HEAD_AT_CPU:
+			data = cpu_data();
+			sumsq = caffe_cpu_dot(count_, data, data);
+			break;
+		case SyncedMemory::HEAD_AT_GPU:
+			case SyncedMemory::SYNCED:
+			#ifndef CPU_ONLY
+			data = gpu_data();
+			caffe_gpu_dot(count_, data, data, &sumsq);
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-    break;
-  case SyncedMemory::UNINITIALIZED:
-    return 0;
-  default:
-    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
-  }
-  return sumsq;
+			break;
+		case SyncedMemory::UNINITIALIZED:
+			return 0;
+		default:
+			LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+	}
+	return sumsq;
 }
 
-template <> unsigned int Blob<unsigned int>::sumsq_diff() const {
-  NOT_IMPLEMENTED;
-  return 0;
+template<> unsigned int Blob<unsigned int>::sumsq_diff() const {
+	NOT_IMPLEMENTED;
+	return 0;
 }
 
-template <> int Blob<int>::sumsq_diff() const {
-  NOT_IMPLEMENTED;
-  return 0;
+template<> int Blob<int>::sumsq_diff() const {
+	NOT_IMPLEMENTED;
+	return 0;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype Blob<Dtype>::sumsq_diff() const {
-  Dtype sumsq;
-  const Dtype* diff;
-  if (!diff_) { return 0; }
-  switch (diff_->head()) {
-  case SyncedMemory::HEAD_AT_CPU:
-    diff = cpu_diff();
-    sumsq = caffe_cpu_dot(count_, diff, diff);
-    break;
-  case SyncedMemory::HEAD_AT_GPU:
-  case SyncedMemory::SYNCED:
-#ifndef CPU_ONLY
-    diff = gpu_diff();
-    caffe_gpu_dot(count_, diff, diff, &sumsq);
-    break;
+	Dtype sumsq;
+	const Dtype* diff;
+	if (!diff_) {
+		return 0;
+	}
+	switch (diff_->head()) {
+		case SyncedMemory::HEAD_AT_CPU:
+			diff = cpu_diff();
+			sumsq = caffe_cpu_dot(count_, diff, diff);
+			break;
+		case SyncedMemory::HEAD_AT_GPU:
+			case SyncedMemory::SYNCED:
+			#ifndef CPU_ONLY
+			diff = gpu_diff();
+			caffe_gpu_dot(count_, diff, diff, &sumsq);
+			break;
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-  case SyncedMemory::UNINITIALIZED:
-    return 0;
-  default:
-    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
-  }
-  return sumsq;
+		case SyncedMemory::UNINITIALIZED:
+			return 0;
+		default:
+			LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+	}
+	return sumsq;
 }
 
-template <> void Blob<unsigned int>::scale_data(unsigned int scale_factor) {
-  NOT_IMPLEMENTED;
+template<> void Blob<unsigned int>::scale_data(unsigned int scale_factor) {
+	NOT_IMPLEMENTED;
 }
 
-template <> void Blob<int>::scale_data(int scale_factor) {
-  NOT_IMPLEMENTED;
+template<> void Blob<int>::scale_data(int scale_factor) {
+	NOT_IMPLEMENTED;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::scale_data(Dtype scale_factor) {
-  Dtype* data;
-  if (!data_) { return; }
-  switch (data_->head()) {
-  case SyncedMemory::HEAD_AT_CPU:
-    data = mutable_cpu_data();
-    caffe_scal(count_, scale_factor, data);
-    return;
-  case SyncedMemory::HEAD_AT_GPU:
-  case SyncedMemory::SYNCED:
-#ifndef CPU_ONLY
-    data = mutable_gpu_data();
-    caffe_gpu_scal(count_, scale_factor, data);
-    return;
+	Dtype* data;
+	if (!data_) {
+		return;
+	}
+	switch (data_->head()) {
+		case SyncedMemory::HEAD_AT_CPU:
+			data = mutable_cpu_data();
+			caffe_scal(count_, scale_factor, data);
+			return;
+		case SyncedMemory::HEAD_AT_GPU:
+			case SyncedMemory::SYNCED:
+			#ifndef CPU_ONLY
+			data = mutable_gpu_data();
+			caffe_gpu_scal(count_, scale_factor, data);
+			return;
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-  case SyncedMemory::UNINITIALIZED:
-    return;
-  default:
-    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
-  }
+		case SyncedMemory::UNINITIALIZED:
+			return;
+		default:
+			LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+	}
 }
 
-template <> void Blob<unsigned int>::scale_diff(unsigned int scale_factor) {
-  NOT_IMPLEMENTED;
+template<> void Blob<unsigned int>::scale_diff(unsigned int scale_factor) {
+	NOT_IMPLEMENTED;
 }
 
-template <> void Blob<int>::scale_diff(int scale_factor) {
-  NOT_IMPLEMENTED;
+template<> void Blob<int>::scale_diff(int scale_factor) {
+	NOT_IMPLEMENTED;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Blob<Dtype>::scale_diff(Dtype scale_factor) {
-  Dtype* diff;
-  if (!diff_) { return; }
-  switch (diff_->head()) {
-  case SyncedMemory::HEAD_AT_CPU:
-    diff = mutable_cpu_diff();
-    caffe_scal(count_, scale_factor, diff);
-    return;
-  case SyncedMemory::HEAD_AT_GPU:
-  case SyncedMemory::SYNCED:
-#ifndef CPU_ONLY
-    diff = mutable_gpu_diff();
-    caffe_gpu_scal(count_, scale_factor, diff);
-    return;
+	Dtype* diff;
+	if (!diff_) {
+		return;
+	}
+	switch (diff_->head()) {
+		case SyncedMemory::HEAD_AT_CPU:
+			diff = mutable_cpu_diff();
+			caffe_scal(count_, scale_factor, diff);
+			return;
+		case SyncedMemory::HEAD_AT_GPU:
+			case SyncedMemory::SYNCED:
+			#ifndef CPU_ONLY
+			diff = mutable_gpu_diff();
+			caffe_gpu_scal(count_, scale_factor, diff);
+			return;
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-  case SyncedMemory::UNINITIALIZED:
-    return;
-  default:
-    LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
-  }
+		case SyncedMemory::UNINITIALIZED:
+			return;
+		default:
+			LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 bool Blob<Dtype>::ShapeEquals(const BlobProto& other) {
-  if (other.has_num() || other.has_channels() ||
-      other.has_height() || other.has_width()) {
-    // Using deprecated 4D Blob dimensions --
-    // shape is (num, channels, height, width).
-    // Note: we do not use the normal Blob::num(), Blob::channels(), etc.
-    // methods as these index from the beginning of the blob shape, where legacy
-    // parameter blobs were indexed from the end of the blob shape (e.g., bias
-    // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)).
-    return shape_.size() <= 4 &&
-           LegacyShape(-4) == other.num() &&
-           LegacyShape(-3) == other.channels() &&
-           LegacyShape(-2) == other.height() &&
-           LegacyShape(-1) == other.width();
-  }
-  vector<int> other_shape(other.shape().dim_size());
-  for (int i = 0; i < other.shape().dim_size(); ++i) {
-    other_shape[i] = other.shape().dim(i);
-  }
-  return shape_ == other_shape;
-}
-
-template <typename Dtype>
+	if (other.has_num() || other.has_channels() ||
+		other.has_height() || other.has_width()) {
+		// Using deprecated 4D Blob dimensions --
+		// shape is (num, channels, height, width).
+		// Note: we do not use the normal Blob::num(), Blob::channels(), etc.
+		// methods as these index from the beginning of the blob shape, where legacy
+		// parameter blobs were indexed from the end of the blob shape (e.g., bias
+		// Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)).
+		return shape_.size() <= 4 &&
+			LegacyShape(-4) == other.num() &&
+			LegacyShape(-3) == other.channels() &&
+			LegacyShape(-2) == other.height() &&
+			LegacyShape(-1) == other.width();
+	}
+	vector<int> other_shape(other.shape().dim_size());
+	for (int i = 0; i < other.shape().dim_size(); ++i) {
+		other_shape[i] = other.shape().dim(i);
+	}
+	return shape_ == other_shape;
+}
+
+template<typename Dtype>
 void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
-  if (source.count() != count_ || source.shape() != shape_) {
-    if (reshape) {
-      ReshapeLike(source);
-    } else {
-      LOG(FATAL) << "Trying to copy blobs of different sizes.";
-    }
-  }
-  switch (Caffe::mode()) {
-  case Caffe::GPU:
-    if (copy_diff) {
-      caffe_copy(count_, source.gpu_diff(),
-          static_cast<Dtype*>(diff_->mutable_gpu_data()));
-    } else {
-      caffe_copy(count_, source.gpu_data(),
-          static_cast<Dtype*>(data_->mutable_gpu_data()));
-    }
-    break;
-  case Caffe::CPU:
-    if (copy_diff) {
-      caffe_copy(count_, source.cpu_diff(),
-          static_cast<Dtype*>(diff_->mutable_cpu_data()));
-    } else {
-      caffe_copy(count_, source.cpu_data(),
-          static_cast<Dtype*>(data_->mutable_cpu_data()));
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown caffe mode.";
-  }
-}
-
-template <typename Dtype>
+	if (source.count() != count_ || source.shape() != shape_) {
+		if (reshape) {
+			ReshapeLike(source);
+		} else {
+			LOG(FATAL) << "Trying to copy blobs of different sizes.";
+		}
+	}
+	switch (Caffe::mode()) {
+		case Caffe::GPU:
+			if (copy_diff) {
+				caffe_copy(count_, source.gpu_diff(),
+					static_cast<Dtype*>(diff_->mutable_gpu_data()));
+			} else {
+				caffe_copy(count_, source.gpu_data(),
+					static_cast<Dtype*>(data_->mutable_gpu_data()));
+			}
+			break;
+		case Caffe::CPU:
+			if (copy_diff) {
+				caffe_copy(count_, source.cpu_diff(),
+					static_cast<Dtype*>(diff_->mutable_cpu_data()));
+			} else {
+				caffe_copy(count_, source.cpu_data(),
+					static_cast<Dtype*>(data_->mutable_cpu_data()));
+			}
+			break;
+		default:
+			LOG(FATAL) << "Unknown caffe mode.";
+	}
+}
+
+template<typename Dtype>
 void Blob<Dtype>::FromProto(const BlobProto& proto, bool reshape) {
-  if (reshape) {
-    vector<int> shape;
-    if (proto.has_num() || proto.has_channels() ||
-        proto.has_height() || proto.has_width()) {
-      // Using deprecated 4D Blob dimensions --
-      // shape is (num, channels, height, width).
-      shape.resize(4);
-      shape[0] = proto.num();
-      shape[1] = proto.channels();
-      shape[2] = proto.height();
-      shape[3] = proto.width();
-    } else {
-      shape.resize(proto.shape().dim_size());
-      for (int i = 0; i < proto.shape().dim_size(); ++i) {
-        shape[i] = proto.shape().dim(i);
-      }
-    }
-    Reshape(shape);
-  } else {
-    CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)";
-  }
-  // copy data
-  Dtype* data_vec = mutable_cpu_data();
-  for (int i = 0; i < count_; ++i) {
-    data_vec[i] = proto.data(i);
-  }
-  if (proto.diff_size() > 0) {
-    Dtype* diff_vec = mutable_cpu_diff();
-    for (int i = 0; i < count_; ++i) {
-      diff_vec[i] = proto.diff(i);
-    }
-  }
-}
-
-template <typename Dtype>
+	if (reshape) {
+		vector<int> shape;
+		if (proto.has_num() || proto.has_channels() ||
+			proto.has_height() || proto.has_width()) {
+			// Using deprecated 4D Blob dimensions --
+			// shape is (num, channels, height, width).
+			shape.resize(4);
+			shape[0] = proto.num();
+			shape[1] = proto.channels();
+			shape[2] = proto.height();
+			shape[3] = proto.width();
+		} else {
+			shape.resize(proto.shape().dim_size());
+			for (int i = 0; i < proto.shape().dim_size(); ++i) {
+				shape[i] = proto.shape().dim(i);
+			}
+		}
+		Reshape(shape);
+	} else {
+		CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)";
+	}
+	// copy data
+	Dtype* data_vec = mutable_cpu_data();
+	for (int i = 0; i < count_; ++i) {
+		data_vec[i] = proto.data(i);
+	}
+	if (proto.diff_size() > 0) {
+		Dtype* diff_vec = mutable_cpu_diff();
+		for (int i = 0; i < count_; ++i) {
+			diff_vec[i] = proto.diff(i);
+		}
+	}
+}
+
+template<typename Dtype>
 void Blob<Dtype>::ToProto(BlobProto* proto, bool write_diff) const {
-  proto->clear_shape();
-  for (int i = 0; i < shape_.size(); ++i) {
-    proto->mutable_shape()->add_dim(shape_[i]);
-  }
-  proto->clear_data();
-  proto->clear_diff();
-  const Dtype* data_vec = cpu_data();
-  for (int i = 0; i < count_; ++i) {
-    proto->add_data(data_vec[i]);
-  }
-  if (write_diff) {
-    const Dtype* diff_vec = cpu_diff();
-    for (int i = 0; i < count_; ++i) {
-      proto->add_diff(diff_vec[i]);
-    }
-  }
-}
-
-INSTANTIATE_CLASS(Blob);
-template class Blob<int>;
-template class Blob<unsigned int>;
+	proto->clear_shape();
+	for (int i = 0; i < shape_.size(); ++i) {
+		proto->mutable_shape()->add_dim(shape_[i]);
+	}
+	proto->clear_data();
+	proto->clear_diff();
+	const Dtype* data_vec = cpu_data();
+	for (int i = 0; i < count_; ++i) {
+		proto->add_data(data_vec[i]);
+	}
+	if (write_diff) {
+		const Dtype* diff_vec = cpu_diff();
+		for (int i = 0; i < count_; ++i) {
+			proto->add_diff(diff_vec[i]);
+		}
+	}
+}
+
+INSTANTIATE_CLASS (Blob);
+template class Blob<int> ;
+template class Blob<unsigned int> ;
 
 }  // namespace caffe
 
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index c1d26ab8..a6ea3a57 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -11,135 +11,142 @@ shared_ptr<Caffe> Caffe::singleton_;
 
 // random seeding
 int64_t cluster_seedgen(void) {
- //To fix: for now we use fixed seed to get same result each time
-/*
-  int64_t s, seed, pid;
-  FILE* f = fopen("/dev/urandom", "rb");
-  if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) {
-    fclose(f);
-    return seed;
-  }
-
-  LOG(INFO) << "System entropy source not available, "
-              "using fallback algorithm to generate seed instead.";
-  if (f)
-    fclose(f);
-
-  pid = getpid();
-  s = time(NULL);
-  seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729);
-  //return seed;
-  LOG(WARNING) << "return fixed seed 37";
-*/ 
- return 37;
+	//To fix: for now we use fixed seed to get same result each time
+	/*
+	 int64_t s, seed, pid;
+	 FILE* f = fopen("/dev/urandom", "rb");
+	 if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) {
+	 fclose(f);
+	 return seed;
+	 }
+
+	 LOG(INFO) << "System entropy source not available, "
+	 "using fallback algorithm to generate seed instead.";
+	 if (f)
+	 fclose(f);
+
+	 pid = getpid();
+	 s = time(NULL);
+	 seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729);
+	 //return seed;
+	 LOG(WARNING) << "return fixed seed 37";
+	 */
+	return 37;
 }
 
-
 void GlobalInit(int* pargc, char*** pargv) {
-  // Google flags.
-  ::gflags::ParseCommandLineFlags(pargc, pargv, true);
-  // Google logging.
-  ::google::InitGoogleLogging(*(pargv)[0]);
-  // Provide a backtrace on segfault.
-  ::google::InstallFailureSignalHandler();
+	// Google flags.
+	::gflags::ParseCommandLineFlags(pargc, pargv, true);
+	// Google logging.
+	::google::InitGoogleLogging(*(pargv)[0]);
+	// Provide a backtrace on segfault.
+	::google::InstallFailureSignalHandler();
 }
 
 #ifdef CPU_ONLY  // CPU-only Caffe.
 
 Caffe::Caffe()
-    : random_generator_(), mode_(Caffe::CPU) {
- }
+: random_generator_(), mode_(Caffe::CPU) {
+}
 
-Caffe::~Caffe() { 
+Caffe::~Caffe() {
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
-  // RNG seed
-  Get().random_generator_.reset(new RNG(seed));
+	// RNG seed
+	Get().random_generator_.reset(new RNG(seed));
 }
 
 void Caffe::SetDevice(const int device_id) {
-  NO_GPU;
+	NO_GPU;
 }
 
 void Caffe::DeviceQuery() {
-  NO_GPU;
+	NO_GPU;
 }
 
-
 class Caffe::RNG::Generator {
- public:
-  Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {}
-  explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {}
-  caffe::rng_t* rng() { return rng_.get(); }
- private:
-  shared_ptr<caffe::rng_t> rng_;
+	public:
+	Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {}
+	explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {}
+	caffe::rng_t* rng() {return rng_.get();}
+	private:
+	shared_ptr<caffe::rng_t> rng_;
 };
 
-Caffe::RNG::RNG() : generator_(new Generator()) { }
+Caffe::RNG::RNG() : generator_(new Generator()) {}
 
-Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { }
+Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) {}
 
 Caffe::RNG& Caffe::RNG::operator=(const RNG& other) {
-  generator_ = other.generator_;
-  return *this;
+	generator_ = other.generator_;
+	return *this;
 }
 
 void* Caffe::RNG::generator() {
-  return static_cast<void*>(generator_->rng());
+	return static_cast<void*>(generator_->rng());
 }
 
 #else  // Normal GPU + CPU Caffe.
 
 Caffe::Caffe()
 {
-   cl_int err =  clblasSetup();
-   if(err != CL_SUCCESS){
-       LOG(ERROR) << "clBLAS setup failed "<<err;
-   }
+	cl_int err = clblasSetup();
+	if (err != CL_SUCCESS) {
+		LOG(ERROR) << "clBLAS setup failed " << err;
+	}
 }
 
 Caffe::~Caffe() {
-   clblasTeardown();
+	clblasTeardown();
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
-  // RNG seed
-  Get().random_generator_.reset(new RNG(seed));
+	// RNG seed
+	Get().random_generator_.reset(new RNG(seed));
 }
 
 void Caffe::SetDevice(const int device_id) {
-  if (amdDevice.GetDevice() == device_id) {
-    return;
-  }
-  amdDevice.Init(device_id);
+	if (amdDevice.GetDevice() == device_id) {
+		return;
+	}
+	amdDevice.Init(device_id);
 }
 
 void Caffe::DeviceQuery() {
-  amdDevice.DeviceQuery();
+	amdDevice.DeviceQuery();
 }
 
-
 class Caffe::RNG::Generator {
- public:
-  Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {}
-  explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {}
-  caffe::rng_t* rng() { return rng_.get(); }
- private:
-  shared_ptr<caffe::rng_t> rng_;
+	public:
+		Generator()
+			: rng_(new caffe::rng_t(cluster_seedgen())) {
+		}
+		explicit Generator(unsigned int seed)
+			: rng_(new caffe::rng_t(seed)) {
+		}
+		caffe::rng_t* rng() {
+			return rng_.get();
+		}
+	private:
+		shared_ptr<caffe::rng_t> rng_;
 };
 
-Caffe::RNG::RNG() : generator_(new Generator()) { }
+Caffe::RNG::RNG()
+	: generator_(new Generator()) {
+}
 
-Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { }
+Caffe::RNG::RNG(unsigned int seed)
+	: generator_(new Generator(seed)) {
+}
 
 Caffe::RNG& Caffe::RNG::operator=(const RNG& other) {
-  generator_.reset(other.generator_.get());
-  return *this;
+	generator_.reset(other.generator_.get());
+	return *this;
 }
 
 void* Caffe::RNG::generator() {
-  return static_cast<void*>(generator_->rng());
+	return static_cast<void*>(generator_->rng());
 }
 
 #endif  // CPU_ONLY
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index f6d80dc2..892d758d 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -12,519 +12,518 @@ namespace caffe {
 
 template<typename Dtype>
 DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
-    Phase phase)
-    : param_(param), phase_(phase) {
-  // check if we want to use mean_file
-  if (param_.has_mean_file()) {
-    CHECK_EQ(param_.mean_value_size(), 0) <<
-      "Cannot specify mean_file and mean_value at the same time";
-    const string& mean_file = param.mean_file();
-    LOG(INFO) << "Loading mean file from: " << mean_file;
-    BlobProto blob_proto;
-    ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
-    data_mean_.FromProto(blob_proto);
-  }
-  // check if we want to use mean_value
-  if (param_.mean_value_size() > 0) {
-    CHECK(param_.has_mean_file() == false) <<
-      "Cannot specify mean_file and mean_value at the same time";
-    for (int c = 0; c < param_.mean_value_size(); ++c) {
-      mean_values_.push_back(param_.mean_value(c));
-    }
-  }
+	Phase phase)
+	: param_(param), phase_(phase) {
+	// check if we want to use mean_file
+	if (param_.has_mean_file()) {
+		CHECK_EQ(param_.mean_value_size(), 0) <<
+			"Cannot specify mean_file and mean_value at the same time";
+		const string& mean_file = param.mean_file();
+		LOG(INFO) << "Loading mean file from: " << mean_file;
+		BlobProto blob_proto;
+		ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
+		data_mean_.FromProto(blob_proto);
+	}
+	// check if we want to use mean_value
+	if (param_.mean_value_size() > 0) {
+		CHECK(param_.has_mean_file() == false) <<
+			"Cannot specify mean_file and mean_value at the same time";
+		for (int c = 0; c < param_.mean_value_size(); ++c) {
+			mean_values_.push_back(param_.mean_value(c));
+		}
+	}
 }
 
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
-                                       Dtype* transformed_data) {
-  const string& data = datum.data();
-  const int datum_channels = datum.channels();
-  const int datum_height = datum.height();
-  const int datum_width = datum.width();
-
-  const int crop_size = param_.crop_size();
-  const Dtype scale = param_.scale();
-  const bool do_mirror = param_.mirror() && Rand(2);
-  const bool has_mean_file = param_.has_mean_file();
-  const bool has_uint8 = data.size() > 0;
-  const bool has_mean_values = mean_values_.size() > 0;
-
-  CHECK_GT(datum_channels, 0);
-  CHECK_GE(datum_height, crop_size);
-  CHECK_GE(datum_width, crop_size);
-
-  Dtype* mean = NULL;
-  if (has_mean_file) {
-    CHECK_EQ(datum_channels, data_mean_.channels());
-    CHECK_EQ(datum_height, data_mean_.height());
-    CHECK_EQ(datum_width, data_mean_.width());
-    mean = data_mean_.mutable_cpu_data();
-  }
-  if (has_mean_values) {
-    CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) <<
-     "Specify either 1 mean_value or as many as channels: " << datum_channels;
-    if (datum_channels > 1 && mean_values_.size() == 1) {
-      // Replicate the mean_value for simplicity
-      for (int c = 1; c < datum_channels; ++c) {
-        mean_values_.push_back(mean_values_[0]);
-      }
-    }
-  }
-
-  int height = datum_height;
-  int width = datum_width;
-
-  int h_off = 0;
-  int w_off = 0;
-  if (crop_size) {
-    height = crop_size;
-    width = crop_size;
-    // We only do random crop when we do training.
-    if (phase_ == TRAIN) {
-      h_off = Rand(datum_height - crop_size + 1);
-      w_off = Rand(datum_width - crop_size + 1);
-    } else {
-      h_off = (datum_height - crop_size) / 2;
-      w_off = (datum_width - crop_size) / 2;
-    }
-  }
-
-  Dtype datum_element;
-  int top_index, data_index;
-  for (int c = 0; c < datum_channels; ++c) {
-    for (int h = 0; h < height; ++h) {
-      for (int w = 0; w < width; ++w) {
-        data_index = (c * datum_height + h_off + h) * datum_width + w_off + w;
-        if (do_mirror) {
-          top_index = (c * height + h) * width + (width - 1 - w);
-        } else {
-          top_index = (c * height + h) * width + w;
-        }
-        if (has_uint8) {
-          datum_element =
-            static_cast<Dtype>(static_cast<uint8_t>(data[data_index]));
-        } else {
-          datum_element = datum.float_data(data_index);
-        }
-        if (has_mean_file) {
-          transformed_data[top_index] =
-            (datum_element - mean[data_index]) * scale;
-        } else {
-          if (has_mean_values) {
-            transformed_data[top_index] =
-              (datum_element - mean_values_[c]) * scale;
-          } else {
-            transformed_data[top_index] = datum_element * scale;
-          }
-        }
-      }
-    }
-  }
+	Dtype* transformed_data) {
+	const string& data = datum.data();
+	const int datum_channels = datum.channels();
+	const int datum_height = datum.height();
+	const int datum_width = datum.width();
+
+	const int crop_size = param_.crop_size();
+	const Dtype scale = param_.scale();
+	const bool do_mirror = param_.mirror() && Rand(2);
+	const bool has_mean_file = param_.has_mean_file();
+	const bool has_uint8 = data.size() > 0;
+	const bool has_mean_values = mean_values_.size() > 0;
+
+	CHECK_GT(datum_channels, 0);
+	CHECK_GE(datum_height, crop_size);
+	CHECK_GE(datum_width, crop_size);
+
+	Dtype* mean = NULL;
+	if (has_mean_file) {
+		CHECK_EQ(datum_channels, data_mean_.channels());
+		CHECK_EQ(datum_height, data_mean_.height());
+		CHECK_EQ(datum_width, data_mean_.width());
+		mean = data_mean_.mutable_cpu_data();
+	}
+	if (has_mean_values) {
+		CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) <<
+			"Specify either 1 mean_value or as many as channels: " << datum_channels;
+		if (datum_channels > 1 && mean_values_.size() == 1) {
+			// Replicate the mean_value for simplicity
+			for (int c = 1; c < datum_channels; ++c) {
+				mean_values_.push_back(mean_values_[0]);
+			}
+		}
+	}
+
+	int height = datum_height;
+	int width = datum_width;
+
+	int h_off = 0;
+	int w_off = 0;
+	if (crop_size) {
+		height = crop_size;
+		width = crop_size;
+		// We only do random crop when we do training.
+		if (phase_ == TRAIN) {
+			h_off = Rand(datum_height - crop_size + 1);
+			w_off = Rand(datum_width - crop_size + 1);
+		} else {
+			h_off = (datum_height - crop_size) / 2;
+			w_off = (datum_width - crop_size) / 2;
+		}
+	}
+
+	Dtype datum_element;
+	int top_index, data_index;
+	for (int c = 0; c < datum_channels; ++c) {
+		for (int h = 0; h < height; ++h) {
+			for (int w = 0; w < width; ++w) {
+				data_index = (c * datum_height + h_off + h) * datum_width + w_off + w;
+				if (do_mirror) {
+					top_index = (c * height + h) * width + (width - 1 - w);
+				} else {
+					top_index = (c * height + h) * width + w;
+				}
+				if (has_uint8) {
+					datum_element =
+						static_cast<Dtype>(static_cast<uint8_t>(data[data_index]));
+				} else {
+					datum_element = datum.float_data(data_index);
+				}
+				if (has_mean_file) {
+					transformed_data[top_index] =
+						(datum_element - mean[data_index]) * scale;
+				} else {
+					if (has_mean_values) {
+						transformed_data[top_index] =
+							(datum_element - mean_values_[c]) * scale;
+					} else {
+						transformed_data[top_index] = datum_element * scale;
+					}
+				}
+			}
+		}
+	}
 }
 
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
-                                       Blob<Dtype>* transformed_blob) {
-
-  // If datum is encoded, decoded and transform the cv::image.
-  if (datum.encoded()) {
-    CHECK(!(param_.force_color() && param_.force_gray()))
-        << "cannot set both force_color and force_gray";
-    cv::Mat cv_img;
-    if (param_.force_color() || param_.force_gray()) {
-    // If force_color then decode in color otherwise decode in gray.
-      cv_img = DecodeDatumToCVMat(datum, param_.force_color());
-    } else {
-      cv_img = DecodeDatumToCVMatNative(datum);
-    }
-    // Transform the cv::image into blob.
-    return Transform(cv_img, transformed_blob);
-  } else {
-    if (param_.force_color() || param_.force_gray()) {
-      LOG(ERROR) << "force_color and force_gray only for encoded datum";
-    }
-  }
-
-  const int crop_size = param_.crop_size();
-  const int datum_channels = datum.channels();
-  const int datum_height = datum.height();
-  const int datum_width = datum.width();
-
-  // Check dimensions.
-  const int channels = transformed_blob->channels();
-  const int height = transformed_blob->height();
-  const int width = transformed_blob->width();
-  const int num = transformed_blob->num();
-
-  CHECK_EQ(channels, datum_channels);
-  CHECK_LE(height, datum_height);
-  CHECK_LE(width, datum_width);
-  CHECK_GE(num, 1);
-
-  if (crop_size) {
-    CHECK_EQ(crop_size, height);
-    CHECK_EQ(crop_size, width);
-  } else {
-    CHECK_EQ(datum_height, height);
-    CHECK_EQ(datum_width, width);
-  }
-
-  Dtype* transformed_data = transformed_blob->mutable_cpu_data();
-  Transform(datum, transformed_data);
+	Blob<Dtype>* transformed_blob) {
+
+	// If datum is encoded, decoded and transform the cv::image.
+	if (datum.encoded()) {
+		CHECK(!(param_.force_color() && param_.force_gray()))
+			<< "cannot set both force_color and force_gray";
+		cv::Mat cv_img;
+		if (param_.force_color() || param_.force_gray()) {
+			// If force_color then decode in color otherwise decode in gray.
+			cv_img = DecodeDatumToCVMat(datum, param_.force_color());
+		} else {
+			cv_img = DecodeDatumToCVMatNative(datum);
+		}
+		// Transform the cv::image into blob.
+		return Transform(cv_img, transformed_blob);
+	} else {
+		if (param_.force_color() || param_.force_gray()) {
+			LOG(ERROR) << "force_color and force_gray only for encoded datum";
+		}
+	}
+
+	const int crop_size = param_.crop_size();
+	const int datum_channels = datum.channels();
+	const int datum_height = datum.height();
+	const int datum_width = datum.width();
+
+	// Check dimensions.
+	const int channels = transformed_blob->channels();
+	const int height = transformed_blob->height();
+	const int width = transformed_blob->width();
+	const int num = transformed_blob->num();
+
+	CHECK_EQ(channels, datum_channels);
+	CHECK_LE(height, datum_height);
+	CHECK_LE(width, datum_width);
+	CHECK_GE(num, 1);
+
+	if (crop_size) {
+		CHECK_EQ(crop_size, height);
+		CHECK_EQ(crop_size, width);
+	} else {
+		CHECK_EQ(datum_height, height);
+		CHECK_EQ(datum_width, width);
+	}
+
+	Dtype* transformed_data = transformed_blob->mutable_cpu_data();
+	Transform(datum, transformed_data);
 }
 
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const vector<Datum> & datum_vector,
-                                       Blob<Dtype>* transformed_blob) {
-  const int datum_num = datum_vector.size();
-  const int num = transformed_blob->num();
-  const int channels = transformed_blob->channels();
-  const int height = transformed_blob->height();
-  const int width = transformed_blob->width();
-
-  CHECK_GT(datum_num, 0) << "There is no datum to add";
-  CHECK_LE(datum_num, num) <<
-    "The size of datum_vector must be no greater than transformed_blob->num()";
-  Blob<Dtype> uni_blob(1, channels, height, width);
-  for (int item_id = 0; item_id < datum_num; ++item_id) {
-    int offset = transformed_blob->offset(item_id);
-    uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset);
-    Transform(datum_vector[item_id], &uni_blob);
-  }
+	Blob<Dtype>* transformed_blob) {
+	const int datum_num = datum_vector.size();
+	const int num = transformed_blob->num();
+	const int channels = transformed_blob->channels();
+	const int height = transformed_blob->height();
+	const int width = transformed_blob->width();
+
+	CHECK_GT(datum_num, 0) << "There is no datum to add";
+	CHECK_LE(datum_num, num) <<
+		"The size of datum_vector must be no greater than transformed_blob->num()";
+	Blob < Dtype > uni_blob(1, channels, height, width);
+	for (int item_id = 0; item_id < datum_num; ++item_id) {
+		int offset = transformed_blob->offset(item_id);
+		uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset);
+		Transform(datum_vector[item_id], &uni_blob);
+	}
 }
 
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const vector<cv::Mat> & mat_vector,
-                                       Blob<Dtype>* transformed_blob) {
-  const int mat_num = mat_vector.size();
-  const int num = transformed_blob->num();
-  const int channels = transformed_blob->channels();
-  const int height = transformed_blob->height();
-  const int width = transformed_blob->width();
-
-  CHECK_GT(mat_num, 0) << "There is no MAT to add";
-  CHECK_EQ(mat_num, num) <<
-    "The size of mat_vector must be equals to transformed_blob->num()";
-  Blob<Dtype> uni_blob(1, channels, height, width);
-  for (int item_id = 0; item_id < mat_num; ++item_id) {
-    int offset = transformed_blob->offset(item_id);
-    uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset);
-    Transform(mat_vector[item_id], &uni_blob);
-  }
+	Blob<Dtype>* transformed_blob) {
+	const int mat_num = mat_vector.size();
+	const int num = transformed_blob->num();
+	const int channels = transformed_blob->channels();
+	const int height = transformed_blob->height();
+	const int width = transformed_blob->width();
+
+	CHECK_GT(mat_num, 0) << "There is no MAT to add";
+	CHECK_EQ(mat_num, num) <<
+		"The size of mat_vector must be equals to transformed_blob->num()";
+	Blob < Dtype > uni_blob(1, channels, height, width);
+	for (int item_id = 0; item_id < mat_num; ++item_id) {
+		int offset = transformed_blob->offset(item_id);
+		uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset);
+		Transform(mat_vector[item_id], &uni_blob);
+	}
 }
 
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img,
-                                       Blob<Dtype>* transformed_blob) {
-  const int crop_size = param_.crop_size();
-  const int img_channels = cv_img.channels();
-  const int img_height = cv_img.rows;
-  const int img_width = cv_img.cols;
-
-  // Check dimensions.
-  const int channels = transformed_blob->channels();
-  const int height = transformed_blob->height();
-  const int width = transformed_blob->width();
-  const int num = transformed_blob->num();
-
-  CHECK_EQ(channels, img_channels);
-  CHECK_LE(height, img_height);
-  CHECK_LE(width, img_width);
-  CHECK_GE(num, 1);
-
-  CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";
-
-  const Dtype scale = param_.scale();
-  const bool do_mirror = param_.mirror() && Rand(2);
-  const bool has_mean_file = param_.has_mean_file();
-  const bool has_mean_values = mean_values_.size() > 0;
-
-  CHECK_GT(img_channels, 0);
-  CHECK_GE(img_height, crop_size);
-  CHECK_GE(img_width, crop_size);
-
-  Dtype* mean = NULL;
-  if (has_mean_file) {
-    CHECK_EQ(img_channels, data_mean_.channels());
-    CHECK_EQ(img_height, data_mean_.height());
-    CHECK_EQ(img_width, data_mean_.width());
-    mean = data_mean_.mutable_cpu_data();
-  }
-  if (has_mean_values) {
-    CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) <<
-     "Specify either 1 mean_value or as many as channels: " << img_channels;
-    if (img_channels > 1 && mean_values_.size() == 1) {
-      // Replicate the mean_value for simplicity
-      for (int c = 1; c < img_channels; ++c) {
-        mean_values_.push_back(mean_values_[0]);
-      }
-    }
-  }
-
-  int h_off = 0;
-  int w_off = 0;
-  cv::Mat cv_cropped_img = cv_img;
-  if (crop_size) {
-    CHECK_EQ(crop_size, height);
-    CHECK_EQ(crop_size, width);
-    // We only do random crop when we do training.
-    if (phase_ == TRAIN) {
-      h_off = Rand(img_height - crop_size + 1);
-      w_off = Rand(img_width - crop_size + 1);
-    } else {
-      h_off = (img_height - crop_size) / 2;
-      w_off = (img_width - crop_size) / 2;
-    }
-    cv::Rect roi(w_off, h_off, crop_size, crop_size);
-    cv_cropped_img = cv_img(roi);
-  } else {
-    CHECK_EQ(img_height, height);
-    CHECK_EQ(img_width, width);
-  }
-
-  CHECK(cv_cropped_img.data);
-
-  Dtype* transformed_data = transformed_blob->mutable_cpu_data();
-  int top_index;
-  for (int h = 0; h < height; ++h) {
-    const uchar* ptr = cv_cropped_img.ptr<uchar>(h);
-    int img_index = 0;
-    for (int w = 0; w < width; ++w) {
-      for (int c = 0; c < img_channels; ++c) {
-        if (do_mirror) {
-          top_index = (c * height + h) * width + (width - 1 - w);
-        } else {
-          top_index = (c * height + h) * width + w;
-        }
-        // int top_index = (c * height + h) * width + w;
-        Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
-        if (has_mean_file) {
-          int mean_index = (c * img_height + h_off + h) * img_width + w_off + w;
-          transformed_data[top_index] =
-            (pixel - mean[mean_index]) * scale;
-        } else {
-          if (has_mean_values) {
-            transformed_data[top_index] =
-              (pixel - mean_values_[c]) * scale;
-          } else {
-            transformed_data[top_index] = pixel * scale;
-          }
-        }
-      }
-    }
-  }
+	Blob<Dtype>* transformed_blob) {
+	const int crop_size = param_.crop_size();
+	const int img_channels = cv_img.channels();
+	const int img_height = cv_img.rows;
+	const int img_width = cv_img.cols;
+
+	// Check dimensions.
+	const int channels = transformed_blob->channels();
+	const int height = transformed_blob->height();
+	const int width = transformed_blob->width();
+	const int num = transformed_blob->num();
+
+	CHECK_EQ(channels, img_channels);
+	CHECK_LE(height, img_height);
+	CHECK_LE(width, img_width);
+	CHECK_GE(num, 1);
+
+	CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";
+
+	const Dtype scale = param_.scale();
+	const bool do_mirror = param_.mirror() && Rand(2);
+	const bool has_mean_file = param_.has_mean_file();
+	const bool has_mean_values = mean_values_.size() > 0;
+
+	CHECK_GT(img_channels, 0);
+	CHECK_GE(img_height, crop_size);
+	CHECK_GE(img_width, crop_size);
+
+	Dtype* mean = NULL;
+	if (has_mean_file) {
+		CHECK_EQ(img_channels, data_mean_.channels());
+		CHECK_EQ(img_height, data_mean_.height());
+		CHECK_EQ(img_width, data_mean_.width());
+		mean = data_mean_.mutable_cpu_data();
+	}
+	if (has_mean_values) {
+		CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) <<
+			"Specify either 1 mean_value or as many as channels: " << img_channels;
+		if (img_channels > 1 && mean_values_.size() == 1) {
+			// Replicate the mean_value for simplicity
+			for (int c = 1; c < img_channels; ++c) {
+				mean_values_.push_back(mean_values_[0]);
+			}
+		}
+	}
+
+	int h_off = 0;
+	int w_off = 0;
+	cv::Mat cv_cropped_img = cv_img;
+	if (crop_size) {
+		CHECK_EQ(crop_size, height);
+		CHECK_EQ(crop_size, width);
+		// We only do random crop when we do training.
+		if (phase_ == TRAIN) {
+			h_off = Rand(img_height - crop_size + 1);
+			w_off = Rand(img_width - crop_size + 1);
+		} else {
+			h_off = (img_height - crop_size) / 2;
+			w_off = (img_width - crop_size) / 2;
+		}
+		cv::Rect roi(w_off, h_off, crop_size, crop_size);
+		cv_cropped_img = cv_img(roi);
+	} else {
+		CHECK_EQ(img_height, height);
+		CHECK_EQ(img_width, width);
+	}
+
+	CHECK(cv_cropped_img.data);
+
+	Dtype* transformed_data = transformed_blob->mutable_cpu_data();
+	int top_index;
+	for (int h = 0; h < height; ++h) {
+		const uchar* ptr = cv_cropped_img.ptr < uchar > (h);
+		int img_index = 0;
+		for (int w = 0; w < width; ++w) {
+			for (int c = 0; c < img_channels; ++c) {
+				if (do_mirror) {
+					top_index = (c * height + h) * width + (width - 1 - w);
+				} else {
+					top_index = (c * height + h) * width + w;
+				}
+				// int top_index = (c * height + h) * width + w;
+				Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
+				if (has_mean_file) {
+					int mean_index = (c * img_height + h_off + h) * img_width + w_off + w;
+					transformed_data[top_index] =
+						(pixel - mean[mean_index]) * scale;
+				} else {
+					if (has_mean_values) {
+						transformed_data[top_index] =
+							(pixel - mean_values_[c]) * scale;
+					} else {
+						transformed_data[top_index] = pixel * scale;
+					}
+				}
+			}
+		}
+	}
 }
 
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
-                                       Blob<Dtype>* transformed_blob) {
-  const int crop_size = param_.crop_size();
-  const int input_num = input_blob->num();
-  const int input_channels = input_blob->channels();
-  const int input_height = input_blob->height();
-  const int input_width = input_blob->width();
-
-  if (transformed_blob->count() == 0) {
-    // Initialize transformed_blob with the right shape.
-    if (crop_size) {
-      transformed_blob->Reshape(input_num, input_channels,
-                                crop_size, crop_size);
-    } else {
-      transformed_blob->Reshape(input_num, input_channels,
-                                input_height, input_width);
-    }
-  }
-
-  const int num = transformed_blob->num();
-  const int channels = transformed_blob->channels();
-  const int height = transformed_blob->height();
-  const int width = transformed_blob->width();
-  const int size = transformed_blob->count();
-
-  CHECK_LE(input_num, num);
-  CHECK_EQ(input_channels, channels);
-  CHECK_GE(input_height, height);
-  CHECK_GE(input_width, width);
-
-
-  const Dtype scale = param_.scale();
-  const bool do_mirror = param_.mirror() && Rand(2);
-  const bool has_mean_file = param_.has_mean_file();
-  const bool has_mean_values = mean_values_.size() > 0;
-
-  int h_off = 0;
-  int w_off = 0;
-  if (crop_size) {
-    CHECK_EQ(crop_size, height);
-    CHECK_EQ(crop_size, width);
-    // We only do random crop when we do training.
-    if (phase_ == TRAIN) {
-      h_off = Rand(input_height - crop_size + 1);
-      w_off = Rand(input_width - crop_size + 1);
-    } else {
-      h_off = (input_height - crop_size) / 2;
-      w_off = (input_width - crop_size) / 2;
-    }
-  } else {
-    CHECK_EQ(input_height, height);
-    CHECK_EQ(input_width, width);
-  }
-
-  Dtype* input_data = input_blob->mutable_cpu_data();
-  if (has_mean_file) {
-    CHECK_EQ(input_channels, data_mean_.channels());
-    CHECK_EQ(input_height, data_mean_.height());
-    CHECK_EQ(input_width, data_mean_.width());
-    for (int n = 0; n < input_num; ++n) {
-      int offset = input_blob->offset(n);
-      caffe_sub(data_mean_.count(), input_data + offset,
-            data_mean_.cpu_data(), input_data + offset);
-    }
-  }
-
-  if (has_mean_values) {
-    CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) <<
-     "Specify either 1 mean_value or as many as channels: " << input_channels;
-    if (mean_values_.size() == 1) {
-      caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data);
-    } else {
-      for (int n = 0; n < input_num; ++n) {
-        for (int c = 0; c < input_channels; ++c) {
-          int offset = input_blob->offset(n, c);
-          caffe_add_scalar(input_height * input_width, -(mean_values_[c]),
-            input_data + offset);
-        }
-      }
-    }
-  }
-
-  Dtype* transformed_data = transformed_blob->mutable_cpu_data();
-
-  for (int n = 0; n < input_num; ++n) {
-    int top_index_n = n * channels;
-    int data_index_n = n * channels;
-    for (int c = 0; c < channels; ++c) {
-      int top_index_c = (top_index_n + c) * height;
-      int data_index_c = (data_index_n + c) * input_height + h_off;
-      for (int h = 0; h < height; ++h) {
-        int top_index_h = (top_index_c + h) * width;
-        int data_index_h = (data_index_c + h) * input_width + w_off;
-        if (do_mirror) {
-          int top_index_w = top_index_h + width - 1;
-          for (int w = 0; w < width; ++w) {
-            transformed_data[top_index_w-w] = input_data[data_index_h + w];
-          }
-        } else {
-          for (int w = 0; w < width; ++w) {
-            transformed_data[top_index_h + w] = input_data[data_index_h + w];
-          }
-        }
-      }
-    }
-  }
-  if (scale != Dtype(1)) {
-    DLOG(INFO) << "Scale: " << scale;
-    caffe_scal(size, scale, transformed_data);
-  }
+	Blob<Dtype>* transformed_blob) {
+	const int crop_size = param_.crop_size();
+	const int input_num = input_blob->num();
+	const int input_channels = input_blob->channels();
+	const int input_height = input_blob->height();
+	const int input_width = input_blob->width();
+
+	if (transformed_blob->count() == 0) {
+		// Initialize transformed_blob with the right shape.
+		if (crop_size) {
+			transformed_blob->Reshape(input_num, input_channels,
+				crop_size, crop_size);
+		} else {
+			transformed_blob->Reshape(input_num, input_channels,
+				input_height, input_width);
+		}
+	}
+
+	const int num = transformed_blob->num();
+	const int channels = transformed_blob->channels();
+	const int height = transformed_blob->height();
+	const int width = transformed_blob->width();
+	const int size = transformed_blob->count();
+
+	CHECK_LE(input_num, num);
+	CHECK_EQ(input_channels, channels);
+	CHECK_GE(input_height, height);
+	CHECK_GE(input_width, width);
+
+	const Dtype scale = param_.scale();
+	const bool do_mirror = param_.mirror() && Rand(2);
+	const bool has_mean_file = param_.has_mean_file();
+	const bool has_mean_values = mean_values_.size() > 0;
+
+	int h_off = 0;
+	int w_off = 0;
+	if (crop_size) {
+		CHECK_EQ(crop_size, height);
+		CHECK_EQ(crop_size, width);
+		// We only do random crop when we do training.
+		if (phase_ == TRAIN) {
+			h_off = Rand(input_height - crop_size + 1);
+			w_off = Rand(input_width - crop_size + 1);
+		} else {
+			h_off = (input_height - crop_size) / 2;
+			w_off = (input_width - crop_size) / 2;
+		}
+	} else {
+		CHECK_EQ(input_height, height);
+		CHECK_EQ(input_width, width);
+	}
+
+	Dtype* input_data = input_blob->mutable_cpu_data();
+	if (has_mean_file) {
+		CHECK_EQ(input_channels, data_mean_.channels());
+		CHECK_EQ(input_height, data_mean_.height());
+		CHECK_EQ(input_width, data_mean_.width());
+		for (int n = 0; n < input_num; ++n) {
+			int offset = input_blob->offset(n);
+			caffe_sub(data_mean_.count(), input_data + offset,
+				data_mean_.cpu_data(), input_data + offset);
+		}
+	}
+
+	if (has_mean_values) {
+		CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) <<
+			"Specify either 1 mean_value or as many as channels: " << input_channels;
+		if (mean_values_.size() == 1) {
+			caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data);
+		} else {
+			for (int n = 0; n < input_num; ++n) {
+				for (int c = 0; c < input_channels; ++c) {
+					int offset = input_blob->offset(n, c);
+					caffe_add_scalar(input_height * input_width, -(mean_values_[c]),
+						input_data + offset);
+				}
+			}
+		}
+	}
+
+	Dtype* transformed_data = transformed_blob->mutable_cpu_data();
+
+	for (int n = 0; n < input_num; ++n) {
+		int top_index_n = n * channels;
+		int data_index_n = n * channels;
+		for (int c = 0; c < channels; ++c) {
+			int top_index_c = (top_index_n + c) * height;
+			int data_index_c = (data_index_n + c) * input_height + h_off;
+			for (int h = 0; h < height; ++h) {
+				int top_index_h = (top_index_c + h) * width;
+				int data_index_h = (data_index_c + h) * input_width + w_off;
+				if (do_mirror) {
+					int top_index_w = top_index_h + width - 1;
+					for (int w = 0; w < width; ++w) {
+						transformed_data[top_index_w - w] = input_data[data_index_h + w];
+					}
+				} else {
+					for (int w = 0; w < width; ++w) {
+						transformed_data[top_index_h + w] = input_data[data_index_h + w];
+					}
+				}
+			}
+		}
+	}
+	if (scale != Dtype(1)) {
+		DLOG(INFO) << "Scale: " << scale;
+		caffe_scal(size, scale, transformed_data);
+	}
 }
 
 template<typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(const Datum& datum) {
-  if (datum.encoded()) {
-    CHECK(!(param_.force_color() && param_.force_gray()))
-        << "cannot set both force_color and force_gray";
-    cv::Mat cv_img;
-    if (param_.force_color() || param_.force_gray()) {
-    // If force_color then decode in color otherwise decode in gray.
-      cv_img = DecodeDatumToCVMat(datum, param_.force_color());
-    } else {
-      cv_img = DecodeDatumToCVMatNative(datum);
-    }
-    // InferBlobShape using the cv::image.
-    return InferBlobShape(cv_img);
-  }
-
-  const int crop_size = param_.crop_size();
-  const int datum_channels = datum.channels();
-  const int datum_height = datum.height();
-  const int datum_width = datum.width();
-  // Check dimensions.
-  CHECK_GT(datum_channels, 0);
-  CHECK_GE(datum_height, crop_size);
-  CHECK_GE(datum_width, crop_size);
-  // Build BlobShape.
-  vector<int> shape(4);
-  shape[0] = 1;
-  shape[1] = datum_channels;
-  shape[2] = (crop_size)? crop_size: datum_height;
-  shape[3] = (crop_size)? crop_size: datum_width;
-  return shape;
+	if (datum.encoded()) {
+		CHECK(!(param_.force_color() && param_.force_gray()))
+			<< "cannot set both force_color and force_gray";
+		cv::Mat cv_img;
+		if (param_.force_color() || param_.force_gray()) {
+			// If force_color then decode in color otherwise decode in gray.
+			cv_img = DecodeDatumToCVMat(datum, param_.force_color());
+		} else {
+			cv_img = DecodeDatumToCVMatNative(datum);
+		}
+		// InferBlobShape using the cv::image.
+		return InferBlobShape(cv_img);
+	}
+
+	const int crop_size = param_.crop_size();
+	const int datum_channels = datum.channels();
+	const int datum_height = datum.height();
+	const int datum_width = datum.width();
+	// Check dimensions.
+	CHECK_GT(datum_channels, 0);
+	CHECK_GE(datum_height, crop_size);
+	CHECK_GE(datum_width, crop_size);
+	// Build BlobShape.
+	vector<int> shape(4);
+	shape[0] = 1;
+	shape[1] = datum_channels;
+	shape[2] = (crop_size) ? crop_size : datum_height;
+	shape[3] = (crop_size) ? crop_size : datum_width;
+	return shape;
 }
 
 template<typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(
-    const vector<Datum> & datum_vector) {
-  const int num = datum_vector.size();
-  CHECK_GT(num, 0) << "There is no datum to in the vector";
-  // Use first datum in the vector to InferBlobShape.
-  vector<int> shape = InferBlobShape(datum_vector[0]);
-  // Adjust num to the size of the vector.
-  shape[0] = num;
-  return shape;
+	const vector<Datum> & datum_vector) {
+	const int num = datum_vector.size();
+	CHECK_GT(num, 0) << "There is no datum to in the vector";
+	// Use first datum in the vector to InferBlobShape.
+	vector<int> shape = InferBlobShape(datum_vector[0]);
+	// Adjust num to the size of the vector.
+	shape[0] = num;
+	return shape;
 }
 
 template<typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(const cv::Mat& cv_img) {
-  const int crop_size = param_.crop_size();
-  const int img_channels = cv_img.channels();
-  const int img_height = cv_img.rows;
-  const int img_width = cv_img.cols;
-  // Check dimensions.
-  CHECK_GT(img_channels, 0);
-  CHECK_GE(img_height, crop_size);
-  CHECK_GE(img_width, crop_size);
-  // Build BlobShape.
-  vector<int> shape(4);
-  shape[0] = 1;
-  shape[1] = img_channels;
-  shape[2] = (crop_size)? crop_size: img_height;
-  shape[3] = (crop_size)? crop_size: img_width;
-  return shape;
+	const int crop_size = param_.crop_size();
+	const int img_channels = cv_img.channels();
+	const int img_height = cv_img.rows;
+	const int img_width = cv_img.cols;
+	// Check dimensions.
+	CHECK_GT(img_channels, 0);
+	CHECK_GE(img_height, crop_size);
+	CHECK_GE(img_width, crop_size);
+	// Build BlobShape.
+	vector<int> shape(4);
+	shape[0] = 1;
+	shape[1] = img_channels;
+	shape[2] = (crop_size) ? crop_size : img_height;
+	shape[3] = (crop_size) ? crop_size : img_width;
+	return shape;
 }
 
 template<typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(
-    const vector<cv::Mat> & mat_vector) {
-  const int num = mat_vector.size();
-  CHECK_GT(num, 0) << "There is no cv_img to in the vector";
-  // Use first cv_img in the vector to InferBlobShape.
-  vector<int> shape = InferBlobShape(mat_vector[0]);
-  // Adjust num to the size of the vector.
-  shape[0] = num;
-  return shape;
+	const vector<cv::Mat> & mat_vector) {
+	const int num = mat_vector.size();
+	CHECK_GT(num, 0) << "There is no cv_img to in the vector";
+	// Use first cv_img in the vector to InferBlobShape.
+	vector<int> shape = InferBlobShape(mat_vector[0]);
+	// Adjust num to the size of the vector.
+	shape[0] = num;
+	return shape;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DataTransformer<Dtype>::InitRand() {
-  const bool needs_rand = param_.mirror() ||
-      (phase_ == TRAIN && param_.crop_size());
-  if (needs_rand) {
-    const unsigned int rng_seed = caffe_rng_rand();
-    rng_.reset(new Caffe::RNG(rng_seed));
-  } else {
-    rng_.reset();
-  }
+	const bool needs_rand = param_.mirror() ||
+		(phase_ == TRAIN && param_.crop_size());
+	if (needs_rand) {
+		const unsigned int rng_seed = caffe_rng_rand();
+		rng_.reset(new Caffe::RNG(rng_seed));
+	} else {
+		rng_.reset();
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 int DataTransformer<Dtype>::Rand(int n) {
-  CHECK(rng_);
-  CHECK_GT(n, 0);
-  caffe::rng_t* rng =
-      static_cast<caffe::rng_t*>(rng_->generator());
-  return ((*rng)() % n);
+	CHECK (rng_);
+	CHECK_GT(n, 0);
+	caffe::rng_t* rng =
+		static_cast<caffe::rng_t*>(rng_->generator());
+	return ((*rng)() % n);
 }
 
-INSTANTIATE_CLASS(DataTransformer);
+INSTANTIATE_CLASS (DataTransformer);
 
 }  // namespace caffe
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 7e745410..689f706e 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -37,378 +37,415 @@ string buildOption = "-x clc++ ";
 std::string oclKernelPath = "./src/caffe/ocl/";
 Device amdDevice;
 
-Device::~Device(){
-    ReleaseKernels(); 
-    free((void*)platformIDs);
-    free(DeviceIDs);
-    clReleaseProgram(Program);
-    clReleaseCommandQueue(CommandQueue);
-    clReleaseCommandQueue(CommandQueue_helper);
-    clReleaseContext(Context);
-    LOG(INFO) << "device destructor";
+Device::~Device() {
+	ReleaseKernels();
+	free((void*) platformIDs);
+	free (DeviceIDs);
+	clReleaseProgram (Program);
+	clReleaseCommandQueue (CommandQueue);
+	clReleaseCommandQueue (CommandQueue_helper);
+	clReleaseContext (Context);
+	LOG(INFO) << "device destructor";
 }
 
-
-cl_int Device::Init(int deviceId){
-
-    DisplayPlatformInfo();
-  
-    clGetPlatformIDs(0, NULL, &numPlatforms);
-    cl_platform_id PlatformIDs[numPlatforms];
-    clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
-    
-    size_t nameLen;
-    cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen);
-    if(res != CL_SUCCESS){
-        fprintf(stderr, "Err: Failed to Get Platform Info\n");
-        return 0;
-    }
-    platformName[nameLen] = 0;
-
-    GetDeviceInfo();
-    cl_uint uiNumDevices;
-    cl_bool unified_memory = false;
-    clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
-    uiNumDevices = numDevices;
-    if(0 == uiNumDevices){
-        LOG(FATAL) << "Err: No GPU devices";
-    } else {
-        pDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
-        OCL_CHECK(clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices, &uiNumDevices));
-        if (deviceId == -1) { 
-            int i;
-	    for (i = 0; i < (int)uiNumDevices; i++){
-                clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &unified_memory, NULL);
-                if(!unified_memory) { //skip iGPU
-                    //we pick the first dGPU we found
-                    pDevices[0] = pDevices[i];
-                    device_id = i;
-                    LOG(INFO) << "Picked default device type : dGPU "<<device_id;
-                    break;
-                }
-            }
-	    if (i == uiNumDevices) {
-                LOG(FATAL) << "Cannot find any dGPU! ";
-            }
-        } else if (deviceId >=0 && deviceId < uiNumDevices){
-            pDevices[0] = pDevices[deviceId];
-            device_id = deviceId;
-            LOG(INFO) << "Picked device type : GPU "<<device_id;
-        } else {
-            LOG(FATAL) << "  Invalid GPU deviceId! ";
-        }
-   }
-
-    Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
-    if(NULL == Context){
-        fprintf(stderr,"Err: Failed to Create Context\n");
-        return 0;
-    }
-    CommandQueue = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
-    CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], CL_QUEUE_PROFILING_ENABLE, NULL);
-    if(NULL == CommandQueue || NULL == CommandQueue_helper){
-        fprintf(stderr,"Err: Failed to Create Commandqueue\n");
-        return 0;
-    }
-    BuildProgram(oclKernelPath);
-    row = clblasRowMajor;
-    col = clblasColumnMajor;
-    return 0;
+cl_int Device::Init(int deviceId) {
+
+	DisplayPlatformInfo();
+
+	clGetPlatformIDs(0, NULL, &numPlatforms);
+	cl_platform_id PlatformIDs[numPlatforms];
+	clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
+
+	size_t nameLen;
+	cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64,
+		platformName, &nameLen);
+	if (res != CL_SUCCESS) {
+		fprintf(stderr, "Err: Failed to Get Platform Info\n");
+		return 0;
+	}
+	platformName[nameLen] = 0;
+
+	GetDeviceInfo();
+	cl_uint uiNumDevices;
+	cl_bool unified_memory = false;
+	clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+	uiNumDevices = numDevices;
+	if (0 == uiNumDevices) {
+		LOG(FATAL) << "Err: No GPU devices";
+	} else {
+		pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id));
+		OCL_CHECK(
+			clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices,
+				&uiNumDevices));
+		if (deviceId == -1) {
+			int i;
+			for (i = 0; i < (int) uiNumDevices; i++) {
+				clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY,
+					sizeof(cl_bool), &unified_memory, NULL);
+				if (!unified_memory) { //skip iGPU
+					//we pick the first dGPU we found
+					pDevices[0] = pDevices[i];
+					device_id = i;
+					LOG(INFO) << "Picked default device type : dGPU " << device_id;
+					break;
+				}
+			}
+			if (i == uiNumDevices) {
+				LOG(FATAL) << "Cannot find any dGPU! ";
+			}
+		} else if (deviceId >= 0 && deviceId < uiNumDevices) {
+			pDevices[0] = pDevices[deviceId];
+			device_id = deviceId;
+			LOG(INFO) << "Picked device type : GPU " << device_id;
+		} else {
+			LOG(FATAL) << "  Invalid GPU deviceId! ";
+		}
+	}
+
+	Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
+	if (NULL == Context) {
+		fprintf(stderr, "Err: Failed to Create Context\n");
+		return 0;
+	}
+	CommandQueue = clCreateCommandQueue(Context, pDevices[0],
+		CL_QUEUE_PROFILING_ENABLE, NULL);
+	CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0],
+		CL_QUEUE_PROFILING_ENABLE, NULL);
+	if (NULL == CommandQueue || NULL == CommandQueue_helper) {
+		fprintf(stderr, "Err: Failed to Create Commandqueue\n");
+		return 0;
+	}
+	BuildProgram (oclKernelPath);
+	row = clblasRowMajor;
+	col = clblasColumnMajor;
+	return 0;
 }
 
 void Device::BuildProgram(std::string kernel_dir)
-{ 
-    std::string strSource = "";
-    DIR *ocl_dir;
-    struct dirent *dirp;
-    if((ocl_dir=opendir(kernel_dir.c_str())) == NULL)
-    {
-        fprintf(stderr,"Err: Open ocl dir failed!\n");
-    }
-    while((dirp = readdir(ocl_dir)) != NULL)
-    {  
-        //Ignore hidden files
-        if(dirp->d_name[0] == '.') continue;
-        std::string file_name = std::string(dirp->d_name);
-        //Skip non *.cl files
-        size_t last_dot_pos = file_name.find_last_of(".");
-        if(file_name.substr(last_dot_pos+1) != "cl") continue;
-
-        std::string ocl_kernel_full_path=kernel_dir+file_name;
-        std::string tmpSource = "";
-        ConvertToString(ocl_kernel_full_path.c_str(), tmpSource);
-        strSource += tmpSource;
-    }
-    const char *pSource;
-    pSource = strSource.c_str();
-    size_t uiArrSourceSize[] = {0};
-    uiArrSourceSize[0] = strlen(pSource);
-    Program = NULL;
-    Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, NULL);
-    if(NULL == Program){
-        fprintf(stderr,"Err: Failed to create program\n");
-    }
-    cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), NULL, NULL);
-    LOG(INFO) << "Build Program";
-    if(CL_SUCCESS != iStatus){
-        fprintf(stderr,"Err: Failed to build program\n");
-        char szBuildLog[16384];
-        clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, sizeof(szBuildLog), szBuildLog, NULL);
-        std::cout << szBuildLog;
-        clReleaseProgram(Program);
-    }
+	{
+	std::string strSource = "";
+	DIR *ocl_dir;
+	struct dirent *dirp;
+	if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL)
+		{
+		fprintf(stderr, "Err: Open ocl dir failed!\n");
+	}
+	while ((dirp = readdir(ocl_dir)) != NULL)
+	{
+		//Ignore hidden files
+		if (dirp->d_name[0] == '.')
+			continue;
+		std::string file_name = std::string(dirp->d_name);
+		//Skip non *.cl files
+		size_t last_dot_pos = file_name.find_last_of(".");
+		if (file_name.substr(last_dot_pos + 1) != "cl")
+			continue;
+
+		std::string ocl_kernel_full_path = kernel_dir + file_name;
+		std::string tmpSource = "";
+		ConvertToString(ocl_kernel_full_path.c_str(), tmpSource);
+		strSource += tmpSource;
+	}
+	const char *pSource;
+	pSource = strSource.c_str();
+	size_t uiArrSourceSize[] = { 0 };
+	uiArrSourceSize[0] = strlen(pSource);
+	Program = NULL;
+	Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize,
+		NULL);
+	if (NULL == Program) {
+		fprintf(stderr, "Err: Failed to create program\n");
+	}
+	cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(),
+		NULL, NULL);
+	LOG(INFO) << "Build Program";
+	if (CL_SUCCESS != iStatus) {
+		fprintf(stderr, "Err: Failed to build program\n");
+		char szBuildLog[16384];
+		clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG,
+			sizeof(szBuildLog), szBuildLog, NULL);
+		std::cout << szBuildLog;
+		clReleaseProgram (Program);
+	}
 }
 
 //Use to read OpenCL source code
-cl_int Device::ConvertToString(std::string pFileName,std::string &Str){
-    size_t uiSize=0;
-    size_t uiFileSize=0;
-    char *pStr=NULL;
-    char *tmp = (char*)pFileName.data();
-    std::fstream fFile(tmp,(std::fstream::in|std::fstream::binary));
-    if(fFile.is_open()){
-        fFile.seekg(0,std::fstream::end);
-        uiSize=uiFileSize=(size_t)fFile.tellg();
-        fFile.seekg(0,std::fstream::beg);
-        pStr=new char[uiSize+1];
-
-        if(NULL==pStr){
-            fFile.close();
-            return 0;
-        }
-        fFile.read(pStr,uiFileSize);
-        fFile.close();
-        pStr[uiSize]='\0';
-        Str=pStr;
-        delete[] pStr;
-        return 0;
-    }
-    LOG(ERROR) << "Err: Failed to open cl file!";
-    return -1;
+cl_int Device::ConvertToString(std::string pFileName, std::string &Str) {
+	size_t uiSize = 0;
+	size_t uiFileSize = 0;
+	char *pStr = NULL;
+	char *tmp = (char*) pFileName.data();
+	std::fstream fFile(tmp, (std::fstream::in | std::fstream::binary));
+	if (fFile.is_open()) {
+		fFile.seekg(0, std::fstream::end);
+		uiSize = uiFileSize = (size_t) fFile.tellg();
+		fFile.seekg(0, std::fstream::beg);
+		pStr = new char[uiSize + 1];
+
+		if (NULL == pStr) {
+			fFile.close();
+			return 0;
+		}
+		fFile.read(pStr, uiFileSize);
+		fFile.close();
+		pStr[uiSize] = '\0';
+		Str = pStr;
+		delete[] pStr;
+		return 0;
+	}
+	LOG(ERROR) << "Err: Failed to open cl file!";
+	return -1;
 }
 
 cl_kernel Device::GetKernel(std::string kernel_name)
-{
-    std::map<std::string, cl_kernel>::iterator it = Kernels.find(kernel_name);
-    if (it == Kernels.end())
-    {
-        cl_int _err=0;
-        cl_kernel kernel = clCreateKernel(Program,kernel_name.c_str(),&_err);
-        OCL_CHECK(_err);
-        Kernels[kernel_name] = kernel;
-    }
-    return Kernels[kernel_name];
+	{
+	std::map<std::string, cl_kernel>::iterator it = Kernels.find(kernel_name);
+	if (it == Kernels.end())
+		{
+		cl_int _err = 0;
+		cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err);
+		OCL_CHECK(_err);
+		Kernels[kernel_name] = kernel;
+	}
+	return Kernels[kernel_name];
 }
 
 void Device::ReleaseKernels()
 {
-    std::map<std::string, cl_kernel>::iterator it;
-    for (it = Kernels.begin(); it != Kernels.end(); it++)
-    {
-        clReleaseKernel(it->second);
-    }
+	std::map<std::string, cl_kernel>::iterator it;
+	for (it = Kernels.begin(); it != Kernels.end(); it++)
+		{
+		clReleaseKernel(it->second);
+	}
 }
 
-void Device::DisplayPlatformInfo(){
-   cl_int err;
-
-   err = clGetPlatformIDs (0, NULL, &numPlatforms);
-   if (err != CL_SUCCESS || numPlatforms <=0)
-   {
-      LOG(ERROR) << "Failed to find any OpenCL platform.";
-      return;
-   }
-
-   platformIDs = (cl_platform_id *) malloc (sizeof(cl_platform_id) * numPlatforms);
-   err = clGetPlatformIDs (numPlatforms, platformIDs, NULL);
-   if(err != CL_SUCCESS)
-   {
-      LOG(ERROR) << "Failed to find any OpenCL platform.";
-      return;
-   }
-
-   LOG(INFO) << "Number of platforms found:" << numPlatforms;
-
-  //iterate through the list of platforms displaying platform information
-  for (cl_uint i = 0; i < numPlatforms; i++ ){
-      DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME");
-      DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE");
-      DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION");
-      DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR");
-      DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS");
-  }
-   
+void Device::DisplayPlatformInfo() {
+	cl_int err;
+
+	err = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if (err != CL_SUCCESS || numPlatforms <= 0)
+		{
+		LOG(ERROR) << "Failed to find any OpenCL platform.";
+		return;
+	}
+
+	platformIDs = (cl_platform_id *) malloc(
+		sizeof(cl_platform_id) * numPlatforms);
+	err = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
+	if (err != CL_SUCCESS)
+		{
+		LOG(ERROR) << "Failed to find any OpenCL platform.";
+		return;
+	}
+
+	LOG(INFO) << "Number of platforms found:" << numPlatforms;
+
+	//iterate through the list of platforms displaying platform information
+	for (cl_uint i = 0; i < numPlatforms; i++) {
+		DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME");
+		DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE");
+		DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION");
+		DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR");
+		DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS,
+			"CL_PLATFORM_EXTENSIONS");
+	}
+
 }
 
-void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str){
-    cl_int err;
-    std::size_t paramValueSize;
-
-    err = clGetPlatformInfo(id, name, 0, NULL, &paramValueSize);  
-   if(err != CL_SUCCESS)
-   {
-      LOG(ERROR) << "Failed to find OpenCL platform:" << str;
-      return;
-   }
-   
-   char * info = (char *) alloca (sizeof(char) * paramValueSize);
-   err = clGetPlatformInfo(id, name, paramValueSize, info, NULL);
-   if(err != CL_SUCCESS)
-   {
-      LOG(ERROR) << "Failed to find OpenCL platform:" << str;
-      return;
-   }
-
-   LOG(INFO) << "\t" << str << "\t" << info;
+void Device::DisplayInfo(cl_platform_id id, cl_platform_info name,
+	std::string str) {
+	cl_int err;
+	std::size_t paramValueSize;
+
+	err = clGetPlatformInfo(id, name, 0, NULL, &paramValueSize);
+	if (err != CL_SUCCESS)
+		{
+		LOG(ERROR) << "Failed to find OpenCL platform:" << str;
+		return;
+	}
+
+	char * info = (char *) alloca(sizeof(char) * paramValueSize);
+	err = clGetPlatformInfo(id, name, paramValueSize, info, NULL);
+	if (err != CL_SUCCESS)
+		{
+		LOG(ERROR) << "Failed to find OpenCL platform:" << str;
+		return;
+	}
+
+	LOG(INFO) << "\t" << str << "\t" << info;
 }
 
-void Device::GetDeviceInfo(){
-    cl_int err;
-    //by default, we select the first platform. can be extended for more platforms
-    //query GPU device for now
-    err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
-    // we allow program run if no GPU is found. Just return. No error reported.
-    if (numDevices < 1)
-    {
-        LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0];
-        LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0];
-        return;
-    }
-    
-    DeviceIDs = (cl_device_id *) malloc (sizeof(cl_device_id) * numDevices);
-    err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, DeviceIDs, NULL);
-    if (err != CL_SUCCESS)
-    {
-        LOG(INFO) << "Failed to find any GPU devices.";
-        return;
-    }
-
-    LOG(INFO) << "Number of devices found:" << numDevices;
-    for (cl_uint i = 0; i < numDevices; i++) {
-        LOG(INFO) << "\t" << "DeviceID" << ":\t" <<DeviceIDs[i];
-        DisplayDeviceInfo<cl_device_type>(DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
-        DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
-        DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
-        DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
-        DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
-        DisplayDeviceInfo<cl_bool>(DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
-        DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
-        DisplayDeviceInfo<size_t>(DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
-        DisplayDeviceInfo<cl_uint>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
-        DisplayDeviceInfo<size_t *>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "Max work item sizes");
-        DisplayDeviceInfo<cl_command_queue_properties>(DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
-        DisplayDeviceInfo<cl_device_exec_capabilities>(DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
-        DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
-        DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
-        DisplayDeviceInfo<cl_ulong>(DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
-    }
-    
-    
+void Device::GetDeviceInfo() {
+	cl_int err;
+	//by default, we select the first platform. can be extended for more platforms
+	//query GPU device for now
+	err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL,
+		&numDevices);
+	// we allow program run if no GPU is found. Just return. No error reported.
+	if (numDevices < 1)
+		{
+		LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0];
+		LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0];
+		return;
+	}
+
+	DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices);
+	err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices,
+		DeviceIDs, NULL);
+	if (err != CL_SUCCESS)
+		{
+		LOG(INFO) << "Failed to find any GPU devices.";
+		return;
+	}
+
+	LOG(INFO) << "Number of devices found:" << numDevices;
+	for (cl_uint i = 0; i < numDevices; i++) {
+		LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i];
+		DisplayDeviceInfo < cl_device_type
+			> (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
+		DisplayDeviceInfo < cl_bool
+			> (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
+		DisplayDeviceInfo < cl_uint
+			> (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
+		DisplayDeviceInfo < cl_bool
+			> (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
+		DisplayDeviceInfo < cl_bool
+			> (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
+		DisplayDeviceInfo < cl_bool
+			> (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
+		DisplayDeviceInfo < cl_uint
+			> (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
+		DisplayDeviceInfo < size_t
+			> (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
+		DisplayDeviceInfo < cl_uint
+			> (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
+		DisplayDeviceInfo<size_t *>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES,
+			"Max work item sizes");
+		DisplayDeviceInfo < cl_command_queue_properties
+			> (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
+		DisplayDeviceInfo < cl_device_exec_capabilities
+			> (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
+		DisplayDeviceInfo < cl_ulong
+			> (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
+		DisplayDeviceInfo < cl_ulong
+			> (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
+		DisplayDeviceInfo < cl_ulong
+			> (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
+	}
+
 }
 
 void Device::DeviceQuery()
 {
-    DisplayPlatformInfo();
-
-    clGetPlatformIDs(0, NULL, &numPlatforms);
-    cl_platform_id PlatformIDs[numPlatforms];
-    clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
-
-    size_t nameLen;
-    cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, platformName, &nameLen);
-    if (res != CL_SUCCESS) {
-        fprintf(stderr, "Err: Failed to Get Platform Info\n");
-        return;
-    }
-    platformName[nameLen] = 0;
-
-    GetDeviceInfo();
+	DisplayPlatformInfo();
+
+	clGetPlatformIDs(0, NULL, &numPlatforms);
+	cl_platform_id PlatformIDs[numPlatforms];
+	clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
+
+	size_t nameLen;
+	cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64,
+		platformName, &nameLen);
+	if (res != CL_SUCCESS) {
+		fprintf(stderr, "Err: Failed to Get Platform Info\n");
+		return;
+	}
+	platformName[nameLen] = 0;
+
+	GetDeviceInfo();
 }
 
-template <typename T>
-void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, std::string str){
-    cl_int err;
-    std::size_t paramValueSize;
-
-    err = clGetDeviceInfo(id, name, 0, NULL, &paramValueSize);  
-   if(err != CL_SUCCESS)
-   {
-      LOG(ERROR) << "Failed to find OpenCL device info:" << str;
-      return;
-   }
-  
-   std::string content; 
-   T * info = (T *) alloca (sizeof(T) * paramValueSize);
-   err = clGetDeviceInfo(id, name, paramValueSize, info, NULL);
-   if(err != CL_SUCCESS)
-   {
-      LOG(ERROR) << "Failed to find OpenCL device info:" << str;
-      return;
-   }
-
-
-   switch(name){
-    case CL_DEVICE_TYPE:
-    {
-        std::string deviceType;
-        appendBitfield<cl_device_type>(
-        *(reinterpret_cast<cl_device_type*>(info)),CL_DEVICE_TYPE_CPU,"CL_DEVICE_TYPE_CPU",deviceType);
-
-        appendBitfield<cl_device_type>(
-        *(reinterpret_cast<cl_device_type*>(info)),CL_DEVICE_TYPE_GPU,"CL_DEVICE_TYPE_GPU",deviceType);
-
-        appendBitfield<cl_device_type>(
-        *(reinterpret_cast < cl_device_type*>(info)),CL_DEVICE_TYPE_ACCELERATOR,"CL_DEVICE_TYPE_ACCELERATOR",deviceType);
-
-        appendBitfield<cl_device_type>(
-        *(reinterpret_cast < cl_device_type*>(info)),CL_DEVICE_TYPE_DEFAULT,"CL_DEVICE_TYPE_DEFAULT",deviceType);
-        
-	LOG(INFO) << "\t " << str << ":\t" << deviceType;
-    }
-        break;
-    case CL_DEVICE_EXECUTION_CAPABILITIES:
-    {
-        std::string memType;
-        appendBitfield<cl_device_exec_capabilities>(
-        *(reinterpret_cast<cl_device_exec_capabilities*>(info)),CL_EXEC_KERNEL,"CL_EXEC_KERNEL",memType);
-
-        appendBitfield<cl_device_exec_capabilities>(
-        *(reinterpret_cast<cl_device_exec_capabilities*>(info)),CL_EXEC_NATIVE_KERNEL,"CL_EXEC_NATIVE_KERNEL",memType);
-
-        LOG(INFO) << "\t " << str << ":\t" << memType;
-
-    }
-       break;
-    case CL_DEVICE_QUEUE_PROPERTIES:
-        {
-            std::string memType;
-            appendBitfield<cl_device_exec_capabilities>(*(reinterpret_cast<cl_device_exec_capabilities*>(info)),CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,"CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE",memType);
-
-            appendBitfield<cl_device_exec_capabilities>(*(reinterpret_cast<cl_device_exec_capabilities*>(info)),CL_QUEUE_PROFILING_ENABLE,"CL_QUEUE_PROFILING_ENABLE",memType);
-
-            LOG(INFO) << "\t " << str << ":\t" << memType;
-        }
-        break;
-    default:
-        LOG(INFO) << "\t" << str << ":\t" << *info;
-        break;
-}
+template<typename T>
+void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name,
+	std::string str) {
+	cl_int err;
+	std::size_t paramValueSize;
+
+	err = clGetDeviceInfo(id, name, 0, NULL, &paramValueSize);
+	if (err != CL_SUCCESS)
+		{
+		LOG(ERROR) << "Failed to find OpenCL device info:" << str;
+		return;
+	}
+
+	std::string content;
+	T * info = (T *) alloca(sizeof(T) * paramValueSize);
+	err = clGetDeviceInfo(id, name, paramValueSize, info, NULL);
+	if (err != CL_SUCCESS)
+		{
+		LOG(ERROR) << "Failed to find OpenCL device info:" << str;
+		return;
+	}
+
+	switch (name) {
+		case CL_DEVICE_TYPE:
+			{
+			std::string deviceType;
+			appendBitfield < cl_device_type
+				> (
+				*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType);
+
+			appendBitfield < cl_device_type
+				> (
+				*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType);
+
+			appendBitfield < cl_device_type
+				> (
+				*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType);
+
+			appendBitfield < cl_device_type
+				> (
+				*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType);
+
+			LOG(INFO) << "\t " << str << ":\t" << deviceType;
+		}
+			break;
+		case CL_DEVICE_EXECUTION_CAPABILITIES:
+			{
+			std::string memType;
+			appendBitfield < cl_device_exec_capabilities
+				> (
+				*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType);
+
+			appendBitfield < cl_device_exec_capabilities
+				> (
+				*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType);
+
+			LOG(INFO) << "\t " << str << ":\t" << memType;
+
+		}
+			break;
+		case CL_DEVICE_QUEUE_PROPERTIES:
+			{
+			std::string memType;
+			appendBitfield < cl_device_exec_capabilities
+				> (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType);
+
+			appendBitfield < cl_device_exec_capabilities
+				> (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType);
+
+			LOG(INFO) << "\t " << str << ":\t" << memType;
+		}
+			break;
+		default:
+			LOG(INFO) << "\t" << str << ":\t" << *info;
+			break;
+	}
 
 }
 
 template<typename T>
-void Device::appendBitfield(T info, T value , std::string name , std::string &str)
-{
-    if(info & value)
-    {
-        if (str.length() > 0)
-        {
-            str.append(" | ");
-        }
-        str.append(name);
-    }
+void Device::appendBitfield(T info, T value, std::string name, std::string &str)
+	{
+	if (info & value)
+		{
+		if (str.length() > 0)
+			{
+			str.append(" | ");
+		}
+		str.append(name);
+	}
 }
 
-
 }  // namespace caffe
 
diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp
index c2d19d43..64f4fa6b 100644
--- a/src/caffe/internal_thread.cpp
+++ b/src/caffe/internal_thread.cpp
@@ -4,37 +4,36 @@
 namespace caffe {
 
 InternalThread::~InternalThread() {
-  WaitForInternalThreadToExit();
+	WaitForInternalThreadToExit();
 }
 
 bool InternalThread::is_started() const {
-  return thread_.get() != NULL && thread_->joinable();
+	return thread_.get() != NULL && thread_->joinable();
 }
 
-
 bool InternalThread::StartInternalThread() {
-  if (!WaitForInternalThreadToExit()) {
-    return false;
-  }
-  try {
-    thread_.reset(
-        new boost::thread(&InternalThread::InternalThreadEntry, this));
-  } catch (...) {
-    return false;
-  }
-  return true;
+	if (!WaitForInternalThreadToExit()) {
+		return false;
+	}
+	try {
+		thread_.reset(
+			new boost::thread(&InternalThread::InternalThreadEntry, this));
+	} catch (...) {
+		return false;
+	}
+	return true;
 }
 
 /** Will not return until the internal thread has exited. */
 bool InternalThread::WaitForInternalThreadToExit() {
-  if (is_started()) {
-    try {
-      thread_->join();
-    } catch (...) {
-      return false;
-    }
-  }
-  return true;
+	if (is_started()) {
+		try {
+			thread_->join();
+		} catch (...) {
+			return false;
+		}
+	}
+	return true;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index 926c7d8f..4ff6e3d4 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -17,147 +17,147 @@
 namespace caffe {
 
 // Get convolution layer according to engine.
-template <typename Dtype>
+template<typename Dtype>
 shared_ptr<Layer<Dtype> > GetConvolutionLayer(
-    const LayerParameter& param) {
-  ConvolutionParameter_Engine engine = param.convolution_param().engine();
-  if (engine == ConvolutionParameter_Engine_DEFAULT) {
-    engine = ConvolutionParameter_Engine_CAFFE;
+	const LayerParameter& param) {
+	ConvolutionParameter_Engine engine = param.convolution_param().engine();
+	if (engine == ConvolutionParameter_Engine_DEFAULT) {
+		engine = ConvolutionParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-    engine = ConvolutionParameter_Engine_CUDNN;
+		engine = ConvolutionParameter_Engine_CUDNN;
 #endif
-  }
-  if (engine == ConvolutionParameter_Engine_CAFFE) {
-    return shared_ptr<Layer<Dtype> >(new ConvolutionLayer<Dtype>(param));
+	}
+	if (engine == ConvolutionParameter_Engine_CAFFE) {
+		return shared_ptr < Layer<Dtype> > (new ConvolutionLayer<Dtype>(param));
 #ifdef USE_CUDNN
-  } else if (engine == ConvolutionParameter_Engine_CUDNN) {
-    return shared_ptr<Layer<Dtype> >(new CuDNNConvolutionLayer<Dtype>(param));
+	} else if (engine == ConvolutionParameter_Engine_CUDNN) {
+		return shared_ptr<Layer<Dtype> >(new CuDNNConvolutionLayer<Dtype>(param));
 #endif
-  } else {
-    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-  }
+	} else {
+		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+	}
 }
 
 REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer);
 
 // Get pooling layer according to engine.
-template <typename Dtype>
+template<typename Dtype>
 shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
-  PoolingParameter_Engine engine = param.pooling_param().engine();
-  if (engine == PoolingParameter_Engine_DEFAULT) {
-    engine = PoolingParameter_Engine_CAFFE;
+	PoolingParameter_Engine engine = param.pooling_param().engine();
+	if (engine == PoolingParameter_Engine_DEFAULT) {
+		engine = PoolingParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-    engine = PoolingParameter_Engine_CUDNN;
+		engine = PoolingParameter_Engine_CUDNN;
 #endif
-  }
-  if (engine == PoolingParameter_Engine_CAFFE) {
-    return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
+	}
+	if (engine == PoolingParameter_Engine_CAFFE) {
+		return shared_ptr < Layer<Dtype> > (new PoolingLayer<Dtype>(param));
 #ifdef USE_CUDNN
-  } else if (engine == PoolingParameter_Engine_CUDNN) {
-    PoolingParameter p_param = param.pooling_param();
-    if (p_param.pad() || p_param.pad_h() || p_param.pad_w() ||
-        param.top_size() > 1) {
-      LOG(INFO) << "CUDNN does not support padding or multiple tops. "
-                << "Using Caffe's own pooling layer.";
-      return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
-    }
-    return shared_ptr<Layer<Dtype> >(new CuDNNPoolingLayer<Dtype>(param));
+	} else if (engine == PoolingParameter_Engine_CUDNN) {
+		PoolingParameter p_param = param.pooling_param();
+		if (p_param.pad() || p_param.pad_h() || p_param.pad_w() ||
+			param.top_size() > 1) {
+			LOG(INFO) << "CUDNN does not support padding or multiple tops. "
+			<< "Using Caffe's own pooling layer.";
+			return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
+		}
+		return shared_ptr<Layer<Dtype> >(new CuDNNPoolingLayer<Dtype>(param));
 #endif
-  } else {
-    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-  }
+	} else {
+		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+	}
 }
 
 REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer);
 
 // Get relu layer according to engine.
-template <typename Dtype>
+template<typename Dtype>
 shared_ptr<Layer<Dtype> > GetReLULayer(const LayerParameter& param) {
-  ReLUParameter_Engine engine = param.relu_param().engine();
-  if (engine == ReLUParameter_Engine_DEFAULT) {
-    engine = ReLUParameter_Engine_CAFFE;
+	ReLUParameter_Engine engine = param.relu_param().engine();
+	if (engine == ReLUParameter_Engine_DEFAULT) {
+		engine = ReLUParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-    engine = ReLUParameter_Engine_CUDNN;
+		engine = ReLUParameter_Engine_CUDNN;
 #endif
-  }
-  if (engine == ReLUParameter_Engine_CAFFE) {
-    return shared_ptr<Layer<Dtype> >(new ReLULayer<Dtype>(param));
+	}
+	if (engine == ReLUParameter_Engine_CAFFE) {
+		return shared_ptr < Layer<Dtype> > (new ReLULayer<Dtype>(param));
 #ifdef USE_CUDNN
-  } else if (engine == ReLUParameter_Engine_CUDNN) {
-    return shared_ptr<Layer<Dtype> >(new CuDNNReLULayer<Dtype>(param));
+	} else if (engine == ReLUParameter_Engine_CUDNN) {
+		return shared_ptr<Layer<Dtype> >(new CuDNNReLULayer<Dtype>(param));
 #endif
-  } else {
-    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-  }
+	} else {
+		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+	}
 }
 
 REGISTER_LAYER_CREATOR(ReLU, GetReLULayer);
 
 // Get sigmoid layer according to engine.
-template <typename Dtype>
+template<typename Dtype>
 shared_ptr<Layer<Dtype> > GetSigmoidLayer(const LayerParameter& param) {
-  SigmoidParameter_Engine engine = param.sigmoid_param().engine();
-  if (engine == SigmoidParameter_Engine_DEFAULT) {
-    engine = SigmoidParameter_Engine_CAFFE;
+	SigmoidParameter_Engine engine = param.sigmoid_param().engine();
+	if (engine == SigmoidParameter_Engine_DEFAULT) {
+		engine = SigmoidParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-    engine = SigmoidParameter_Engine_CUDNN;
+		engine = SigmoidParameter_Engine_CUDNN;
 #endif
-  }
-  if (engine == SigmoidParameter_Engine_CAFFE) {
-    return shared_ptr<Layer<Dtype> >(new SigmoidLayer<Dtype>(param));
+	}
+	if (engine == SigmoidParameter_Engine_CAFFE) {
+		return shared_ptr < Layer<Dtype> > (new SigmoidLayer<Dtype>(param));
 #ifdef USE_CUDNN
-  } else if (engine == SigmoidParameter_Engine_CUDNN) {
-    return shared_ptr<Layer<Dtype> >(new CuDNNSigmoidLayer<Dtype>(param));
+	} else if (engine == SigmoidParameter_Engine_CUDNN) {
+		return shared_ptr<Layer<Dtype> >(new CuDNNSigmoidLayer<Dtype>(param));
 #endif
-  } else {
-    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-  }
+	} else {
+		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+	}
 }
 
 REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer);
 
 // Get softmax layer according to engine.
-template <typename Dtype>
+template<typename Dtype>
 shared_ptr<Layer<Dtype> > GetSoftmaxLayer(const LayerParameter& param) {
-  SoftmaxParameter_Engine engine = param.softmax_param().engine();
-  if (engine == SoftmaxParameter_Engine_DEFAULT) {
-    engine = SoftmaxParameter_Engine_CAFFE;
+	SoftmaxParameter_Engine engine = param.softmax_param().engine();
+	if (engine == SoftmaxParameter_Engine_DEFAULT) {
+		engine = SoftmaxParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-    engine = SoftmaxParameter_Engine_CUDNN;
+		engine = SoftmaxParameter_Engine_CUDNN;
 #endif
-  }
-  if (engine == SoftmaxParameter_Engine_CAFFE) {
-    return shared_ptr<Layer<Dtype> >(new SoftmaxLayer<Dtype>(param));
+	}
+	if (engine == SoftmaxParameter_Engine_CAFFE) {
+		return shared_ptr < Layer<Dtype> > (new SoftmaxLayer<Dtype>(param));
 #ifdef USE_CUDNN
-  } else if (engine == SoftmaxParameter_Engine_CUDNN) {
-    return shared_ptr<Layer<Dtype> >(new CuDNNSoftmaxLayer<Dtype>(param));
+	} else if (engine == SoftmaxParameter_Engine_CUDNN) {
+		return shared_ptr<Layer<Dtype> >(new CuDNNSoftmaxLayer<Dtype>(param));
 #endif
-  } else {
-    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-  }
+	} else {
+		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+	}
 }
 
 REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer);
 
 // Get tanh layer according to engine.
-template <typename Dtype>
+template<typename Dtype>
 shared_ptr<Layer<Dtype> > GetTanHLayer(const LayerParameter& param) {
-  TanHParameter_Engine engine = param.tanh_param().engine();
-  if (engine == TanHParameter_Engine_DEFAULT) {
-    engine = TanHParameter_Engine_CAFFE;
+	TanHParameter_Engine engine = param.tanh_param().engine();
+	if (engine == TanHParameter_Engine_DEFAULT) {
+		engine = TanHParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-    engine = TanHParameter_Engine_CUDNN;
+		engine = TanHParameter_Engine_CUDNN;
 #endif
-  }
-  if (engine == TanHParameter_Engine_CAFFE) {
-    return shared_ptr<Layer<Dtype> >(new TanHLayer<Dtype>(param));
+	}
+	if (engine == TanHParameter_Engine_CAFFE) {
+		return shared_ptr < Layer<Dtype> > (new TanHLayer<Dtype>(param));
 #ifdef USE_CUDNN
-  } else if (engine == TanHParameter_Engine_CUDNN) {
-    return shared_ptr<Layer<Dtype> >(new CuDNNTanHLayer<Dtype>(param));
+	} else if (engine == TanHParameter_Engine_CUDNN) {
+		return shared_ptr<Layer<Dtype> >(new CuDNNTanHLayer<Dtype>(param));
 #endif
-  } else {
-    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-  }
+	} else {
+		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+	}
 }
 
 REGISTER_LAYER_CREATOR(TanH, GetTanHLayer);
@@ -165,15 +165,15 @@ REGISTER_LAYER_CREATOR(TanH, GetTanHLayer);
 #ifdef WITH_PYTHON_LAYER
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetPythonLayer(const LayerParameter& param) {
-  Py_Initialize();
-  try {
-    bp::object module = bp::import(param.python_param().module().c_str());
-    bp::object layer = module.attr(param.python_param().layer().c_str())(param);
-    return bp::extract<shared_ptr<PythonLayer<Dtype> > >(layer)();
-  } catch (bp::error_already_set) {
-    PyErr_Print();
-    throw;
-  }
+	Py_Initialize();
+	try {
+		bp::object module = bp::import(param.python_param().module().c_str());
+		bp::object layer = module.attr(param.python_param().layer().c_str())(param);
+		return bp::extract<shared_ptr<PythonLayer<Dtype> > >(layer)();
+	} catch (bp::error_already_set) {
+		PyErr_Print();
+		throw;
+	}
 }
 
 REGISTER_LAYER_CREATOR(Python, GetPythonLayer);
@@ -181,4 +181,5 @@ REGISTER_LAYER_CREATOR(Python, GetPythonLayer);
 
 // Layers that use their constructor as their default creator should be
 // registered in their corresponding cpp files. Do not register them here.
-}  // namespace caffe
+}
+  // namespace caffe
diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index 12776eb8..cd99296e 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -6,61 +6,61 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void AbsValLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
-  CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not "
-    "allow in-place computation.";
+	const vector<Blob<Dtype>*>& top) {
+	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+	CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not "
+		"allow in-place computation.";
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void AbsValLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const int count = top[0]->count();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  caffe_abs(count, bottom[0]->cpu_data(), top_data);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	const int count = top[0]->count();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	caffe_abs(count, bottom[0]->cpu_data(), top_data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const int count = top[0]->count();
-  const Dtype* top_diff = top[0]->cpu_diff();
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->cpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    caffe_cpu_sign(count, bottom_data, bottom_diff);
-    caffe_mul(count, bottom_diff, top_diff, bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const int count = top[0]->count();
+	const Dtype* top_diff = top[0]->cpu_diff();
+	if (propagate_down[0]) {
+		const Dtype* bottom_data = bottom[0]->cpu_data();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		caffe_cpu_sign(count, bottom_data, bottom_diff);
+		caffe_mul(count, bottom_diff, top_diff, bottom_diff);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const int count = top[0]->count();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
+	const vector<Blob<Dtype>*>& top) {
+	const int count = top[0]->count();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  const int count = top[0]->count();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_gpu_sign(count, bottom_data, bottom_diff);
-    caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const int count = top[0]->count();
+	const Dtype* top_diff = top[0]->gpu_diff();
+	if (propagate_down[0]) {
+		const Dtype* bottom_data = bottom[0]->gpu_data();
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		caffe_gpu_sign(count, bottom_data, bottom_diff);
+		caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(AbsValLayer);
 #endif
 
-INSTANTIATE_CLASS(AbsValLayer);
-REGISTER_LAYER_CLASS(AbsVal);
+INSTANTIATE_CLASS (AbsValLayer);
+REGISTER_LAYER_CLASS (AbsVal);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
index 90aad675..82f92e27 100644
--- a/src/caffe/layers/accuracy_layer.cpp
+++ b/src/caffe/layers/accuracy_layer.cpp
@@ -10,82 +10,82 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void AccuracyLayer<Dtype>::LayerSetUp(
-  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  top_k_ = this->layer_param_.accuracy_param().top_k();
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	top_k_ = this->layer_param_.accuracy_param().top_k();
 
-  has_ignore_label_ =
-    this->layer_param_.accuracy_param().has_ignore_label();
-  if (has_ignore_label_) {
-    ignore_label_ = this->layer_param_.accuracy_param().ignore_label();
-  }
+	has_ignore_label_ =
+		this->layer_param_.accuracy_param().has_ignore_label();
+	if (has_ignore_label_) {
+		ignore_label_ = this->layer_param_.accuracy_param().ignore_label();
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void AccuracyLayer<Dtype>::Reshape(
-  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count())
-      << "top_k must be less than or equal to the number of classes.";
-  label_axis_ =
-      bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis());
-  outer_num_ = bottom[0]->count(0, label_axis_);
-  inner_num_ = bottom[0]->count(label_axis_ + 1);
-  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
-      << "Number of labels must match number of predictions; "
-      << "e.g., if label axis == 1 and prediction shape is (N, C, H, W), "
-      << "label count (number of labels) must be N*H*W, "
-      << "with integer values in {0, 1, ..., C-1}.";
-  vector<int> top_shape(0);  // Accuracy is a scalar; 0 axes.
-  top[0]->Reshape(top_shape);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count())
+		<< "top_k must be less than or equal to the number of classes.";
+	label_axis_ =
+		bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis());
+	outer_num_ = bottom[0]->count(0, label_axis_);
+	inner_num_ = bottom[0]->count(label_axis_ + 1);
+	CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
+		<< "Number of labels must match number of predictions; "
+		<< "e.g., if label axis == 1 and prediction shape is (N, C, H, W), "
+		<< "label count (number of labels) must be N*H*W, "
+		<< "with integer values in {0, 1, ..., C-1}.";
+	vector<int> top_shape(0);  // Accuracy is a scalar; 0 axes.
+	top[0]->Reshape(top_shape);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  Dtype accuracy = 0;
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const Dtype* bottom_label = bottom[1]->cpu_data();
-  const int dim = bottom[0]->count() / outer_num_;
-  const int num_labels = bottom[0]->shape(label_axis_);
-  vector<Dtype> maxval(top_k_+1);
-  vector<int> max_id(top_k_+1);
-  int count = 0;
-  for (int i = 0; i < outer_num_; ++i) {
-    for (int j = 0; j < inner_num_; ++j) {
-      const int label_value =
-          static_cast<int>(bottom_label[i * inner_num_ + j]);
-      if (has_ignore_label_ && label_value == ignore_label_) {
-        continue;
-      }
-      DCHECK_GE(label_value, 0);
-      DCHECK_LT(label_value, num_labels);
-      // Top-k accuracy
-      std::vector<std::pair<Dtype, int> > bottom_data_vector;
-      for (int k = 0; k < num_labels; ++k) {
-        bottom_data_vector.push_back(std::make_pair(
-            bottom_data[i * dim + k * inner_num_ + j], k));
-      }
-      std::partial_sort(
-          bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
-          bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
-      // check if true label is in top k predictions
-      for (int k = 0; k < top_k_; k++) {
-        if (bottom_data_vector[k].second == label_value) {
-          ++accuracy;
-          break;
-        }
-      }
-      ++count;
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	Dtype accuracy = 0;
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	const Dtype* bottom_label = bottom[1]->cpu_data();
+	const int dim = bottom[0]->count() / outer_num_;
+	const int num_labels = bottom[0]->shape(label_axis_);
+	vector < Dtype > maxval(top_k_ + 1);
+	vector<int> max_id(top_k_ + 1);
+	int count = 0;
+	for (int i = 0; i < outer_num_; ++i) {
+		for (int j = 0; j < inner_num_; ++j) {
+			const int label_value =
+				static_cast<int>(bottom_label[i * inner_num_ + j]);
+			if (has_ignore_label_ && label_value == ignore_label_) {
+				continue;
+			}
+			DCHECK_GE(label_value, 0);
+			DCHECK_LT(label_value, num_labels);
+			// Top-k accuracy
+			std::vector < std::pair<Dtype, int> > bottom_data_vector;
+			for (int k = 0; k < num_labels; ++k) {
+				bottom_data_vector.push_back(std::make_pair(
+					bottom_data[i * dim + k * inner_num_ + j], k));
+			}
+			std::partial_sort(
+				bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
+				bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
+			// check if true label is in top k predictions
+			for (int k = 0; k < top_k_; k++) {
+				if (bottom_data_vector[k].second == label_value) {
+					++accuracy;
+					break;
+				}
+			}
+			++count;
+		}
+	}
 
-  // LOG(INFO) << "Accuracy: " << accuracy;
-  top[0]->mutable_cpu_data()[0] = accuracy / count;
-  // Accuracy layer should not be used as a loss function.
+	// LOG(INFO) << "Accuracy: " << accuracy;
+	top[0]->mutable_cpu_data()[0] = accuracy / count;
+	// Accuracy layer should not be used as a loss function.
 }
 
-INSTANTIATE_CLASS(AccuracyLayer);
-REGISTER_LAYER_CLASS(Accuracy);
+INSTANTIATE_CLASS (AccuracyLayer);
+REGISTER_LAYER_CLASS (Accuracy);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp
index c4040cdc..87cc706e 100644
--- a/src/caffe/layers/argmax_layer.cpp
+++ b/src/caffe/layers/argmax_layer.cpp
@@ -8,56 +8,56 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ArgMaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  out_max_val_ = this->layer_param_.argmax_param().out_max_val();
-  top_k_ = this->layer_param_.argmax_param().top_k();
-  CHECK_GE(top_k_, 1) << " top k must not be less than 1.";
-  CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num())
-      << "top_k must be less than or equal to the number of classes.";
+	const vector<Blob<Dtype>*>& top) {
+	out_max_val_ = this->layer_param_.argmax_param().out_max_val();
+	top_k_ = this->layer_param_.argmax_param().top_k();
+	CHECK_GE(top_k_, 1) << " top k must not be less than 1.";
+	CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num())
+		<< "top_k must be less than or equal to the number of classes.";
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ArgMaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  if (out_max_val_) {
-    // Produces max_ind and max_val
-    top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1);
-  } else {
-    // Produces only max_ind
-    top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	if (out_max_val_) {
+		// Produces max_ind and max_val
+		top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1);
+	} else {
+		// Produces only max_ind
+		top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ArgMaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  int num = bottom[0]->num();
-  int dim = bottom[0]->count() / bottom[0]->num();
-  for (int i = 0; i < num; ++i) {
-    std::vector<std::pair<Dtype, int> > bottom_data_vector;
-    for (int j = 0; j < dim; ++j) {
-      bottom_data_vector.push_back(
-          std::make_pair(bottom_data[i * dim + j], j));
-    }
-    std::partial_sort(
-        bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
-        bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
-    for (int j = 0; j < top_k_; ++j) {
-      top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second;
-    }
-    if (out_max_val_) {
-      for (int j = 0; j < top_k_; ++j) {
-        top_data[top[0]->offset(i, 1, j)] = bottom_data_vector[j].first;
-      }
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	int num = bottom[0]->num();
+	int dim = bottom[0]->count() / bottom[0]->num();
+	for (int i = 0; i < num; ++i) {
+		std::vector < std::pair<Dtype, int> > bottom_data_vector;
+		for (int j = 0; j < dim; ++j) {
+			bottom_data_vector.push_back(
+				std::make_pair(bottom_data[i * dim + j], j));
+		}
+		std::partial_sort(
+			bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
+			bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
+		for (int j = 0; j < top_k_; ++j) {
+			top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second;
+		}
+		if (out_max_val_) {
+			for (int j = 0; j < top_k_; ++j) {
+				top_data[top[0]->offset(i, 1, j)] = bottom_data_vector[j].first;
+			}
+		}
+	}
 }
 
-INSTANTIATE_CLASS(ArgMaxLayer);
-REGISTER_LAYER_CLASS(ArgMax);
+INSTANTIATE_CLASS (ArgMaxLayer);
+REGISTER_LAYER_CLASS (ArgMax);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 394fd9a5..97c9afd3 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -11,447 +11,468 @@ namespace caffe {
 
 #ifdef use_packing_scheme
 template <typename Dtype> size_t BaseConvolutionLayer<Dtype>::subtop_mem_size = sizeof(Dtype);
-template <typename Dtype> size_t BaseConvolutionLayer<Dtype>::trans_mem_size =  sizeof(Dtype);
+template <typename Dtype> size_t BaseConvolutionLayer<Dtype>::trans_mem_size = sizeof(Dtype);
 template <typename Dtype> cl_mem BaseConvolutionLayer<Dtype>::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::subtop_mem_size, NULL, NULL);
 template <typename Dtype> cl_mem BaseConvolutionLayer<Dtype>::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::trans_mem_size, NULL, NULL);
 #endif
 
-template <typename Dtype>
+template<typename Dtype>
 void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size)
-{
-  if(subtop_size > BaseConvolutionLayer<Dtype>::subtop_mem_size){
-      ConvolutionLayer<Dtype>::subtop_mem_size = subtop_size;
-      clReleaseMemObject(ConvolutionLayer<Dtype>::subTopMem);
-      ConvolutionLayer<Dtype>::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::subtop_mem_size, NULL, NULL);
-  }
-  if(trans_size > ConvolutionLayer<Dtype>::trans_mem_size){
-      ConvolutionLayer<Dtype>::trans_mem_size =  trans_size;
-      clReleaseMemObject(ConvolutionLayer<Dtype>::transMem);
-      ConvolutionLayer<Dtype>::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::trans_mem_size, NULL, NULL);
-  }
+	{
+	if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) {
+		ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size;
+		clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem);
+		ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context,
+			CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size, NULL,
+			NULL);
+	}
+	if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) {
+		ConvolutionLayer < Dtype > ::trans_mem_size = trans_size;
+		clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem);
+		ConvolutionLayer < Dtype > ::transMem = clCreateBuffer(amdDevice.Context,
+			CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, NULL,
+			NULL);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::ocl_setup() {
-  M_ = num_output_ / group_;
-  K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_;
-  N_ = height_out_ * width_out_;
+	M_ = num_output_ / group_;
+	K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_;
+	N_ = height_out_ * width_out_;
 #ifdef use_packing_scheme
-  size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
-  size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
-  Alloc_public_tmp_mem<Dtype>(subtop_size, trans_size);
+	size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
+	size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
+	Alloc_public_tmp_mem<Dtype>(subtop_size, trans_size);
 #endif
 }
 
-
-template <typename Dtype>
- BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer(){
+template<typename Dtype>
+BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer() {
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  // Configure the kernel size, padding, stride, and inputs.
-  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
-  CHECK(!conv_param.has_kernel_size() !=
-      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-  CHECK(conv_param.has_kernel_size() ||
-      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-      << "For non-square filters both kernel_h and kernel_w are required.";
-  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
-      && conv_param.has_pad_w())
-      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
-      << "pad is pad OR pad_h and pad_w are required.";
-  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
-      && conv_param.has_stride_w())
-      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
-      << "Stride is stride OR stride_h and stride_w are required.";
-  if (conv_param.has_kernel_size()) {
-    kernel_h_ = kernel_w_ = conv_param.kernel_size();
-  } else {
-    kernel_h_ = conv_param.kernel_h();
-    kernel_w_ = conv_param.kernel_w();
-  }
-  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-  if (!conv_param.has_pad_h()) {
-    pad_h_ = pad_w_ = conv_param.pad();
-  } else {
-    pad_h_ = conv_param.pad_h();
-    pad_w_ = conv_param.pad_w();
-  }
-  if (!conv_param.has_stride_h()) {
-    stride_h_ = stride_w_ = conv_param.stride();
-  } else {
-    stride_h_ = conv_param.stride_h();
-    stride_w_ = conv_param.stride_w();
-  }
-  // Special case: im2col is the identity for 1x1 convolution with stride 1
-  // and no padding, so flag for skipping the buffer and transformation.
-  is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1
-      && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0;
-  // Configure output channels and groups.
-  channels_ = bottom[0]->channels();
-  num_output_ = this->layer_param_.convolution_param().num_output();
-  CHECK_GT(num_output_, 0);
-  group_ = this->layer_param_.convolution_param().group();
-  CHECK_EQ(channels_ % group_, 0);
-  CHECK_EQ(num_output_ % group_, 0)
-      << "Number of output should be multiples of group.";
-  if (reverse_dimensions()) {
-    conv_out_channels_ = channels_;
-    conv_in_channels_ = num_output_;
-  } else {
-    conv_out_channels_ = num_output_;
-    conv_in_channels_ = channels_;
-  }
-
-
-  // Handle the parameters: weights and biases.
-  // - blobs_[0] holds the filter weights
-  // - blobs_[1] holds the biases (optional)
-  bias_term_ = this->layer_param_.convolution_param().bias_term();
-  if (this->blobs_.size() > 0) {
-    LOG(INFO) << "Skipping parameter initialization";
-  } else {
-    if (bias_term_) {
-      this->blobs_.resize(2);
-    } else {
-      this->blobs_.resize(1);
-    }
-    // Initialize and fill the weights:
-    // output channels x input channels per-group x kernel height x kernel width
-    this->blobs_[0].reset(new Blob<Dtype>(
-        conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_));
-    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
-        this->layer_param_.convolution_param().weight_filler()));
-    weight_filler->Fill(this->blobs_[0].get());
-    // If necessary, initialize and fill the biases.
-    if (bias_term_) {
-      vector<int> bias_shape(1, num_output_);
-      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
-      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
-          this->layer_param_.convolution_param().bias_filler()));
-      bias_filler->Fill(this->blobs_[1].get());
-    }
-  }
-  // Propagate gradients to the parameters (as directed by backward pass).
-  this->param_propagate_down_.resize(this->blobs_.size(), true);
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+		<< "corresponding to (num, channels, height, width)";
+	// Configure the kernel size, padding, stride, and inputs.
+	ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+	CHECK(!conv_param.has_kernel_size() !=
+		!(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+		<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+	CHECK(conv_param.has_kernel_size() ||
+		(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+		<< "For non-square filters both kernel_h and kernel_w are required.";
+	CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+		&& conv_param.has_pad_w())
+		|| (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+		<< "pad is pad OR pad_h and pad_w are required.";
+	CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+		&& conv_param.has_stride_w())
+		|| (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+		<< "Stride is stride OR stride_h and stride_w are required.";
+	if (conv_param.has_kernel_size()) {
+		kernel_h_ = kernel_w_ = conv_param.kernel_size();
+	} else {
+		kernel_h_ = conv_param.kernel_h();
+		kernel_w_ = conv_param.kernel_w();
+	}
+	CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+	CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+	if (!conv_param.has_pad_h()) {
+		pad_h_ = pad_w_ = conv_param.pad();
+	} else {
+		pad_h_ = conv_param.pad_h();
+		pad_w_ = conv_param.pad_w();
+	}
+	if (!conv_param.has_stride_h()) {
+		stride_h_ = stride_w_ = conv_param.stride();
+	} else {
+		stride_h_ = conv_param.stride_h();
+		stride_w_ = conv_param.stride_w();
+	}
+	// Special case: im2col is the identity for 1x1 convolution with stride 1
+	// and no padding, so flag for skipping the buffer and transformation.
+	is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1
+		&& stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0;
+	// Configure output channels and groups.
+	channels_ = bottom[0]->channels();
+	num_output_ = this->layer_param_.convolution_param().num_output();
+	CHECK_GT(num_output_, 0);
+	group_ = this->layer_param_.convolution_param().group();
+	CHECK_EQ(channels_ % group_, 0);
+	CHECK_EQ(num_output_ % group_, 0)
+		<< "Number of output should be multiples of group.";
+	if (reverse_dimensions()) {
+		conv_out_channels_ = channels_;
+		conv_in_channels_ = num_output_;
+	} else {
+		conv_out_channels_ = num_output_;
+		conv_in_channels_ = channels_;
+	}
+
+	// Handle the parameters: weights and biases.
+	// - blobs_[0] holds the filter weights
+	// - blobs_[1] holds the biases (optional)
+	bias_term_ = this->layer_param_.convolution_param().bias_term();
+	if (this->blobs_.size() > 0) {
+		LOG(INFO) << "Skipping parameter initialization";
+	} else {
+		if (bias_term_) {
+			this->blobs_.resize(2);
+		} else {
+			this->blobs_.resize(1);
+		}
+		// Initialize and fill the weights:
+		// output channels x input channels per-group x kernel height x kernel width
+		this->blobs_[0].reset(new Blob<Dtype>(
+			conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_));
+		shared_ptr < Filler<Dtype> > weight_filler(GetFiller < Dtype > (
+			this->layer_param_.convolution_param().weight_filler()));
+		weight_filler->Fill(this->blobs_[0].get());
+		// If necessary, initialize and fill the biases.
+		if (bias_term_) {
+			vector<int> bias_shape(1, num_output_);
+			this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
+			shared_ptr < Filler<Dtype> > bias_filler(GetFiller < Dtype > (
+				this->layer_param_.convolution_param().bias_filler()));
+			bias_filler->Fill(this->blobs_[1].get());
+		}
+	}
+	// Propagate gradients to the parameters (as directed by backward pass).
+	this->param_propagate_down_.resize(this->blobs_.size(), true);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  num_ = bottom[0]->num();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
-  CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
-    " convolution kernel.";
-  // TODO: generalize to handle inputs of different shapes.
-  for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
-    CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
-    CHECK_EQ(channels_, bottom[bottom_id]->channels())
-        << "Inputs must have same channels.";
-    CHECK_EQ(height_, bottom[bottom_id]->height())
-        << "Inputs must have same height.";
-    CHECK_EQ(width_, bottom[bottom_id]->width())
-        << "Inputs must have same width.";
-  }
-  // Shape the tops.
-  compute_output_shape();
-  for (int top_id = 0; top_id < top.size(); ++top_id) {
-    top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
-  }
-  if (reverse_dimensions()) {
-    conv_in_height_ = height_out_;
-    conv_in_width_ = width_out_;
-    conv_out_spatial_dim_ = height_ * width_;
-  } else {
-    conv_in_height_ = height_;
-    conv_in_width_ = width_;
-    conv_out_spatial_dim_ = height_out_ * width_out_;
-  }
-  kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_;
-  weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_;
-  col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_;
-  output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
-  // The im2col result buffer will only hold one image at a time to avoid
-  // overly large memory usage. In the special case of 1x1 convolution
-  // it goes lazily unused to save memory.
-  if (reverse_dimensions()) {
-    col_buffer_.Reshape(1, kernel_dim_, height_, width_);
-  } else {
-    col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_);
-  }
-  // Set up the all ones "bias multiplier" for adding biases by BLAS
-  if (bias_term_) {
-    vector<int> bias_multiplier_shape(1, height_out_ * width_out_);
-    bias_multiplier_.Reshape(bias_multiplier_shape);
-    caffe_set(bias_multiplier_.count(), Dtype(1),
-        bias_multiplier_.mutable_cpu_data());
-  }
-  //initializa OpenCL kernels and cl_mem objects
-    ocl_setup();
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+		<< "corresponding to (num, channels, height, width)";
+	num_ = bottom[0]->num();
+	height_ = bottom[0]->height();
+	width_ = bottom[0]->width();
+	CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
+		" convolution kernel.";
+	// TODO: generalize to handle inputs of different shapes.
+	for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
+		CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
+		CHECK_EQ(channels_, bottom[bottom_id]->channels())
+			<< "Inputs must have same channels.";
+		CHECK_EQ(height_, bottom[bottom_id]->height())
+			<< "Inputs must have same height.";
+		CHECK_EQ(width_, bottom[bottom_id]->width())
+			<< "Inputs must have same width.";
+	}
+	// Shape the tops.
+	compute_output_shape();
+	for (int top_id = 0; top_id < top.size(); ++top_id) {
+		top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
+	}
+	if (reverse_dimensions()) {
+		conv_in_height_ = height_out_;
+		conv_in_width_ = width_out_;
+		conv_out_spatial_dim_ = height_ * width_;
+	} else {
+		conv_in_height_ = height_;
+		conv_in_width_ = width_;
+		conv_out_spatial_dim_ = height_out_ * width_out_;
+	}
+	kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_;
+	weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_;
+	col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_;
+	output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
+	// The im2col result buffer will only hold one image at a time to avoid
+	// overly large memory usage. In the special case of 1x1 convolution
+	// it goes lazily unused to save memory.
+	if (reverse_dimensions()) {
+		col_buffer_.Reshape(1, kernel_dim_, height_, width_);
+	} else {
+		col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_);
+	}
+	// Set up the all ones "bias multiplier" for adding biases by BLAS
+	if (bias_term_) {
+		vector<int> bias_multiplier_shape(1, height_out_ * width_out_);
+		bias_multiplier_.Reshape(bias_multiplier_shape);
+		caffe_set(bias_multiplier_.count(), Dtype(1),
+			bias_multiplier_.mutable_cpu_data());
+	}
+	//initializa OpenCL kernels and cl_mem objects
+	ocl_setup();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
-    const Dtype* weights, Dtype* output, bool skip_im2col) {
-  const Dtype* col_buff = input;
-  if (!is_1x1_) {
-    if (!skip_im2col) {
-      conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
-    }
-    col_buff = col_buffer_.cpu_data();
-  }
-  for (int g = 0; g < group_; ++g) {
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
-        group_, conv_out_spatial_dim_, kernel_dim_ / group_,
-        (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
-        (Dtype)0., output + output_offset_ * g);
-  }
+	const Dtype* weights, Dtype* output, bool skip_im2col) {
+	const Dtype* col_buff = input;
+	if (!is_1x1_) {
+		if (!skip_im2col) {
+			conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
+		}
+		col_buff = col_buffer_.cpu_data();
+	}
+	for (int g = 0; g < group_; ++g) {
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, conv_out_channels_ /
+			group_, conv_out_spatial_dim_, kernel_dim_ / group_,
+			(Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
+			(Dtype) 0., output + output_offset_ * g);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
-    const Dtype* bias) {
-  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
-      height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),
-      (Dtype)1., output);
+	const Dtype* bias) {
+	caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
+		height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(),
+		(Dtype) 1., output);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_cpu_gemm(const Dtype* output,
-    const Dtype* weights, Dtype* input) {
-  Dtype* col_buff = col_buffer_.mutable_cpu_data();
-  if (is_1x1_) {
-    col_buff = input;
-  }
-  for (int g = 0; g < group_; ++g) {
-    caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_ / group_,
-        conv_out_spatial_dim_, conv_out_channels_ / group_,
-        (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
-        (Dtype)0., col_buff + col_offset_ * g);
-  }
-  if (!is_1x1_) {
-    conv_col2im_cpu(col_buff, input);
-  }
+	const Dtype* weights, Dtype* input) {
+	Dtype* col_buff = col_buffer_.mutable_cpu_data();
+	if (is_1x1_) {
+		col_buff = input;
+	}
+	for (int g = 0; g < group_; ++g) {
+		caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, kernel_dim_ / group_,
+			conv_out_spatial_dim_, conv_out_channels_ / group_,
+			(Dtype) 1., weights + weight_offset_ * g, output + output_offset_ * g,
+			(Dtype) 0., col_buff + col_offset_ * g);
+	}
+	if (!is_1x1_) {
+		conv_col2im_cpu(col_buff, input);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
-    const Dtype* output, Dtype* weights) {
-  const Dtype* col_buff = input;
-  if (!is_1x1_) {
-    conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
-    col_buff = col_buffer_.cpu_data();
-  }
-  for (int g = 0; g < group_; ++g) {
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
-        kernel_dim_ / group_, conv_out_spatial_dim_,
-        (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
-        (Dtype)1., weights + weight_offset_ * g);
-  }
+	const Dtype* output, Dtype* weights) {
+	const Dtype* col_buff = input;
+	if (!is_1x1_) {
+		conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
+		col_buff = col_buffer_.cpu_data();
+	}
+	for (int g = 0; g < group_; ++g) {
+		caffe_cpu_gemm < Dtype
+			> (CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
+				kernel_dim_ / group_, conv_out_spatial_dim_,
+				(Dtype) 1., output + output_offset_ * g, col_buff + col_offset_ * g,
+				(Dtype) 1., weights + weight_offset_ * g);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_cpu_bias(Dtype* bias,
-    const Dtype* input) {
-  caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
-      input, bias_multiplier_.cpu_data(), 1., bias);
+	const Dtype* input) {
+	caffe_cpu_gemv < Dtype
+		> (CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
+			input, bias_multiplier_.cpu_data(), 1., bias);
 }
 
 #ifndef CPU_ONLY
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
-    const Dtype* weights, Dtype* output, bool skip_im2col) {
-  const Dtype* col_buff = input;
-  if (!is_1x1_) {
-    if (!skip_im2col) {
-      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
-    }
-    col_buff = col_buffer_.gpu_data();
-  }
-  
-  for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
-          conv_out_channels_/group_, conv_out_spatial_dim_, kernel_dim_ / group_,
-        (Dtype)1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
-        (Dtype)0., output,  top_offset_+output_offset_ * g);
-   }
+	const Dtype* weights, Dtype* output, bool skip_im2col) {
+	const Dtype* col_buff = input;
+	if (!is_1x1_) {
+		if (!skip_im2col) {
+			conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
+		}
+		col_buff = col_buffer_.gpu_data();
+	}
+
+	for (int g = 0; g < group_; ++g) {
+		caffe_gpu_gemm < Dtype
+			> (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
+				conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_
+					/ group_,
+				(Dtype) 1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
+				(Dtype) 0., output, top_offset_ + output_offset_ * g);
+	}
 }
 
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt (const Dtype* input,
-    const Dtype* weight, Dtype* output, bool skip_im2col) {
-  cl_command_queue Queue;
-  const Dtype* col_buff = input;
-  if (!is_1x1_) {
-    if (!skip_im2col) {
-      conv_im2col_gpu_opt(input);
-    }   
-    col_buff = col_buffer_.gpu_data();
-   }else{
-    caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff, (Dtype*)transMem);
-  }
+template<typename Dtype>
+void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
+	const Dtype* weight, Dtype* output, bool skip_im2col) {
+	cl_command_queue Queue;
+	const Dtype* col_buff = input;
+	if (!is_1x1_) {
+		if (!skip_im2col) {
+			conv_im2col_gpu_opt(input);
+		}
+		col_buff = col_buffer_.gpu_data();
+	} else {
+		caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff,
+			(Dtype*) transMem);
+	}
 #ifdef multiQ
-    for (int g = 0; g < group_; ++g) {
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
-       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-          (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
-       }
-     if(group_ == 2){
-       clFinish(amdDevice.CommandQueue);
-       clFinish(amdDevice.CommandQueue_helper);
-     }
+	for (int g = 0; g < group_; ++g) {
+		if(g == 0) Queue = amdDevice.CommandQueue;
+		else Queue = amdDevice.CommandQueue_helper;
+		caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+			(Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
+			(Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
+	}
+	if(group_ == 2) {
+		clFinish(amdDevice.CommandQueue);
+		clFinish(amdDevice.CommandQueue_helper);
+	}
 #else
-    Queue = amdDevice.CommandQueue;
-    for (int g = 0; g < group_; ++g) {
-       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-          (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
-          (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
-       }
+	Queue = amdDevice.CommandQueue;
+	for (int g = 0; g < group_; ++g) {
+		caffe_gpu_gemm < Dtype
+			> (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+				(Dtype) 1., weight, weight_offset_ * g, (Dtype*) transMem, col_offset_
+					* g,
+				(Dtype) 0., (Dtype*) subTopMem, top_offset_opt * g);
+	}
 #endif
-   transform_gpu((Dtype*)subTopMem, output, top_offset_, N_, M_*group_, opt_num2);
+	transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_,
+		opt_num2);
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
-    const Dtype* bias) {
-     caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
-          height_out_*width_out_, 1, (Dtype)1., bias, 0,
-          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-          (Dtype)1., output, top_offset_);
+	const Dtype* bias) {
+	caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
+		height_out_ * width_out_, 1, (Dtype) 1., bias, 0,
+		reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
+		(Dtype) 1., output, top_offset_);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias_opt(Dtype* output,
-    const Dtype* bias) {
-   for (int z = 0; z < opt_num2; z++)
-      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
-          N_, 1, (Dtype)1., bias, 0,
-          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-          (Dtype)1., output, top_offset_ + num_output_ * N_ * z);
+	const Dtype* bias) {
+	for (int z = 0; z < opt_num2; z++)
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
+			N_, 1, (Dtype) 1., bias, 0,
+			reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
+			(Dtype) 1., output, top_offset_ + num_output_ * N_ * z);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
-    const Dtype* weights, Dtype* input) {
-  Dtype* col_buff = col_buffer_.mutable_gpu_data();
-  if (is_1x1_) {
-    col_buff = input;
-  }
-  for (int g = 0; g < group_; ++g) {
-        caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
-          (Dtype)1., weights,  weight_offset_ * g,
-          output, top_offset_ + output_offset_ * g,
-          (Dtype)0., col_buff, col_offset_ * g);
-  }
-  if (!is_1x1_) {
-      conv_col2im_gpu(col_buff, input);
-  }
+	const Dtype* weights, Dtype* input) {
+	Dtype* col_buff = col_buffer_.mutable_gpu_data();
+	if (is_1x1_) {
+		col_buff = input;
+	}
+	for (int g = 0; g < group_; ++g) {
+		caffe_gpu_gemm < Dtype
+			> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
+				/ group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
+				(Dtype) 1., weights, weight_offset_ * g,
+				output, top_offset_ + output_offset_ * g,
+				(Dtype) 0., col_buff, col_offset_ * g);
+	}
+	if (!is_1x1_) {
+		conv_col2im_gpu(col_buff, input);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
-    const Dtype* weights, Dtype* input) {
-  cl_command_queue Queue;
-  if (is_1x1_) {
-    caffe_gpu_memcpy( height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input, (Dtype*)transMem);
-  }
-  for (int g = 0; g < group_; ++g) {
+	const Dtype* weights, Dtype* input) {
+	cl_command_queue Queue;
+	if (is_1x1_) {
+		caffe_gpu_memcpy(
+			height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input,
+			(Dtype*) transMem);
+	}
+	for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
+		if(g == 0) Queue = amdDevice.CommandQueue;
+		else Queue = amdDevice.CommandQueue_helper;
 #else
-       Queue =  amdDevice.CommandQueue;
+		Queue = amdDevice.CommandQueue;
 #endif
-       caffe_gpu_gemm<Dtype>(&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_,
-          (Dtype)1., weights,  weight_offset_ * g,
-          (Dtype*)subTopMem, top_offset_opt * g,
-          (Dtype)0., (Dtype*)transMem, col_offset_ * g);
-      }
+		caffe_gpu_gemm < Dtype
+			> (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_,
+				(Dtype) 1., weights, weight_offset_ * g,
+				(Dtype*) subTopMem, top_offset_opt * g,
+				(Dtype) 0., (Dtype*) transMem, col_offset_ * g);
+	}
 #ifdef multiQ
-   if(group_ ==2){
-      clFinish(amdDevice.CommandQueue);
-      clFinish(amdDevice.CommandQueue_helper);
-    }
+	if(group_ ==2) {
+		clFinish(amdDevice.CommandQueue);
+		clFinish(amdDevice.CommandQueue_helper);
+	}
 #endif
 
-  if (!is_1x1_) {
-      conv_col2im_gpu_opt(input);
-   }else{
-     caffe_gpu_memcpy( height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), (Dtype*)transMem, input);
-   }
+	if (!is_1x1_) {
+		conv_col2im_gpu_opt(input);
+	} else {
+		caffe_gpu_memcpy(
+			height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype),
+			(Dtype*) transMem, input);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
-    const Dtype* output, Dtype* weights) {
-  const Dtype* col_buff = input;
-  if (!is_1x1_) {
-    conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
-    col_buff = col_buffer_.gpu_data();
-  }
-  for (int g = 0; g < group_; ++g) {
-      caffe_gpu_gemm<Dtype>(&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ / group_, conv_out_spatial_dim_,
-        (Dtype)1., output, top_offset_,
-        (Dtype*)col_buff, col_offset_ * g, (Dtype)1.,
-        (Dtype*)weights, weight_offset_ * g);
- }
+	const Dtype* output, Dtype* weights) {
+	const Dtype* col_buff = input;
+	if (!is_1x1_) {
+		conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
+		col_buff = col_buffer_.gpu_data();
+	}
+	for (int g = 0; g < group_; ++g) {
+		caffe_gpu_gemm < Dtype
+			> (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_
+				/ group_, kernel_dim_ / group_, conv_out_spatial_dim_,
+				(Dtype) 1., output, top_offset_,
+				(Dtype*) col_buff, col_offset_ * g, (Dtype) 1.,
+				(Dtype*) weights, weight_offset_ * g);
+	}
 }
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
-    const Dtype* output, Dtype* weights) {
-  cl_command_queue Queue;
-  if (!is_1x1_) {
-    conv_im2col_gpu_opt(input);
-  }else{
-    caffe_gpu_memcpy( K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input, (Dtype*)transMem);
- }
-    opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*)subTopMem, 0, opt_num2);
-
-  for (int g = 0; g < group_; ++g) {
+	const Dtype* output, Dtype* weights) {
+	cl_command_queue Queue;
+	if (!is_1x1_) {
+		conv_im2col_gpu_opt(input);
+	} else {
+		caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input,
+			(Dtype*) transMem);
+	}
+	opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
+		opt_num2);
+
+	for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
-       if(g == 0) Queue = amdDevice.CommandQueue;
-       else Queue =  amdDevice.CommandQueue_helper;
+		if(g == 0) Queue = amdDevice.CommandQueue;
+		else Queue = amdDevice.CommandQueue_helper;
 #else
-       Queue =  amdDevice.CommandQueue;
+		Queue = amdDevice.CommandQueue;
 #endif
-       caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
-        (Dtype)1., (Dtype*)subTopMem, top_offset_opt * g,
-        (Dtype*)transMem, col_offset_ * g, (Dtype)1.,
-        (Dtype*)weights, weight_offset_ * g);
+		caffe_gpu_gemm < Dtype
+			> (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
+				(Dtype) 1., (Dtype*) subTopMem, top_offset_opt * g,
+				(Dtype*) transMem, col_offset_ * g, (Dtype) 1.,
+				(Dtype*) weights, weight_offset_ * g);
 #ifdef multiQ
-     if(group_ == 2){
-       clFinish(amdDevice.CommandQueue);
-       clFinish(amdDevice.CommandQueue_helper);
-     }
+		if(group_ == 2) {
+			clFinish(amdDevice.CommandQueue);
+			clFinish(amdDevice.CommandQueue_helper);
+		}
 #endif
-    }
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
-    const Dtype* input) {
-      caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, N_, 
-          (Dtype)1., input, top_offset_, N_,
-          reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t)0, (Dtype)1., 1,
-          bias, (size_t)0, 1);
+	const Dtype* input) {
+	caffe_gpu_gemv < Dtype
+		> (CblasNoTrans, num_output_, N_,
+			(Dtype) 1., input, top_offset_, N_,
+			reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1,
+			bias, (size_t) 0, 1);
 }
 
 #endif  // !CPU_ONLY
 
-INSTANTIATE_CLASS(BaseConvolutionLayer);
+INSTANTIATE_CLASS (BaseConvolutionLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 5ba0f2e5..f9a80979 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -7,109 +7,118 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 BaseDataLayer<Dtype>::BaseDataLayer(const LayerParameter& param)
-    : Layer<Dtype>(param),
-      transform_param_(param.transform_param()) {
+	: Layer<Dtype>(param),
+		transform_param_(param.transform_param()) {
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BaseDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  if (top.size() == 1) {
-    output_labels_ = false;
-  } else {
-    output_labels_ = true;
-  }
-  data_transformer_.reset(
-      new DataTransformer<Dtype>(transform_param_, this->phase_));
-  data_transformer_->InitRand();
-  // The subclasses should setup the size of bottom and top
-  DataLayerSetUp(bottom, top);
+	const vector<Blob<Dtype>*>& top) {
+	if (top.size() == 1) {
+		output_labels_ = false;
+	} else {
+		output_labels_ = true;
+	}
+	data_transformer_.reset(
+		new DataTransformer<Dtype>(transform_param_, this->phase_));
+	data_transformer_->InitRand();
+	// The subclasses should setup the size of bottom and top
+	DataLayerSetUp(bottom, top);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::LayerSetUp(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  BaseDataLayer<Dtype>::LayerSetUp(bottom, top);
-  // Now, start the prefetch thread. Before calling prefetch, we make two
-  // cpu_data calls so that the prefetch thread does not accidentally make
-  // simultaneous cudaMalloc calls when the main thread is running. In some
-  // GPUs this seems to cause failures if we do not so.
-  this->prefetch_data_.mutable_cpu_data();
-  if (this->output_labels_) {
-    this->prefetch_label_.mutable_cpu_data();
-  }
-  DLOG(INFO) << "Initializing prefetch";
-  this->CreatePrefetchThread();
-  DLOG(INFO) << "Prefetch initialized.";
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	BaseDataLayer < Dtype > ::LayerSetUp(bottom, top);
+	// Now, start the prefetch thread. Before calling prefetch, we make two
+	// cpu_data calls so that the prefetch thread does not accidentally make
+	// simultaneous cudaMalloc calls when the main thread is running. In some
+	// GPUs this seems to cause failures if we do not so.
+	this->prefetch_data_.mutable_cpu_data();
+	if (this->output_labels_) {
+		this->prefetch_label_.mutable_cpu_data();
+	}
+	DLOG(INFO) << "Initializing prefetch";
+	this->CreatePrefetchThread();
+	DLOG(INFO) << "Prefetch initialized.";
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::CreatePrefetchThread() {
-  this->data_transformer_->InitRand();
-  CHECK(StartInternalThread()) << "Thread execution failed";
+	this->data_transformer_->InitRand();
+	CHECK(StartInternalThread()) << "Thread execution failed";
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::JoinPrefetchThread() {
-  CHECK(WaitForInternalThreadToExit()) << "Thread joining failed";
+	CHECK(WaitForInternalThreadToExit()) << "Thread joining failed";
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, join the thread
-  JoinPrefetchThread();
-
-  DLOG(INFO) << "Thread joined";
-  // Reshape to loaded data.
-  top[0]->ReshapeLike(prefetch_data_);
-  // Copy the data
-  caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
-             top[0]->mutable_cpu_data());
-  DLOG(INFO) << "Prefetch copied";
-  if (this->output_labels_) {
-    // Reshape to loaded labels.
-    top[1]->ReshapeLike(prefetch_label_);
-    // Copy the labels.
-    caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
-               top[1]->mutable_cpu_data());
-  }
-  // Start a new prefetch thread
-  DLOG(INFO) << "CreatePrefetchThread";
-  CreatePrefetchThread();
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	// First, join the thread
+	JoinPrefetchThread();
+
+	DLOG(INFO) << "Thread joined";
+	// Reshape to loaded data.
+	top[0]->ReshapeLike(prefetch_data_);
+	// Copy the data
+	caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
+		top[0]->mutable_cpu_data());
+	DLOG(INFO) << "Prefetch copied";
+	if (this->output_labels_) {
+		// Reshape to loaded labels.
+		top[1]->ReshapeLike(prefetch_label_);
+		// Copy the labels.
+		caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
+			top[1]->mutable_cpu_data());
+	}
+	// Start a new prefetch thread
+	DLOG(INFO) << "CreatePrefetchThread";
+	CreatePrefetchThread();
 }
 
-template <typename Dtype>
-void BasePrefetchingDataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-     const  vector<Blob<Dtype>*>& top) {
-  
-  JoinPrefetchThread();
-  DLOG(INFO) << "Thread joined";
-  
-   top[0]->ReshapeLike(this->prefetch_data_);
-   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[0]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_data_.count(), prefetch_data_.cpu_data(), 0, NULL, NULL) );
-    DLOG(INFO) << "Prefetch copied"; 
- if (this->output_labels_) {
-       // Reshape to loaded labels.
-   top[1]->ReshapeLike(prefetch_label_);
-   OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[1]->mutable_gpu_data(), CL_TRUE, 0, sizeof(Dtype)*prefetch_label_.count(), prefetch_label_.cpu_data(), 0, NULL, NULL) );
-   }
-  
+template<typename Dtype>
+void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
+	const vector<Blob<Dtype>*>& bottom,
+	const vector<Blob<Dtype>*>& top) {
+
+	JoinPrefetchThread();
+	DLOG(INFO) << "Thread joined";
+
+	top[0]->ReshapeLike(this->prefetch_data_);
+	OCL_CHECK(
+		clEnqueueWriteBuffer(amdDevice.CommandQueue,
+			(cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0,
+			sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0,
+			NULL, NULL));
+	DLOG(INFO) << "Prefetch copied";
+	if (this->output_labels_) {
+		// Reshape to loaded labels.
+		top[1]->ReshapeLike(prefetch_label_);
+		OCL_CHECK(
+			clEnqueueWriteBuffer(amdDevice.CommandQueue,
+				(cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0,
+				sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), 0,
+				NULL, NULL));
+	}
+
 #ifdef Track_data_transfer
 #endif
-  
-  // Start a new prefetch thread
-  DLOG(INFO) << "CreatePrefetchThread";
-  CreatePrefetchThread();
+
+	// Start a new prefetch thread
+	DLOG(INFO) << "CreatePrefetchThread";
+	CreatePrefetchThread();
 }
 
 #ifdef CPU_ONLY
 STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward);
 #endif
 
-INSTANTIATE_CLASS(BaseDataLayer);
-INSTANTIATE_CLASS(BasePrefetchingDataLayer);
+INSTANTIATE_CLASS (BaseDataLayer);
+INSTANTIATE_CLASS (BasePrefetchingDataLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index 3fe6f42e..8f72f41b 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -8,65 +8,66 @@ namespace caffe {
 
 const float kBNLL_THRESHOLD = 50.;
 
-template <typename Dtype>
+template<typename Dtype>
 void BNLLLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int count = bottom[0]->count();
-  for (int i = 0; i < count; ++i) {
-    top_data[i] = bottom_data[i] > 0 ?
-        bottom_data[i] + log(1. + exp(-bottom_data[i])) :
-        log(1. + exp(bottom_data[i]));
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const int count = bottom[0]->count();
+	for (int i = 0; i < count; ++i) {
+		top_data[i] =
+			bottom_data[i] > 0 ?
+														bottom_data[i] + log(1. + exp(-bottom_data[i])) :
+														log(1. + exp(bottom_data[i]));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->cpu_data();
-    const Dtype* top_diff = top[0]->cpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const int count = bottom[0]->count();
-    Dtype expval;
-    for (int i = 0; i < count; ++i) {
-      expval = exp(std::min(bottom_data[i], Dtype(kBNLL_THRESHOLD)));
-      bottom_diff[i] = top_diff[i] * expval / (expval + 1.);
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* bottom_data = bottom[0]->cpu_data();
+		const Dtype* top_diff = top[0]->cpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		const int count = bottom[0]->count();
+		Dtype expval;
+		for (int i = 0; i < count; ++i) {
+			expval = exp(std::min(bottom_data[i], Dtype(kBNLL_THRESHOLD)));
+			bottom_diff[i] = top_diff[i] * expval / (expval + 1.);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  BNLLForward(count, bottom_data, top_data);
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const int count = bottom[0]->count();
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	BNLLForward(count, bottom_data, top_data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    BNLLBackward(count, top_diff, bottom_data, bottom_diff);
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* bottom_data = bottom[0]->gpu_data();
+		const Dtype* top_diff = top[0]->gpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		const int count = bottom[0]->count();
+		// NOLINT_NEXT_LINE(whitespace/operators)
+		BNLLBackward(count, top_diff, bottom_data, bottom_diff);
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(BNLLLayer);
 #endif
 
-INSTANTIATE_CLASS(BNLLLayer);
-REGISTER_LAYER_CLASS(BNLL);
+INSTANTIATE_CLASS (BNLLLayer);
+REGISTER_LAYER_CLASS (BNLL);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 6bc8f9e9..b885d9e6 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -6,133 +6,141 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ConcatLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const ConcatParameter& concat_param = this->layer_param_.concat_param();
-  CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim()))
-      << "Either axis or concat_dim should be specified; not both.";
+	const vector<Blob<Dtype>*>& top) {
+	const ConcatParameter& concat_param = this->layer_param_.concat_param();
+	CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim()))
+		<< "Either axis or concat_dim should be specified; not both.";
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int num_axes = bottom[0]->num_axes();
-  const ConcatParameter& concat_param = this->layer_param_.concat_param();
-  if (concat_param.has_concat_dim()) {
-    concat_axis_ = static_cast<int>(concat_param.concat_dim());
-    // Don't allow negative indexing for concat_dim, a uint32 -- almost
-    // certainly unintended.
-    CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 "
-        << "produced negative result; concat_dim must satisfy "
-        << "0 <= concat_dim < " << kMaxBlobAxes;
-    CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range.";
-  } else {
-    concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis());
-  }
-  // Initialize with the first blob.
-  vector<int> top_shape = bottom[0]->shape();
-  num_concats_ = bottom[0]->count(0, concat_axis_);
-  concat_input_size_ = bottom[0]->count(concat_axis_ + 1);
-  int bottom_count_sum = bottom[0]->count();
-  for (int i = 1; i < bottom.size(); ++i) {
-    CHECK_EQ(num_axes, bottom[i]->num_axes())
-        << "All inputs must have the same #axes.";
-    for (int j = 0; j < num_axes; ++j) {
-      if (j == concat_axis_) { continue; }
-      CHECK_EQ(top_shape[j], bottom[i]->shape(j))
-          << "All inputs must have the same shape, except at concat_axis.";
-    }
-    bottom_count_sum += bottom[i]->count();
-    top_shape[concat_axis_] += bottom[i]->shape(concat_axis_);
-  }
-  top[0]->Reshape(top_shape);
-  CHECK_EQ(bottom_count_sum, top[0]->count());
+	const vector<Blob<Dtype>*>& top) {
+	const int num_axes = bottom[0]->num_axes();
+	const ConcatParameter& concat_param = this->layer_param_.concat_param();
+	if (concat_param.has_concat_dim()) {
+		concat_axis_ = static_cast<int>(concat_param.concat_dim());
+		// Don't allow negative indexing for concat_dim, a uint32 -- almost
+		// certainly unintended.
+		CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 "
+			<< "produced negative result; concat_dim must satisfy "
+			<< "0 <= concat_dim < " << kMaxBlobAxes;
+		CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range.";
+	} else {
+		concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis());
+	}
+	// Initialize with the first blob.
+	vector<int> top_shape = bottom[0]->shape();
+	num_concats_ = bottom[0]->count(0, concat_axis_);
+	concat_input_size_ = bottom[0]->count(concat_axis_ + 1);
+	int bottom_count_sum = bottom[0]->count();
+	for (int i = 1; i < bottom.size(); ++i) {
+		CHECK_EQ(num_axes, bottom[i]->num_axes())
+			<< "All inputs must have the same #axes.";
+		for (int j = 0; j < num_axes; ++j) {
+			if (j == concat_axis_) {
+				continue;
+			}
+			CHECK_EQ(top_shape[j], bottom[i]->shape(j))
+				<< "All inputs must have the same shape, except at concat_axis.";
+		}
+		bottom_count_sum += bottom[i]->count();
+		top_shape[concat_axis_] += bottom[i]->shape(concat_axis_);
+	}
+	top[0]->Reshape(top_shape);
+	CHECK_EQ(bottom_count_sum, top[0]->count());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  int offset_concat_axis = 0;
-  const int top_concat_axis = top[0]->shape(concat_axis_);
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->cpu_data();
-    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    for (int n = 0; n < num_concats_; ++n) {
-      caffe_copy(bottom_concat_axis * concat_input_size_,
-          bottom_data + n * bottom_concat_axis * concat_input_size_,
-          top_data + (n * top_concat_axis + offset_concat_axis)
-              * concat_input_size_);
-    }
-    offset_concat_axis += bottom_concat_axis;
-  }
+	const vector<Blob<Dtype>*>& top) {
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	int offset_concat_axis = 0;
+	const int top_concat_axis = top[0]->shape(concat_axis_);
+	for (int i = 0; i < bottom.size(); ++i) {
+		const Dtype* bottom_data = bottom[i]->cpu_data();
+		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+		for (int n = 0; n < num_concats_; ++n) {
+			caffe_copy(bottom_concat_axis * concat_input_size_,
+				bottom_data + n * bottom_concat_axis * concat_input_size_,
+				top_data + (n * top_concat_axis + offset_concat_axis)
+					* concat_input_size_);
+		}
+		offset_concat_axis += bottom_concat_axis;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->cpu_diff();
-  int offset_concat_axis = 0;
-  const int top_concat_axis = top[0]->shape(concat_axis_);
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (!propagate_down[i]) { continue; }
-    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
-    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    for (int n = 0; n < num_concats_; ++n) {
-      caffe_copy(bottom_concat_axis * concat_input_size_, top_diff +
-          (n * top_concat_axis + offset_concat_axis) * concat_input_size_,
-          bottom_diff + n * bottom_concat_axis * concat_input_size_);
-    }
-    offset_concat_axis += bottom_concat_axis;
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* top_diff = top[0]->cpu_diff();
+	int offset_concat_axis = 0;
+	const int top_concat_axis = top[0]->shape(concat_axis_);
+	for (int i = 0; i < bottom.size(); ++i) {
+		if (!propagate_down[i]) {
+			continue;
+		}
+		Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+		for (int n = 0; n < num_concats_; ++n) {
+			caffe_copy(bottom_concat_axis * concat_input_size_, top_diff +
+				(n * top_concat_axis + offset_concat_axis) * concat_input_size_,
+				bottom_diff + n * bottom_concat_axis * concat_input_size_);
+		}
+		offset_concat_axis += bottom_concat_axis;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  if (bottom.size() == 1) { return; }
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int offset_concat_axis = 0;
-  const int top_concat_axis = top[0]->shape(concat_axis_);
-  const bool kForward = true;
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-    const int nthreads = bottom_concat_size * num_concats_;
-    Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_,
-        top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
-    offset_concat_axis += bottom_concat_axis;
-  }
+	const vector<Blob<Dtype>*>& top) {
+	if (bottom.size() == 1) {
+		return;
+	}
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	int offset_concat_axis = 0;
+	const int top_concat_axis = top[0]->shape(concat_axis_);
+	const bool kForward = true;
+	for (int i = 0; i < bottom.size(); ++i) {
+		const Dtype* bottom_data = bottom[i]->gpu_data();
+		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+		const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
+		const int nthreads = bottom_concat_size * num_concats_;
+		Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_,
+			top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
+		offset_concat_axis += bottom_concat_axis;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (bottom.size() == 1) { return; }
-  const Dtype* top_diff = top[0]->gpu_diff();
-  int offset_concat_axis = 0;
-  const int top_concat_axis = top[0]->shape(concat_axis_);
-  const bool kForward = false;
-  for (int i = 0; i < bottom.size(); ++i) {
-    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    if (propagate_down[i]) {
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-      const int nthreads = bottom_concat_size * num_concats_;
-      Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_,
-          top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
-    }
-    offset_concat_axis += bottom_concat_axis;
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (bottom.size() == 1) {
+		return;
+	}
+	const Dtype* top_diff = top[0]->gpu_diff();
+	int offset_concat_axis = 0;
+	const int top_concat_axis = top[0]->shape(concat_axis_);
+	const bool kForward = false;
+	for (int i = 0; i < bottom.size(); ++i) {
+		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+		if (propagate_down[i]) {
+			Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+			const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
+			const int nthreads = bottom_concat_size * num_concats_;
+			Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_,
+				top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
+		}
+		offset_concat_axis += bottom_concat_axis;
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(ConcatLayer);
 #endif
 
-INSTANTIATE_CLASS(ConcatLayer);
-REGISTER_LAYER_CLASS(Concat);
+INSTANTIATE_CLASS (ConcatLayer);
+REGISTER_LAYER_CLASS (Concat);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index 4b47eb42..9c3f38d5 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -8,180 +8,180 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ContrastiveLossLayer<Dtype>::LayerSetUp(
-  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::LayerSetUp(bottom, top);
-  CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
-  CHECK_EQ(bottom[0]->height(), 1);
-  CHECK_EQ(bottom[0]->width(), 1);
-  CHECK_EQ(bottom[1]->height(), 1);
-  CHECK_EQ(bottom[1]->width(), 1);
-  CHECK_EQ(bottom[2]->channels(), 1);
-  CHECK_EQ(bottom[2]->height(), 1);
-  CHECK_EQ(bottom[2]->width(), 1);
-  diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
-  diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
-  dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1);
-  // vector of ones used to sum along channels
-  summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1);
-  for (int i = 0; i < bottom[0]->channels(); ++i)
-    summer_vec_.mutable_cpu_data()[i] = Dtype(1);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::LayerSetUp(bottom, top);
+	CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
+	CHECK_EQ(bottom[0]->height(), 1);
+	CHECK_EQ(bottom[0]->width(), 1);
+	CHECK_EQ(bottom[1]->height(), 1);
+	CHECK_EQ(bottom[1]->width(), 1);
+	CHECK_EQ(bottom[2]->channels(), 1);
+	CHECK_EQ(bottom[2]->height(), 1);
+	CHECK_EQ(bottom[2]->width(), 1);
+	diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
+	diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
+	dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1);
+	// vector of ones used to sum along channels
+	summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1);
+	for (int i = 0; i < bottom[0]->channels(); ++i)
+		summer_vec_.mutable_cpu_data()[i] = Dtype(1);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int count = bottom[0]->count();
-  caffe_sub(
-      count,
-      bottom[0]->cpu_data(),  // a
-      bottom[1]->cpu_data(),  // b
-      diff_.mutable_cpu_data());  // a_i-b_i
-  const int channels = bottom[0]->channels();
-  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-  bool legacy_version =
-      this->layer_param_.contrastive_loss_param().legacy_version();
-  Dtype loss(0.0);
-  for (int i = 0; i < bottom[0]->num(); ++i) {
-    dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
-        diff_.cpu_data() + (i*channels), diff_.cpu_data() + (i*channels));
-    if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
-      loss += dist_sq_.cpu_data()[i];
-    } else {  // dissimilar pairs
-      if (legacy_version) {
-        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
-      } else {
-        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
-        loss += dist*dist;
-      }
-    }
-  }
-  loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
-  top[0]->mutable_cpu_data()[0] = loss;
+	const vector<Blob<Dtype>*>& bottom,
+	const vector<Blob<Dtype>*>& top) {
+	int count = bottom[0]->count();
+	caffe_sub(
+		count,
+		bottom[0]->cpu_data(),  // a
+		bottom[1]->cpu_data(),  // b
+		diff_.mutable_cpu_data());  // a_i-b_i
+	const int channels = bottom[0]->channels();
+	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+	bool legacy_version =
+		this->layer_param_.contrastive_loss_param().legacy_version();
+	Dtype loss(0.0);
+	for (int i = 0; i < bottom[0]->num(); ++i) {
+		dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
+			diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels));
+		if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
+			loss += dist_sq_.cpu_data()[i];
+		} else {  // dissimilar pairs
+			if (legacy_version) {
+				loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+			} else {
+				Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
+				loss += dist * dist;
+			}
+		}
+	}
+	loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
+	top[0]->mutable_cpu_data()[0] = loss;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-  bool legacy_version =
-      this->layer_param_.contrastive_loss_param().legacy_version();
-  for (int i = 0; i < 2; ++i) {
-    if (propagate_down[i]) {
-      const Dtype sign = (i == 0) ? 1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
-          static_cast<Dtype>(bottom[i]->num());
-      int num = bottom[i]->num();
-      int channels = bottom[i]->channels();
-      for (int j = 0; j < num; ++j) {
-        Dtype* bout = bottom[i]->mutable_cpu_diff();
-        if (static_cast<int>(bottom[2]->cpu_data()[j])) {  // similar pairs
-          caffe_cpu_axpby(
-              channels,
-              alpha,
-              diff_.cpu_data() + (j*channels),
-              Dtype(0.0),
-              bout + (j*channels));
-        } else {  // dissimilar pairs
-          Dtype mdist(0.0);
-          Dtype beta(0.0);
-          if (legacy_version) {
-            mdist = margin - dist_sq_.cpu_data()[j];
-            beta = -alpha;
-          } else {
-            Dtype dist = sqrt(dist_sq_.cpu_data()[j]);
-            mdist = margin - dist;
-            beta = -alpha * mdist / (dist + Dtype(1e-4));
-          }
-          if (mdist > Dtype(0.0)) {
-            caffe_cpu_axpby(
-                channels,
-                beta,
-                diff_.cpu_data() + (j*channels),
-                Dtype(0.0),
-                bout + (j*channels));
-          } else {
-            caffe_set(channels, Dtype(0), bout + (j*channels));
-          }
-        }
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+	bool legacy_version =
+		this->layer_param_.contrastive_loss_param().legacy_version();
+	for (int i = 0; i < 2; ++i) {
+		if (propagate_down[i]) {
+			const Dtype sign = (i == 0) ? 1 : -1;
+			const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+				static_cast<Dtype>(bottom[i]->num());
+			int num = bottom[i]->num();
+			int channels = bottom[i]->channels();
+			for (int j = 0; j < num; ++j) {
+				Dtype* bout = bottom[i]->mutable_cpu_diff();
+				if (static_cast<int>(bottom[2]->cpu_data()[j])) {  // similar pairs
+					caffe_cpu_axpby(
+						channels,
+						alpha,
+						diff_.cpu_data() + (j * channels),
+						Dtype(0.0),
+						bout + (j * channels));
+				} else {  // dissimilar pairs
+					Dtype mdist(0.0);
+					Dtype beta(0.0);
+					if (legacy_version) {
+						mdist = margin - dist_sq_.cpu_data()[j];
+						beta = -alpha;
+					} else {
+						Dtype dist = sqrt(dist_sq_.cpu_data()[j]);
+						mdist = margin - dist;
+						beta = -alpha * mdist / (dist + Dtype(1e-4));
+					}
+					if (mdist > Dtype(0.0)) {
+						caffe_cpu_axpby(
+							channels,
+							beta,
+							diff_.cpu_data() + (j * channels),
+							Dtype(0.0),
+							bout + (j * channels));
+					} else {
+						caffe_set(channels, Dtype(0), bout + (j * channels));
+					}
+				}
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  caffe_gpu_sub(
-      count,
-      bottom[0]->gpu_data(),  // a
-      bottom[1]->gpu_data(),  // b
-      diff_.mutable_gpu_data());  // a_i-b_i
-  caffe_gpu_powx(
-      count,
-      diff_.mutable_gpu_data(),  // a_i-b_i
-      Dtype(2),
-      diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
-  caffe_gpu_gemv(
-      CblasNoTrans,
-      bottom[0]->num(),
-      bottom[0]->channels(),
-      Dtype(1.0),
-      diff_sq_.gpu_data(),  // (a_i-b_i)^2
-      summer_vec_.gpu_data(),
-      Dtype(0.0),
-      dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
-  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-  bool legacy_version =
-      this->layer_param_.contrastive_loss_param().legacy_version();
-  Dtype loss(0.0);
-  for (int i = 0; i < bottom[0]->num(); ++i) {
-    if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
-      loss += dist_sq_.cpu_data()[i];
-    } else {  // dissimilar pairs
-      if (legacy_version) {
-        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
-      } else {
-        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
-        loss += dist*dist;
-      }
-    }
-  }
-  loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
-  top[0]->mutable_cpu_data()[0] = loss;
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	const int count = bottom[0]->count();
+	caffe_gpu_sub(
+		count,
+		bottom[0]->gpu_data(),  // a
+		bottom[1]->gpu_data(),  // b
+		diff_.mutable_gpu_data());  // a_i-b_i
+	caffe_gpu_powx(
+		count,
+		diff_.mutable_gpu_data(),  // a_i-b_i
+		Dtype(2),
+		diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
+	caffe_gpu_gemv(
+		CblasNoTrans,
+		bottom[0]->num(),
+		bottom[0]->channels(),
+		Dtype(1.0),
+		diff_sq_.gpu_data(),  // (a_i-b_i)^2
+		summer_vec_.gpu_data(),
+		Dtype(0.0),
+		dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
+	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+	bool legacy_version =
+		this->layer_param_.contrastive_loss_param().legacy_version();
+	Dtype loss(0.0);
+	for (int i = 0; i < bottom[0]->num(); ++i) {
+		if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
+			loss += dist_sq_.cpu_data()[i];
+		} else {  // dissimilar pairs
+			if (legacy_version) {
+				loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+			} else {
+				Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
+				loss += dist * dist;
+			}
+		}
+	}
+	loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
+	top[0]->mutable_cpu_data()[0] = loss;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < 2; ++i) {
-    if (propagate_down[i]) {
-      const int count = bottom[0]->count();
-      const int channels = bottom[0]->channels();
-      Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-      const bool legacy_version =
-          this->layer_param_.contrastive_loss_param().legacy_version();
-      const Dtype sign = (i == 0) ? 1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
-          static_cast<Dtype>(bottom[0]->num());
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      CLLBackward(count, channels, margin, legacy_version, alpha,
-          bottom[2]->gpu_data(),  // pair similarity 0 or 1
-          diff_.gpu_data(),  // the cached eltwise difference between a and b
-          dist_sq_.gpu_data(),  // the cached square distance between a and b
-          bottom[i]->mutable_gpu_diff());
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	for (int i = 0; i < 2; ++i) {
+		if (propagate_down[i]) {
+			const int count = bottom[0]->count();
+			const int channels = bottom[0]->channels();
+			Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+			const bool legacy_version =
+				this->layer_param_.contrastive_loss_param().legacy_version();
+			const Dtype sign = (i == 0) ? 1 : -1;
+			const Dtype alpha = sign * top[0]->cpu_diff()[0] /
+				static_cast<Dtype>(bottom[0]->num());
+			// NOLINT_NEXT_LINE(whitespace/operators)
+			CLLBackward(count, channels, margin, legacy_version, alpha,
+				bottom[2]->gpu_data(),  // pair similarity 0 or 1
+				diff_.gpu_data(),  // the cached eltwise difference between a and b
+				dist_sq_.gpu_data(),  // the cached square distance between a and b
+				bottom[i]->mutable_gpu_diff());
+		}
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(ContrastiveLossLayer);
 #endif
 
-INSTANTIATE_CLASS(ContrastiveLossLayer);
-REGISTER_LAYER_CLASS(ContrastiveLoss);
+INSTANTIATE_CLASS (ContrastiveLossLayer);
+REGISTER_LAYER_CLASS (ContrastiveLoss);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index c829dbd7..d5ffdb9f 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -7,232 +7,236 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ConvolutionLayer<Dtype>::compute_output_shape() {
-  this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
-      / this->stride_h_ + 1;
-  this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
-      / this->stride_w_ + 1;
+	this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
+		/ this->stride_h_ + 1;
+	this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
+		/ this->stride_w_ + 1;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->cpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->cpu_data();
-    Dtype* top_data = top[i]->mutable_cpu_data();
-    for (int n = 0; n < this->num_; ++n) {
-      this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->cpu_data();
-        this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
-      }
-    }
-  }
-
- // CHECK_BLOB_DATA(top[0],20, "top[0]");
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* weight = this->blobs_[0]->cpu_data();
+	for (int i = 0; i < bottom.size(); ++i) {
+		const Dtype* bottom_data = bottom[i]->cpu_data();
+		Dtype* top_data = top[i]->mutable_cpu_data();
+		for (int n = 0; n < this->num_; ++n) {
+			this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
+				top_data + top[i]->offset(n));
+			if (this->bias_term_) {
+				const Dtype* bias = this->blobs_[1]->cpu_data();
+				this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
+			}
+		}
+	}
+
+	// CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->cpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->cpu_diff();
-    const Dtype* bottom_data = bottom[i]->cpu_data();
-    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      for (int n = 0; n < this->num_; ++n) {
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n),
-              top_diff + top[i]->offset(n), weight_diff);
-        }
-        // gradient w.r.t. bottom data, if necessary.
-        if (propagate_down[i]) {
-          this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
-        }
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* weight = this->blobs_[0]->cpu_data();
+	Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
+	for (int i = 0; i < top.size(); ++i) {
+		const Dtype* top_diff = top[i]->cpu_diff();
+		const Dtype* bottom_data = bottom[i]->cpu_data();
+		Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+		// Bias gradient, if necessary.
+		if (this->bias_term_ && this->param_propagate_down_[1]) {
+			Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
+			for (int n = 0; n < this->num_; ++n) {
+				this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
+			}
+		}
+		if (this->param_propagate_down_[0] || propagate_down[i]) {
+			for (int n = 0; n < this->num_; ++n) {
+				// gradient w.r.t. weight. Note that we will accumulate diffs.
+				if (this->param_propagate_down_[0]) {
+					this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n),
+						top_diff + top[i]->offset(n), weight_diff);
+				}
+				// gradient w.r.t. bottom data, if necessary.
+				if (propagate_down[i]) {
+					this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight,
+						bottom_diff + bottom[i]->offset(n));
+				}
+			}
+		}
+	}
 
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const  vector<Blob<Dtype>*>& top) {
-  if (use_packing_scheme && global_packing_N >1)
-   Forward_gpu_opt2(bottom, top);
-  else
-   Forward_gpu_org(bottom, top);
+	const vector<Blob<Dtype>*>& top) {
+	if (use_packing_scheme && global_packing_N > 1)
+		Forward_gpu_opt2(bottom, top);
+	else
+		Forward_gpu_org(bottom, top);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-    if (use_packing_scheme && global_packing_N >1)
-      Backward_gpu_opt2(top, propagate_down, bottom);
-    else
-      Backward_gpu_org(top, propagate_down, bottom);
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (use_packing_scheme && global_packing_N > 1)
+		Backward_gpu_opt2(top, propagate_down, bottom);
+	else
+		Backward_gpu_org(top, propagate_down, bottom);
 }
 
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-     //CHECK_BLOB_DATA(bottom[i],10,"bottom");
-
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    this->opt_num2 = global_packing_N;
-    this->weight_offset_ = this->M_ * this->K_;
-    for (int n = 0; n < this->num_; n += this->opt_num2) {
-      this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
-       //intermediate variables to pass offset
-      this->top_offset_opt = this->M_ * this->N_ * this->opt_num2;
-      this->top_offset_ = top[i]->offset(n);
-      this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
-      this->bottom_offset_ = bottom[i]->offset(n);
-      this->forward_gpu_gemm_opt(bottom_data, weight,
-            top_data);
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->gpu_data();
-          this->forward_gpu_bias_opt(top_data, bias);
-      }
-   }
-  }
-
-  //CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
-  //CHECK_BLOB_DATA(top[0],20, "top[0]");
+template<typename Dtype>
+void ConvolutionLayer<Dtype>::Forward_gpu_opt2(
+	const vector<Blob<Dtype>*>& bottom,
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* weight = this->blobs_[0]->gpu_data();
+	for (int i = 0; i < bottom.size(); ++i) {
+		const Dtype* bottom_data = bottom[i]->gpu_data();
+		//CHECK_BLOB_DATA(bottom[i],10,"bottom");
+
+		Dtype* top_data = top[i]->mutable_gpu_data();
+		this->opt_num2 = global_packing_N;
+		this->weight_offset_ = this->M_ * this->K_;
+		for (int n = 0; n < this->num_; n += this->opt_num2) {
+			this->opt_num2 =
+				this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2;
+			//intermediate variables to pass offset
+			this->top_offset_opt = this->M_ * this->N_ * this->opt_num2;
+			this->top_offset_ = top[i]->offset(n);
+			this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
+			this->bottom_offset_ = bottom[i]->offset(n);
+			this->forward_gpu_gemm_opt(bottom_data, weight,
+				top_data);
+			if (this->bias_term_) {
+				const Dtype* bias = this->blobs_[1]->gpu_data();
+				this->forward_gpu_bias_opt(top_data, bias);
+			}
+		}
+	}
+
+	//CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+	//CHECK_BLOB_DATA(top[0],20, "top[0]");
 
 }
 
-template <typename Dtype>
-void ConvolutionLayer<Dtype>::Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-     //CHECK_BLOB_DATA(bottom[i],10,"bottom");
-
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    for (int n = 0; n < this->num_; ++n) {
-       //two intermediate variables to pass offset
-       this->bottom_offset_ = bottom[i]->offset(n);
-       this->top_offset_ = top[i]->offset(n); 
-       this->forward_gpu_gemm(bottom_data, weight,
-            top_data);
-
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->gpu_data();
-          this->forward_gpu_bias(top_data, bias);
-      }
-    }
-  }
-
-  // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
-  //CHECK_BLOB_DATA(top[0],20, "top[0]");
+template<typename Dtype>
+void ConvolutionLayer<Dtype>::Forward_gpu_org(
+	const vector<Blob<Dtype>*>& bottom,
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* weight = this->blobs_[0]->gpu_data();
+	for (int i = 0; i < bottom.size(); ++i) {
+		const Dtype* bottom_data = bottom[i]->gpu_data();
+		//CHECK_BLOB_DATA(bottom[i],10,"bottom");
+
+		Dtype* top_data = top[i]->mutable_gpu_data();
+		for (int n = 0; n < this->num_; ++n) {
+			//two intermediate variables to pass offset
+			this->bottom_offset_ = bottom[i]->offset(n);
+			this->top_offset_ = top[i]->offset(n);
+			this->forward_gpu_gemm(bottom_data, weight,
+				top_data);
+
+			if (this->bias_term_) {
+				const Dtype* bias = this->blobs_[1]->gpu_data();
+				this->forward_gpu_bias(top_data, bias);
+			}
+		}
+	}
+
+	// CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+	//CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
-      for (int n = 0; n < this->num_; ++n) {
-        this->top_offset_ = top[i]->offset(n);
-        this->backward_gpu_bias(bias_diff, top_diff);
-      }
-     }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      const Dtype* bottom_data = bottom[i]->gpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      this->weight_offset_ = this->M_ * this->K_;
-      this->opt_num2 = global_packing_N;
-      for (int n = 0; n < this->num_; n += this->opt_num2) {
-        this->opt_num2 = this->opt_num2 > (this->num_ - n)? (this->num_ - n) : this->opt_num2;
-        this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
-        this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
-        this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm_opt(bottom_data,
-              top_diff, weight_diff);
-        }
-        // gradient w.r.t. bottom data, if necessary.
-        if (propagate_down[i]) {
-          this->backward_gpu_gemm_opt(top_diff, weight,
-              bottom_diff);
-        }
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* weight = this->blobs_[0]->gpu_data();
+	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+	for (int i = 0; i < top.size(); ++i) {
+		const Dtype* top_diff = top[i]->gpu_diff();
+
+		// Bias gradient, if necessary.
+		if (this->bias_term_ && this->param_propagate_down_[1]) {
+			Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+			ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
+			for (int n = 0; n < this->num_; ++n) {
+				this->top_offset_ = top[i]->offset(n);
+				this->backward_gpu_bias(bias_diff, top_diff);
+			}
+		}
+		if (this->param_propagate_down_[0] || propagate_down[i]) {
+			const Dtype* bottom_data = bottom[i]->gpu_data();
+			Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+			this->weight_offset_ = this->M_ * this->K_;
+			this->opt_num2 = global_packing_N;
+			for (int n = 0; n < this->num_; n += this->opt_num2) {
+				this->opt_num2 =
+					this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2;
+				this->top_offset_ = top[i]->offset(n);
+				this->bottom_offset_ = bottom[i]->offset(n);
+				this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
+				this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
+				// gradient w.r.t. weight. Note that we will accumulate diffs.
+				if (this->param_propagate_down_[0]) {
+					this->weight_gpu_gemm_opt(bottom_data,
+						top_diff, weight_diff);
+				}
+				// gradient w.r.t. bottom data, if necessary.
+				if (propagate_down[i]) {
+					this->backward_gpu_gemm_opt(top_diff, weight,
+						bottom_diff);
+				}
+			}
+		}
+	}
 
 }
-template <typename Dtype>
+template<typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-       //
-        this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
-        this->backward_gpu_bias(bias_diff, top_diff);
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      const Dtype* bottom_data = bottom[i]->gpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(bottom_data,
-              top_diff, weight_diff);
-        }
-        // gradient w.r.t. bottom data, if necessary.
-        if (propagate_down[i]) {
-          this->backward_gpu_gemm(top_diff, weight,
-              bottom_diff);
-        }
-      }
-    }
-  }
-  
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* weight = this->blobs_[0]->gpu_data();
+	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+	for (int i = 0; i < top.size(); ++i) {
+		const Dtype* top_diff = top[i]->gpu_diff();
+
+		// Bias gradient, if necessary.
+		if (this->bias_term_ && this->param_propagate_down_[1]) {
+			Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+			for (int n = 0; n < this->num_; ++n) {
+				//
+				this->top_offset_ = top[i]->offset(n);
+				this->bottom_offset_ = bottom[i]->offset(n);
+				this->backward_gpu_bias(bias_diff, top_diff);
+			}
+		}
+		if (this->param_propagate_down_[0] || propagate_down[i]) {
+			const Dtype* bottom_data = bottom[i]->gpu_data();
+			Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+			for (int n = 0; n < this->num_; ++n) {
+				this->top_offset_ = top[i]->offset(n);
+				this->bottom_offset_ = bottom[i]->offset(n);
+				// gradient w.r.t. weight. Note that we will accumulate diffs.
+				if (this->param_propagate_down_[0]) {
+					this->weight_gpu_gemm(bottom_data,
+						top_diff, weight_diff);
+				}
+				// gradient w.r.t. bottom data, if necessary.
+				if (propagate_down[i]) {
+					this->backward_gpu_gemm(top_diff, weight,
+						bottom_diff);
+				}
+			}
+		}
+	}
+
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(ConvolutionLayer);
 #endif
 
-INSTANTIATE_CLASS(ConvolutionLayer);
+INSTANTIATE_CLASS (ConvolutionLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index 8ac9b8ee..bff8b10c 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -16,113 +16,113 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 DataLayer<Dtype>::~DataLayer<Dtype>() {
-  this->JoinPrefetchThread();
+	this->JoinPrefetchThread();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  // Initialize DB
-  db_.reset(db::GetDB(this->layer_param_.data_param().backend()));
-  db_->Open(this->layer_param_.data_param().source(), db::READ);
-  cursor_.reset(db_->NewCursor());
+	const vector<Blob<Dtype>*>& top) {
+	// Initialize DB
+	db_.reset(db::GetDB(this->layer_param_.data_param().backend()));
+	db_->Open(this->layer_param_.data_param().source(), db::READ);
+	cursor_.reset(db_->NewCursor());
 
-  // Check if we should randomly skip a few data points
-  if (this->layer_param_.data_param().rand_skip()) {
-    unsigned int skip = caffe_rng_rand() %
-                        this->layer_param_.data_param().rand_skip();
-    LOG(INFO) << "Skipping first " << skip << " data points.";
-    while (skip-- > 0) {
-      cursor_->Next();
-    }
-  }
-  // Read a data point, to initialize the prefetch and top blobs.
-  Datum datum;
-  datum.ParseFromString(cursor_->value());
-  // Use data_transformer to infer the expected blob shape from datum.
-  vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
-  this->transformed_data_.Reshape(top_shape);
-  // Reshape top[0] and prefetch_data according to the batch_size.
-  top_shape[0] = this->layer_param_.data_param().batch_size();
-  this->prefetch_data_.Reshape(top_shape);
-  top[0]->ReshapeLike(this->prefetch_data_);
-  this->prefetch_data_.set_data_layer();
+	// Check if we should randomly skip a few data points
+	if (this->layer_param_.data_param().rand_skip()) {
+		unsigned int skip = caffe_rng_rand() %
+			this->layer_param_.data_param().rand_skip();
+		LOG(INFO) << "Skipping first " << skip << " data points.";
+		while (skip-- > 0) {
+			cursor_->Next();
+		}
+	}
+	// Read a data point, to initialize the prefetch and top blobs.
+	Datum datum;
+	datum.ParseFromString(cursor_->value());
+	// Use data_transformer to infer the expected blob shape from datum.
+	vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
+	this->transformed_data_.Reshape(top_shape);
+	// Reshape top[0] and prefetch_data according to the batch_size.
+	top_shape[0] = this->layer_param_.data_param().batch_size();
+	this->prefetch_data_.Reshape(top_shape);
+	top[0]->ReshapeLike(this->prefetch_data_);
+	this->prefetch_data_.set_data_layer();
 
-  LOG(INFO) << "output data size: " << top[0]->num() << ","
-      << top[0]->channels() << "," << top[0]->height() << ","
-      << top[0]->width();
-  // label
-  if (this->output_labels_) {
-    vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
-    top[1]->Reshape(label_shape);
-    this->prefetch_label_.Reshape(label_shape);
-    this->prefetch_label_.set_data_layer();
-  }
+	LOG(INFO) << "output data size: " << top[0]->num() << ","
+		<< top[0]->channels() << "," << top[0]->height() << ","
+		<< top[0]->width();
+	// label
+	if (this->output_labels_) {
+		vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
+		top[1]->Reshape(label_shape);
+		this->prefetch_label_.Reshape(label_shape);
+		this->prefetch_label_.set_data_layer();
+	}
 }
 
 // This function is used to create a thread that prefetches the data.
-template <typename Dtype>
+template<typename Dtype>
 void DataLayer<Dtype>::InternalThreadEntry() {
-  CPUTimer batch_timer;
-  batch_timer.Start();
-  double read_time = 0;
-  double trans_time = 0;
-  CPUTimer timer;
-  CHECK(this->prefetch_data_.count());
-  CHECK(this->transformed_data_.count());
+	CPUTimer batch_timer;
+	batch_timer.Start();
+	double read_time = 0;
+	double trans_time = 0;
+	CPUTimer timer;
+	CHECK(this->prefetch_data_.count());
+	CHECK(this->transformed_data_.count());
 
-  // Reshape according to the first datum of each batch
-  // on single input batches allows for inputs of varying dimension.
-  const int batch_size = this->layer_param_.data_param().batch_size();
-  Datum datum;
-  datum.ParseFromString(cursor_->value());
-  // Use data_transformer to infer the expected blob shape from datum.
-  vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
-  this->transformed_data_.Reshape(top_shape);
-  // Reshape prefetch_data according to the batch_size.
-  top_shape[0] = batch_size;
-  this->prefetch_data_.Reshape(top_shape);
+	// Reshape according to the first datum of each batch
+	// on single input batches allows for inputs of varying dimension.
+	const int batch_size = this->layer_param_.data_param().batch_size();
+	Datum datum;
+	datum.ParseFromString(cursor_->value());
+	// Use data_transformer to infer the expected blob shape from datum.
+	vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
+	this->transformed_data_.Reshape(top_shape);
+	// Reshape prefetch_data according to the batch_size.
+	top_shape[0] = batch_size;
+	this->prefetch_data_.Reshape(top_shape);
 
-  Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
-  Dtype* top_label = NULL;  // suppress warnings about uninitialized variables
+	Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
+	Dtype* top_label = NULL;  // suppress warnings about uninitialized variables
 
-  if (this->output_labels_) {
-    top_label = this->prefetch_label_.mutable_cpu_data();
-  }
-  timer.Start();
-  for (int item_id = 0; item_id < batch_size; ++item_id) {
-    // get a datum
-    Datum datum;
-    datum.ParseFromString(cursor_->value());
-    read_time += timer.MicroSeconds();
-    timer.Start();
-    // Apply data transformations (mirror, scale, crop...)
-    int offset = this->prefetch_data_.offset(item_id);
-    this->transformed_data_.set_cpu_data(top_data + offset);
-    this->data_transformer_->Transform(datum, &(this->transformed_data_));
-    // Copy label.
-    if (this->output_labels_) {
-      top_label[item_id] = datum.label();
-    }
-    trans_time += timer.MicroSeconds();
-    timer.Start();
-    // go to the next item.
-    cursor_->Next();
-    if (!cursor_->valid()) {
-      DLOG(INFO) << "Restarting data prefetching from start.";
-      cursor_->SeekToFirst();
-    }
-  }
-  timer.Stop();
-  batch_timer.Stop();
-  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
-  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
-  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
+	if (this->output_labels_) {
+		top_label = this->prefetch_label_.mutable_cpu_data();
+	}
+	timer.Start();
+	for (int item_id = 0; item_id < batch_size; ++item_id) {
+		// get a datum
+		Datum datum;
+		datum.ParseFromString(cursor_->value());
+		read_time += timer.MicroSeconds();
+		timer.Start();
+		// Apply data transformations (mirror, scale, crop...)
+		int offset = this->prefetch_data_.offset(item_id);
+		this->transformed_data_.set_cpu_data(top_data + offset);
+		this->data_transformer_->Transform(datum, &(this->transformed_data_));
+		// Copy label.
+		if (this->output_labels_) {
+			top_label[item_id] = datum.label();
+		}
+		trans_time += timer.MicroSeconds();
+		timer.Start();
+		// go to the next item.
+		cursor_->Next();
+		if (!cursor_->valid()) {
+			DLOG(INFO) << "Restarting data prefetching from start.";
+			cursor_->SeekToFirst();
+		}
+	}
+	timer.Stop();
+	batch_timer.Stop();
+	DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
+	DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
+	DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
 }
 
-INSTANTIATE_CLASS(DataLayer);
-REGISTER_LAYER_CLASS(Data);
+INSTANTIATE_CLASS (DataLayer);
+REGISTER_LAYER_CLASS (Data);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index 4b952c73..aa61a755 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -8,129 +8,128 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void DeconvolutionLayer<Dtype>::compute_output_shape() {
-  this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_
-      - 2 * this->pad_h_;
-  this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_
-      - 2 * this->pad_w_;
+	this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_
+		- 2 * this->pad_h_;
+	this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_
+		- 2 * this->pad_w_;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->cpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->cpu_data();
-    Dtype* top_data = top[i]->mutable_cpu_data();
-    for (int n = 0; n < this->num_; ++n) {
-      this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->cpu_data();
-        this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
-      }
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* weight = this->blobs_[0]->cpu_data();
+	for (int i = 0; i < bottom.size(); ++i) {
+		const Dtype* bottom_data = bottom[i]->cpu_data();
+		Dtype* top_data = top[i]->mutable_cpu_data();
+		for (int n = 0; n < this->num_; ++n) {
+			this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
+				top_data + top[i]->offset(n));
+			if (this->bias_term_) {
+				const Dtype* bias = this->blobs_[1]->cpu_data();
+				this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->cpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->cpu_diff();
-    const Dtype* bottom_data = bottom[i]->cpu_data();
-    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      for (int n = 0; n < this->num_; ++n) {
-        // Gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_cpu_gemm(top_diff + top[i]->offset(n),
-              bottom_data + bottom[i]->offset(n), weight_diff);
-        }
-        // Gradient w.r.t. bottom data, if necessary, reusing the column buffer
-        // we might have just computed above.
-        if (propagate_down[i]) {
-          this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n),
-              this->param_propagate_down_[0]);
-        }
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* weight = this->blobs_[0]->cpu_data();
+	Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
+	for (int i = 0; i < top.size(); ++i) {
+		const Dtype* top_diff = top[i]->cpu_diff();
+		const Dtype* bottom_data = bottom[i]->cpu_data();
+		Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+		// Bias gradient, if necessary.
+		if (this->bias_term_ && this->param_propagate_down_[1]) {
+			Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
+			for (int n = 0; n < this->num_; ++n) {
+				this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
+			}
+		}
+		if (this->param_propagate_down_[0] || propagate_down[i]) {
+			for (int n = 0; n < this->num_; ++n) {
+				// Gradient w.r.t. weight. Note that we will accumulate diffs.
+				if (this->param_propagate_down_[0]) {
+					this->weight_cpu_gemm(top_diff + top[i]->offset(n),
+						bottom_data + bottom[i]->offset(n), weight_diff);
+				}
+				// Gradient w.r.t. bottom data, if necessary, reusing the column buffer
+				// we might have just computed above.
+				if (propagate_down[i]) {
+					this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight,
+						bottom_diff + bottom[i]->offset(n),
+						this->param_propagate_down_[0]);
+				}
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  for (int i = 0; i < bottom.size(); ++i) {
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* top_data = top[i]->mutable_gpu_data();
-    for (int n = 0; n < this->num_; ++n) {
-      this->bottom_offset_ = bottom[i]->offset(n);
-      this->top_offset_ = top[i]->offset(n);
-      this->backward_gpu_gemm(bottom_data, weight, top_data);
-      if (this->bias_term_) {
-        const Dtype* bias = this->blobs_[1]->gpu_data();
-        this->forward_gpu_bias(top_data, bias);
-      }
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* weight = this->blobs_[0]->gpu_data();
+	for (int i = 0; i < bottom.size(); ++i) {
+		const Dtype* bottom_data = bottom[i]->gpu_data();
+		Dtype* top_data = top[i]->mutable_gpu_data();
+		for (int n = 0; n < this->num_; ++n) {
+			this->bottom_offset_ = bottom[i]->offset(n);
+			this->top_offset_ = top[i]->offset(n);
+			this->backward_gpu_gemm(bottom_data, weight, top_data);
+			if (this->bias_term_) {
+				const Dtype* bias = this->blobs_[1]->gpu_data();
+				this->forward_gpu_bias(top_data, bias);
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    const Dtype* bottom_data = bottom[i]->gpu_data();
-    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-    // Bias gradient, if necessary.
-    if (this->bias_term_ && this->param_propagate_down_[1]) {
-      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      for (int n = 0; n < this->num_; ++n) {
-        this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
-        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
-      }
-    }
-    if (this->param_propagate_down_[0] || propagate_down[i]) {
-      for (int n = 0; n < this->num_; ++n) {
-        this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
-        // gradient w.r.t. weight. Note that we will accumulate diffs.
-        if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(top_diff + top[i]->offset(n),
-              bottom_data + bottom[i]->offset(n), weight_diff);
-        }
-        // gradient w.r.t. bottom data, if necessary.
-        if (propagate_down[i]) {
-          this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
-        }
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* weight = this->blobs_[0]->gpu_data();
+	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+	for (int i = 0; i < top.size(); ++i) {
+		const Dtype* top_diff = top[i]->gpu_diff();
+		const Dtype* bottom_data = bottom[i]->gpu_data();
+		Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+		// Bias gradient, if necessary.
+		if (this->bias_term_ && this->param_propagate_down_[1]) {
+			Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+			for (int n = 0; n < this->num_; ++n) {
+				this->top_offset_ = top[i]->offset(n);
+				this->bottom_offset_ = bottom[i]->offset(n);
+				this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
+			}
+		}
+		if (this->param_propagate_down_[0] || propagate_down[i]) {
+			for (int n = 0; n < this->num_; ++n) {
+				this->top_offset_ = top[i]->offset(n);
+				this->bottom_offset_ = bottom[i]->offset(n);
+				// gradient w.r.t. weight. Note that we will accumulate diffs.
+				if (this->param_propagate_down_[0]) {
+					this->weight_gpu_gemm(top_diff + top[i]->offset(n),
+						bottom_data + bottom[i]->offset(n), weight_diff);
+				}
+				// gradient w.r.t. bottom data, if necessary.
+				if (propagate_down[i]) {
+					this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
+						bottom_diff + bottom[i]->offset(n));
+				}
+			}
+		}
+	}
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU(DeconvolutionLayer);
 #endif
 
-INSTANTIATE_CLASS(DeconvolutionLayer);
-REGISTER_LAYER_CLASS(Deconvolution);
+INSTANTIATE_CLASS (DeconvolutionLayer);
+REGISTER_LAYER_CLASS (Deconvolution);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 4175a2b7..ae045c5c 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -10,122 +10,122 @@
 
 namespace caffe {
 
-template <typename Dtype>
-void DropoutLayer<Dtype>::ocl_setup(int bottom_count){
-    MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, bottom_count*sizeof(int), NULL, NULL);
+template<typename Dtype>
+void DropoutLayer<Dtype>::ocl_setup(int bottom_count) {
+	MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		bottom_count * sizeof(int), NULL, NULL);
 }
 
-template <typename Dtype>
-DropoutLayer<Dtype>::~DropoutLayer(){
-   OCL_CHECK( clReleaseMemObject(MaskMem) );
-}
-
-
-template <typename Dtype>
+template<typename Dtype>
+DropoutLayer<Dtype>::~DropoutLayer() {
+	OCL_CHECK (clReleaseMemObject(MaskMem) );
+	}
+template<typename Dtype>
 void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
-  threshold_ = this->layer_param_.dropout_param().dropout_ratio();
-  DCHECK(threshold_ > 0.);
-  DCHECK(threshold_ < 1.);
-  scale_ = 1. / (1. - threshold_);
-  uint_thres_ = static_cast<unsigned int>(UINT_MAX * threshold_);
-  ocl_setup(bottom[0]->count());
+	const vector<Blob<Dtype>*>& top) {
+	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+	threshold_ = this->layer_param_.dropout_param().dropout_ratio();
+	DCHECK(threshold_ > 0.);
+	DCHECK(threshold_ < 1.);
+	scale_ = 1. / (1. - threshold_);
+	uint_thres_ = static_cast<unsigned int>(UINT_MAX * threshold_);
+	ocl_setup(bottom[0]->count());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DropoutLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  NeuronLayer<Dtype>::Reshape(bottom, top);
-  // Set up the cache for random number generation
-  rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-      bottom[0]->height(), bottom[0]->width());
+	const vector<Blob<Dtype>*>& top) {
+	NeuronLayer < Dtype > ::Reshape(bottom, top);
+	// Set up the cache for random number generation
+	rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+		bottom[0]->height(), bottom[0]->width());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  unsigned int* mask = rand_vec_.mutable_cpu_data();
-  const int count = bottom[0]->count();
-  if (this->phase_ == TRAIN) {
-    // Create random numbers
-    caffe_rng_bernoulli(count, 1. - threshold_, mask);
-    for (int i = 0; i < count; ++i) {
-      top_data[i] = bottom_data[i] * mask[i] * scale_;
-    }
-  } else {
-    caffe_copy(bottom[0]->count(), bottom_data, top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	unsigned int* mask = rand_vec_.mutable_cpu_data();
+	const int count = bottom[0]->count();
+	if (this->phase_ == TRAIN) {
+		// Create random numbers
+		caffe_rng_bernoulli(count, 1. - threshold_, mask);
+		for (int i = 0; i < count; ++i) {
+			top_data[i] = bottom_data[i] * mask[i] * scale_;
+		}
+	} else {
+		caffe_copy(bottom[0]->count(), bottom_data, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->cpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    if (this->phase_ == TRAIN) {
-      const unsigned int* mask = rand_vec_.cpu_data();
-      const int count = bottom[0]->count();
-      for (int i = 0; i < count; ++i) {
-        bottom_diff[i] = top_diff[i] * mask[i] * scale_;
-      }
-    } else {
-      caffe_copy(top[0]->count(), top_diff, bottom_diff);
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* top_diff = top[0]->cpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		if (this->phase_ == TRAIN) {
+			const unsigned int* mask = rand_vec_.cpu_data();
+			const int count = bottom[0]->count();
+			for (int i = 0; i < count; ++i) {
+				bottom_diff[i] = top_diff[i] * mask[i] * scale_;
+			}
+		} else {
+			caffe_copy(top[0]->count(), top_diff, bottom_diff);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  if (this->phase_ == TRAIN) {
-    //unsigned int* mask =
-      //  static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const int count = bottom[0]->count();
+	if (this->phase_ == TRAIN) {
+		//unsigned int* mask =
+		//  static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
 #ifdef use_cpu_generator_dropout 
-    unsigned int* mask_cpu =
-        static_cast<unsigned int*>(rand_vec_.mutable_cpu_data()); 
-    caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu);
-    OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
-    DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+		unsigned int* mask_cpu =
+		static_cast<unsigned int*>(rand_vec_.mutable_cpu_data());
+		caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu);
+		OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
+		DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #else
-     caffe_gpu_bernoulli((int*)MaskMem, count, (Dtype)0., (Dtype)1., threshold_);
-     DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+		caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1.,
+			threshold_);
+		DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_,
+			top_data);
 #endif
-  } else {
-    caffe_gpu_copy(count, bottom_data, top_data);
-  }
+	} else {
+		caffe_gpu_copy(count, bottom_data, top_data);
+	}
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    if (this->phase_ == TRAIN) {
-      const int count = bottom[0]->count();
-       DropoutBackward(count, top_diff, (int*)MaskMem, uint_thres_ , (Dtype)scale_, bottom_diff);
-    } else {
-      caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* top_diff = top[0]->gpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		if (this->phase_ == TRAIN) {
+			const int count = bottom[0]->count();
+			DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_,
+				(Dtype) scale_, bottom_diff);
+		} else {
+			caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
+		}
+	}
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU(DropoutLayer);
 #endif
 
-INSTANTIATE_CLASS(DropoutLayer);
-REGISTER_LAYER_CLASS(Dropout);
+INSTANTIATE_CLASS (DropoutLayer);
+REGISTER_LAYER_CLASS (Dropout);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp
index 6b0d6174..8a3fe17e 100644
--- a/src/caffe/layers/dummy_data_layer.cpp
+++ b/src/caffe/layers/dummy_data_layer.cpp
@@ -6,110 +6,110 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void DummyDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int num_top = top.size();
-  const DummyDataParameter& param = this->layer_param_.dummy_data_param();
-  const int num_data_filler = param.data_filler_size();
-  CHECK(num_data_filler == 0 || num_data_filler == 1 ||
-        num_data_filler == num_top)
-      << "Number of data fillers must be 0, 1 or equal to the number of tops: "
-      << num_top << "; you specified " << num_data_filler << " data fillers.";
+	const vector<Blob<Dtype>*>& top) {
+	const int num_top = top.size();
+	const DummyDataParameter& param = this->layer_param_.dummy_data_param();
+	const int num_data_filler = param.data_filler_size();
+	CHECK(num_data_filler == 0 || num_data_filler == 1 ||
+		num_data_filler == num_top)
+		<< "Number of data fillers must be 0, 1 or equal to the number of tops: "
+		<< num_top << "; you specified " << num_data_filler << " data fillers.";
 
-  const bool legacy_dims = param.num_size() || param.channels_size() ||
-                           param.height_size() || param.width_size();
-  if (legacy_dims) {
-    CHECK_EQ(0, param.shape_size())
-        << "Both shape and legacy fields were specified";
-    // Using deprecated 4D output dim specifiers.
-    CHECK(param.num_size() == 1 || param.num_size() == num_top)
-        << "Must specify 'num' once, or once per top blob "
-        << "(" << num_top << "); specified " << param.num_size() << ".";
-    CHECK(param.channels_size() == 1 || param.channels_size() == num_top)
-        << "Must specify 'channels' once, or once per top blob "
-        << "(" << num_top << "); specified " << param.channels_size() << ".";
-    CHECK(param.height_size() == 1 || param.height_size() == num_top)
-        << "Must specify 'height' once, or once per top blob "
-        << "(" << num_top << "); specified " << param.height_size() << ".";
-    CHECK(param.width_size() == 1 || param.width_size() == num_top)
-        << "Must specify 'width' once, or once per top blob "
-        << "(" << num_top << "); specified " << param.width_size() << ".";
-  } else {
-    CHECK(param.shape_size() == 1 || param.shape_size() == num_top)
-        << "Must specify 'shape' once, or once per top blob "
-        << "(" << num_top << "); specified " << param.shape_size() << ".";
-  }
-  // refill_[i] tells Forward i whether or not to actually refill top Blob i.
-  // If refill_[i] is false, Forward does nothing for Blob i. We use this to
-  // avoid wastefully refilling "constant" Blobs in every forward pass.
-  // We first fill refill_ in with the INVERSE of its final values.
-  // The first time we run Forward from the LayerSetUp method, we'll fill only
-  // Blobs for which refill_ is normally false.  These Blobs will never be
-  // filled again.
-  refill_.clear();
-  fillers_.clear();
-  if (num_data_filler <= 1) {
-    FillerParameter filler_param;
-    if (num_data_filler == 0) {
-      filler_param.set_type("constant");
-      filler_param.set_value(0);
-    } else {
-      filler_param.CopyFrom(param.data_filler(0));
-    }
-    // Refill on each iteration iff not using a constant filler,
-    // but use the inverse of this rule for the first run.
-    refill_.resize(1);
-    refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0);
-    fillers_.resize(1);
-    fillers_[0].reset(GetFiller<Dtype>(filler_param));
-  } else {
-    refill_.resize(num_top);
-    fillers_.resize(num_top);
-    for (int i = 0; i < num_top; ++i) {
-      fillers_[i].reset(GetFiller<Dtype>(param.data_filler(i)));
-      // Refill on each iteration iff not using a constant filler,
-      // but use the inverse of this rule for the first run.
-      refill_[i] =
-          (strcmp(param.data_filler(i).type().c_str(), "constant") == 0);
-    }
-  }
-  for (int i = 0; i < num_top; ++i) {
-    if (legacy_dims) {
-      const int num = (param.num_size() == 1) ? param.num(0) : param.num(i);
-      const int channels =
-          (param.channels_size() == 1) ? param.channels(0) : param.channels(i);
-      const int height =
-          (param.height_size() == 1) ? param.height(0) : param.height(i);
-      const int width =
-          (param.width_size() == 1) ? param.width(0) : param.width(i);
-      top[i]->Reshape(num, channels, height, width);
-    } else {
-      const int shape_index = (param.shape_size() == 1) ? 0 : i;
-      top[i]->Reshape(param.shape(shape_index));
-    }
-  }
-  // Run Forward once, with refill_ inverted, to fill the constant Blobs.
-  this->Forward(bottom, top);
-  // Invert the inverted refill_ values to refill the desired (non-constant)
-  // Blobs in every usual forward pass.
-  for (int i = 0; i < refill_.size(); ++i) {
-    refill_[i] = !refill_[i];
-  }
+	const bool legacy_dims = param.num_size() || param.channels_size() ||
+		param.height_size() || param.width_size();
+	if (legacy_dims) {
+		CHECK_EQ(0, param.shape_size())
+			<< "Both shape and legacy fields were specified";
+		// Using deprecated 4D output dim specifiers.
+		CHECK(param.num_size() == 1 || param.num_size() == num_top)
+			<< "Must specify 'num' once, or once per top blob "
+			<< "(" << num_top << "); specified " << param.num_size() << ".";
+		CHECK(param.channels_size() == 1 || param.channels_size() == num_top)
+			<< "Must specify 'channels' once, or once per top blob "
+			<< "(" << num_top << "); specified " << param.channels_size() << ".";
+		CHECK(param.height_size() == 1 || param.height_size() == num_top)
+			<< "Must specify 'height' once, or once per top blob "
+			<< "(" << num_top << "); specified " << param.height_size() << ".";
+		CHECK(param.width_size() == 1 || param.width_size() == num_top)
+			<< "Must specify 'width' once, or once per top blob "
+			<< "(" << num_top << "); specified " << param.width_size() << ".";
+	} else {
+		CHECK(param.shape_size() == 1 || param.shape_size() == num_top)
+			<< "Must specify 'shape' once, or once per top blob "
+			<< "(" << num_top << "); specified " << param.shape_size() << ".";
+	}
+	// refill_[i] tells Forward i whether or not to actually refill top Blob i.
+	// If refill_[i] is false, Forward does nothing for Blob i. We use this to
+	// avoid wastefully refilling "constant" Blobs in every forward pass.
+	// We first fill refill_ in with the INVERSE of its final values.
+	// The first time we run Forward from the LayerSetUp method, we'll fill only
+	// Blobs for which refill_ is normally false.  These Blobs will never be
+	// filled again.
+	refill_.clear();
+	fillers_.clear();
+	if (num_data_filler <= 1) {
+		FillerParameter filler_param;
+		if (num_data_filler == 0) {
+			filler_param.set_type("constant");
+			filler_param.set_value(0);
+		} else {
+			filler_param.CopyFrom(param.data_filler(0));
+		}
+		// Refill on each iteration iff not using a constant filler,
+		// but use the inverse of this rule for the first run.
+		refill_.resize(1);
+		refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0);
+		fillers_.resize(1);
+		fillers_[0].reset(GetFiller < Dtype > (filler_param));
+	} else {
+		refill_.resize(num_top);
+		fillers_.resize(num_top);
+		for (int i = 0; i < num_top; ++i) {
+			fillers_[i].reset(GetFiller < Dtype > (param.data_filler(i)));
+			// Refill on each iteration iff not using a constant filler,
+			// but use the inverse of this rule for the first run.
+			refill_[i] =
+				(strcmp(param.data_filler(i).type().c_str(), "constant") == 0);
+		}
+	}
+	for (int i = 0; i < num_top; ++i) {
+		if (legacy_dims) {
+			const int num = (param.num_size() == 1) ? param.num(0) : param.num(i);
+			const int channels =
+				(param.channels_size() == 1) ? param.channels(0) : param.channels(i);
+			const int height =
+				(param.height_size() == 1) ? param.height(0) : param.height(i);
+			const int width =
+				(param.width_size() == 1) ? param.width(0) : param.width(i);
+			top[i]->Reshape(num, channels, height, width);
+		} else {
+			const int shape_index = (param.shape_size() == 1) ? 0 : i;
+			top[i]->Reshape(param.shape(shape_index));
+		}
+	}
+	// Run Forward once, with refill_ inverted, to fill the constant Blobs.
+	this->Forward(bottom, top);
+	// Invert the inverted refill_ values to refill the desired (non-constant)
+	// Blobs in every usual forward pass.
+	for (int i = 0; i < refill_.size(); ++i) {
+		refill_[i] = !refill_[i];
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void DummyDataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  for (int i = 0; i < top.size(); ++i) {
-    const int filler_id = (fillers_.size() > 1) ? i : 0;
-    if (refill_[filler_id]) {
-      fillers_[filler_id]->Fill(top[i]);
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	for (int i = 0; i < top.size(); ++i) {
+		const int filler_id = (fillers_.size() > 1) ? i : 0;
+		if (refill_[filler_id]) {
+			fillers_[filler_id]->Fill(top[i]);
+		}
+	}
 }
 
-INSTANTIATE_CLASS(DummyDataLayer);
-REGISTER_LAYER_CLASS(DummyData);
+INSTANTIATE_CLASS (DummyDataLayer);
+REGISTER_LAYER_CLASS (DummyData);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index 5a7e5e74..45126d44 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -7,239 +7,244 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void EltwiseLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK(this->layer_param().eltwise_param().coeff_size() == 0
-      || this->layer_param().eltwise_param().coeff_size() == bottom.size()) <<
-      "Eltwise Layer takes one coefficient per bottom blob.";
-  CHECK(!(this->layer_param().eltwise_param().operation()
-      == EltwiseParameter_EltwiseOp_PROD
-      && this->layer_param().eltwise_param().coeff_size())) <<
-      "Eltwise layer only takes coefficients for summation.";
-  op_ = this->layer_param_.eltwise_param().operation();
-  // Blob-wise coefficients for the elementwise operation.
-  coeffs_ = vector<Dtype>(bottom.size(), 1);
-  if (this->layer_param().eltwise_param().coeff_size()) {
-    for (int i = 0; i < bottom.size(); ++i) {
-      coeffs_[i] = this->layer_param().eltwise_param().coeff(i);
-    }
-  }
-  stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad();
+	const vector<Blob<Dtype>*>& top) {
+	CHECK(this->layer_param().eltwise_param().coeff_size() == 0
+		|| this->layer_param().eltwise_param().coeff_size() == bottom.size()) <<
+		"Eltwise Layer takes one coefficient per bottom blob.";
+	CHECK(!(this->layer_param().eltwise_param().operation()
+		== EltwiseParameter_EltwiseOp_PROD
+		&& this->layer_param().eltwise_param().coeff_size())) <<
+		"Eltwise layer only takes coefficients for summation.";
+	op_ = this->layer_param_.eltwise_param().operation();
+	// Blob-wise coefficients for the elementwise operation.
+	coeffs_ = vector < Dtype > (bottom.size(), 1);
+	if (this->layer_param().eltwise_param().coeff_size()) {
+		for (int i = 0; i < bottom.size(); ++i) {
+			coeffs_[i] = this->layer_param().eltwise_param().coeff(i);
+		}
+	}
+	stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EltwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  for (int i = 1; i < bottom.size(); ++i) {
-    CHECK(bottom[i]->shape() == bottom[0]->shape());
-  }
-  top[0]->ReshapeLike(*bottom[0]);
-  // If max operation, we will initialize the vector index part.
-  if (this->layer_param_.eltwise_param().operation() ==
-      EltwiseParameter_EltwiseOp_MAX && top.size() == 1) {
-    max_idx_.Reshape(bottom[0]->shape());
-  }
+	const vector<Blob<Dtype>*>& top) {
+	for (int i = 1; i < bottom.size(); ++i) {
+		CHECK(bottom[i]->shape() == bottom[0]->shape());
+	}
+	top[0]->ReshapeLike(*bottom[0]);
+	// If max operation, we will initialize the vector index part.
+	if (this->layer_param_.eltwise_param().operation() ==
+		EltwiseParameter_EltwiseOp_MAX && top.size() == 1) {
+		max_idx_.Reshape(bottom[0]->shape());
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EltwiseLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  int* mask = NULL;
-  const Dtype* bottom_data_a = NULL;
-  const Dtype* bottom_data_b = NULL;
-  const int count = top[0]->count();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  switch (op_) {
-  case EltwiseParameter_EltwiseOp_PROD:
-    caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data);
-    for (int i = 2; i < bottom.size(); ++i) {
-      caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data);
-    }
-    break;
-  case EltwiseParameter_EltwiseOp_SUM:
-    caffe_set(count, Dtype(0), top_data);
-    // TODO(shelhamer) does BLAS optimize to sum for coeff = 1?
-    for (int i = 0; i < bottom.size(); ++i) {
-      caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data);
-    }
-    break;
-  case EltwiseParameter_EltwiseOp_MAX:
-    // Initialize
-    mask = max_idx_.mutable_cpu_data();
-    caffe_set(count, -1, mask);
-    caffe_set(count, Dtype(-FLT_MAX), top_data);
-    // bottom 0 & 1
-    bottom_data_a = bottom[0]->cpu_data();
-    bottom_data_b = bottom[1]->cpu_data();
-    for (int idx = 0; idx < count; ++idx) {
-      if (bottom_data_a[idx] > bottom_data_b[idx]) {
-        top_data[idx] = bottom_data_a[idx];  // maxval
-        mask[idx] = 0;  // maxid
-      } else {
-        top_data[idx] = bottom_data_b[idx];  // maxval
-        mask[idx] = 1;  // maxid
-      }
-    }
-    // bottom 2++
-    for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) {
-      bottom_data_b = bottom[blob_idx]->cpu_data();
-      for (int idx = 0; idx < count; ++idx) {
-        if (bottom_data_b[idx] > top_data[idx]) {
-          top_data[idx] = bottom_data_b[idx];  // maxval
-          mask[idx] = blob_idx;  // maxid
-        }
-      }
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown elementwise operation.";
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	int* mask = NULL;
+	const Dtype* bottom_data_a = NULL;
+	const Dtype* bottom_data_b = NULL;
+	const int count = top[0]->count();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	switch (op_) {
+		case EltwiseParameter_EltwiseOp_PROD:
+			caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data);
+			for (int i = 2; i < bottom.size(); ++i) {
+				caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data);
+			}
+			break;
+		case EltwiseParameter_EltwiseOp_SUM:
+			caffe_set(count, Dtype(0), top_data);
+			// TODO(shelhamer) does BLAS optimize to sum for coeff = 1?
+			for (int i = 0; i < bottom.size(); ++i) {
+				caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data);
+			}
+			break;
+		case EltwiseParameter_EltwiseOp_MAX:
+			// Initialize
+			mask = max_idx_.mutable_cpu_data();
+			caffe_set(count, -1, mask);
+			caffe_set(count, Dtype(-FLT_MAX), top_data);
+			// bottom 0 & 1
+			bottom_data_a = bottom[0]->cpu_data();
+			bottom_data_b = bottom[1]->cpu_data();
+			for (int idx = 0; idx < count; ++idx) {
+				if (bottom_data_a[idx] > bottom_data_b[idx]) {
+					top_data[idx] = bottom_data_a[idx];  // maxval
+					mask[idx] = 0;  // maxid
+				} else {
+					top_data[idx] = bottom_data_b[idx];  // maxval
+					mask[idx] = 1;  // maxid
+				}
+			}
+			// bottom 2++
+			for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) {
+				bottom_data_b = bottom[blob_idx]->cpu_data();
+				for (int idx = 0; idx < count; ++idx) {
+					if (bottom_data_b[idx] > top_data[idx]) {
+						top_data[idx] = bottom_data_b[idx];  // maxval
+						mask[idx] = blob_idx;  // maxid
+					}
+				}
+			}
+			break;
+		default:
+			LOG(FATAL) << "Unknown elementwise operation.";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const int* mask = NULL;
-  const int count = top[0]->count();
-  const Dtype* top_data = top[0]->cpu_data();
-  const Dtype* top_diff = top[0]->cpu_diff();
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (propagate_down[i]) {
-      const Dtype* bottom_data = bottom[i]->cpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
-      switch (op_) {
-      case EltwiseParameter_EltwiseOp_PROD:
-        if (stable_prod_grad_) {
-          bool initialized = false;
-          for (int j = 0; j < bottom.size(); ++j) {
-            if (i == j) { continue; }
-            if (!initialized) {
-              caffe_copy(count, bottom[j]->cpu_data(), bottom_diff);
-              initialized = true;
-            } else {
-              caffe_mul(count, bottom[j]->cpu_data(), bottom_diff,
-                        bottom_diff);
-            }
-          }
-        } else {
-          caffe_div(count, top_data, bottom_data, bottom_diff);
-        }
-        caffe_mul(count, bottom_diff, top_diff, bottom_diff);
-        break;
-      case EltwiseParameter_EltwiseOp_SUM:
-        if (coeffs_[i] == Dtype(1)) {
-          caffe_copy(count, top_diff, bottom_diff);
-        } else {
-          caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff);
-        }
-        break;
-      case EltwiseParameter_EltwiseOp_MAX:
-        mask = max_idx_.cpu_data();
-        for (int index = 0; index < count; ++index) {
-          Dtype gradient = 0;
-          if (mask[index] == i) {
-            gradient += top_diff[index];
-          }
-          bottom_diff[index] = gradient;
-        }
-        break;
-      default:
-        LOG(FATAL) << "Unknown elementwise operation.";
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const int* mask = NULL;
+	const int count = top[0]->count();
+	const Dtype* top_data = top[0]->cpu_data();
+	const Dtype* top_diff = top[0]->cpu_diff();
+	for (int i = 0; i < bottom.size(); ++i) {
+		if (propagate_down[i]) {
+			const Dtype* bottom_data = bottom[i]->cpu_data();
+			Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+			switch (op_) {
+				case EltwiseParameter_EltwiseOp_PROD:
+					if (stable_prod_grad_) {
+						bool initialized = false;
+						for (int j = 0; j < bottom.size(); ++j) {
+							if (i == j) {
+								continue;
+							}
+							if (!initialized) {
+								caffe_copy(count, bottom[j]->cpu_data(), bottom_diff);
+								initialized = true;
+							} else {
+								caffe_mul(count, bottom[j]->cpu_data(), bottom_diff,
+									bottom_diff);
+							}
+						}
+					} else {
+						caffe_div(count, top_data, bottom_data, bottom_diff);
+					}
+					caffe_mul(count, bottom_diff, top_diff, bottom_diff);
+					break;
+				case EltwiseParameter_EltwiseOp_SUM:
+					if (coeffs_[i] == Dtype(1)) {
+						caffe_copy(count, top_diff, bottom_diff);
+					} else {
+						caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff);
+					}
+					break;
+				case EltwiseParameter_EltwiseOp_MAX:
+					mask = max_idx_.cpu_data();
+					for (int index = 0; index < count; ++index) {
+						Dtype gradient = 0;
+						if (mask[index] == i) {
+							gradient += top_diff[index];
+						}
+						bottom_diff[index] = gradient;
+					}
+					break;
+				default:
+					LOG(FATAL) << "Unknown elementwise operation.";
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int* mask = NULL;
-  const int count = top[0]->count();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  switch (op_) {
-  case EltwiseParameter_EltwiseOp_PROD:
-    caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
-        top_data);
-    for (int i = 2; i < bottom.size(); ++i) {
-      caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data);
-    }
-    break;
-  case EltwiseParameter_EltwiseOp_SUM:
-    caffe_gpu_set(count, Dtype(0.), top_data);
-    // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
-    for (int i = 0; i < bottom.size(); ++i) {
-      caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data);
-    }
-    break;
-  case EltwiseParameter_EltwiseOp_MAX:
-    mask = max_idx_.mutable_gpu_data();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
-    for (int i = 2; i < bottom.size(); ++i) {
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      MaxForward(count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask);
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown elementwise operation.";
-  }
+	const vector<Blob<Dtype>*>& top) {
+	int* mask = NULL;
+	const int count = top[0]->count();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	switch (op_) {
+		case EltwiseParameter_EltwiseOp_PROD:
+			caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
+				top_data);
+			for (int i = 2; i < bottom.size(); ++i) {
+				caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data);
+			}
+			break;
+		case EltwiseParameter_EltwiseOp_SUM:
+			caffe_gpu_set(count, Dtype(0.), top_data);
+			// TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
+			for (int i = 0; i < bottom.size(); ++i) {
+				caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data);
+			}
+			break;
+		case EltwiseParameter_EltwiseOp_MAX:
+			mask = max_idx_.mutable_gpu_data();
+			// NOLINT_NEXT_LINE(whitespace/operators)
+			MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0,
+				top_data, mask);
+			for (int i = 2; i < bottom.size(); ++i) {
+				// NOLINT_NEXT_LINE(whitespace/operators)
+				MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data,
+					mask);
+			}
+			break;
+		default:
+			LOG(FATAL) << "Unknown elementwise operation.";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const int* mask = NULL;
-  const int count = top[0]->count();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (propagate_down[i]) {
-      const Dtype* bottom_data = bottom[i]->gpu_data();
-      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-      switch (op_) {
-      case EltwiseParameter_EltwiseOp_PROD:
-        if (stable_prod_grad_) {
-          bool initialized = false;
-          for (int j = 0; j < bottom.size(); ++j) {
-            if (i == j) { continue; }
-            if (!initialized) {
-              caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff);
-              initialized = true;
-            } else {
-              caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
-                            bottom_diff);
-            }
-          }
-        } else {
-          caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
-        }
-        caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-        break;
-      case EltwiseParameter_EltwiseOp_SUM:
-        if (coeffs_[i] == Dtype(1.)) {
-          caffe_gpu_copy(count, top_diff, bottom_diff);
-        } else {
-          caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
-        }
-        break;
-      case EltwiseParameter_EltwiseOp_MAX:
-        mask = max_idx_.gpu_data();
-        MaxBackward(count, top_diff, i, mask, bottom_diff);
-        break;
-      default:
-        LOG(FATAL) << "Unknown elementwise operation.";
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const int* mask = NULL;
+	const int count = top[0]->count();
+	const Dtype* top_data = top[0]->gpu_data();
+	const Dtype* top_diff = top[0]->gpu_diff();
+	for (int i = 0; i < bottom.size(); ++i) {
+		if (propagate_down[i]) {
+			const Dtype* bottom_data = bottom[i]->gpu_data();
+			Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+			switch (op_) {
+				case EltwiseParameter_EltwiseOp_PROD:
+					if (stable_prod_grad_) {
+						bool initialized = false;
+						for (int j = 0; j < bottom.size(); ++j) {
+							if (i == j) {
+								continue;
+							}
+							if (!initialized) {
+								caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff);
+								initialized = true;
+							} else {
+								caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
+									bottom_diff);
+							}
+						}
+					} else {
+						caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
+					}
+					caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
+					break;
+				case EltwiseParameter_EltwiseOp_SUM:
+					if (coeffs_[i] == Dtype(1.)) {
+						caffe_gpu_copy(count, top_diff, bottom_diff);
+					} else {
+						caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
+					}
+					break;
+				case EltwiseParameter_EltwiseOp_MAX:
+					mask = max_idx_.gpu_data();
+					MaxBackward(count, top_diff, i, mask, bottom_diff);
+					break;
+				default:
+					LOG(FATAL) << "Unknown elementwise operation.";
+			}
+		}
+	}
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU(EltwiseLayer);
 #endif
 
-INSTANTIATE_CLASS(EltwiseLayer);
-REGISTER_LAYER_CLASS(Eltwise);
+INSTANTIATE_CLASS (EltwiseLayer);
+REGISTER_LAYER_CLASS (Eltwise);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index d1efe5bb..d5abc23f 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -7,83 +7,83 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void EuclideanLossLayer<Dtype>::Reshape(
-  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::Reshape(bottom, top);
-  CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
-      << "Inputs must have the same dimension.";
-  diff_.ReshapeLike(*bottom[0]);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::Reshape(bottom, top);
+	CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
+		<< "Inputs must have the same dimension.";
+	diff_.ReshapeLike(*bottom[0]);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int count = bottom[0]->count();
-  caffe_sub(
-      count,
-      bottom[0]->cpu_data(),
-      bottom[1]->cpu_data(),
-      diff_.mutable_cpu_data());
-  Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data());
-  Dtype loss = dot / bottom[0]->num() / Dtype(2);
-  top[0]->mutable_cpu_data()[0] = loss;
+	const vector<Blob<Dtype>*>& top) {
+	int count = bottom[0]->count();
+	caffe_sub(
+		count,
+		bottom[0]->cpu_data(),
+		bottom[1]->cpu_data(),
+		diff_.mutable_cpu_data());
+	Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data());
+	Dtype loss = dot / bottom[0]->num() / Dtype(2);
+	top[0]->mutable_cpu_data()[0] = loss;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < 2; ++i) {
-    if (propagate_down[i]) {
-      const Dtype sign = (i == 0) ? 1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
-      caffe_cpu_axpby(
-          bottom[i]->count(),              // count
-          alpha,                              // alpha
-          diff_.cpu_data(),                   // a
-          Dtype(0),                           // beta
-          bottom[i]->mutable_cpu_diff());  // b
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	for (int i = 0; i < 2; ++i) {
+		if (propagate_down[i]) {
+			const Dtype sign = (i == 0) ? 1 : -1;
+			const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
+			caffe_cpu_axpby(
+				bottom[i]->count(),              // count
+				alpha,                              // alpha
+				diff_.cpu_data(),                   // a
+				Dtype(0),                           // beta
+				bottom[i]->mutable_cpu_diff());  // b
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int count = bottom[0]->count();
-  caffe_gpu_sub(
-      count,
-      bottom[0]->gpu_data(),
-      bottom[1]->gpu_data(),
-      diff_.mutable_gpu_data());
-  Dtype dot;
-  caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
-  Dtype loss = dot / bottom[0]->num() / Dtype(2);
-  top[0]->mutable_cpu_data()[0] = loss;
+	const vector<Blob<Dtype>*>& top) {
+	int count = bottom[0]->count();
+	caffe_gpu_sub(
+		count,
+		bottom[0]->gpu_data(),
+		bottom[1]->gpu_data(),
+		diff_.mutable_gpu_data());
+	Dtype dot;
+	caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
+	Dtype loss = dot / bottom[0]->num() / Dtype(2);
+	top[0]->mutable_cpu_data()[0] = loss;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < 2; ++i) {
-    if (propagate_down[i]) {
-      const Dtype sign = (i == 0) ? 1 : -1;
-      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
-      caffe_gpu_axpby(
-          bottom[i]->count(),              // count
-          alpha,                              // alpha
-          diff_.gpu_data(),                   // a
-          Dtype(0),                           // beta
-          bottom[i]->mutable_gpu_diff());  // b
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	for (int i = 0; i < 2; ++i) {
+		if (propagate_down[i]) {
+			const Dtype sign = (i == 0) ? 1 : -1;
+			const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
+			caffe_gpu_axpby(
+				bottom[i]->count(),              // count
+				alpha,                              // alpha
+				diff_.gpu_data(),                   // a
+				Dtype(0),                           // beta
+				bottom[i]->mutable_gpu_diff());  // b
+		}
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(EuclideanLossLayer);
 #endif
 
-INSTANTIATE_CLASS(EuclideanLossLayer);
-REGISTER_LAYER_CLASS(EuclideanLoss);
+INSTANTIATE_CLASS (EuclideanLossLayer);
+REGISTER_LAYER_CLASS (EuclideanLoss);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index 5e7819c0..8451b133 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -7,94 +7,98 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ExpLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
-  const Dtype base = this->layer_param_.exp_param().base();
-  if (base != Dtype(-1)) {
-    CHECK_GT(base, 0) << "base must be strictly positive.";
-  }
-  // If base == -1, interpret the base as e and set log_base = 1 exactly.
-  // Otherwise, calculate its log explicitly.
-  const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
-  CHECK(!isnan(log_base))
-      << "NaN result: log(base) = log(" << base << ") = " << log_base;
-  CHECK(!isinf(log_base))
-      << "Inf result: log(base) = log(" << base << ") = " << log_base;
-  const Dtype input_scale = this->layer_param_.exp_param().scale();
-  const Dtype input_shift = this->layer_param_.exp_param().shift();
-  inner_scale_ = log_base * input_scale;
-  outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift);
+	const vector<Blob<Dtype>*>& top) {
+	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+	const Dtype base = this->layer_param_.exp_param().base();
+	if (base != Dtype(-1)) {
+		CHECK_GT(base, 0) << "base must be strictly positive.";
+	}
+	// If base == -1, interpret the base as e and set log_base = 1 exactly.
+	// Otherwise, calculate its log explicitly.
+	const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
+	CHECK(!isnan(log_base))
+		<< "NaN result: log(base) = log(" << base << ") = " << log_base;
+	CHECK(!isinf(log_base))
+		<< "Inf result: log(base) = log(" << base << ") = " << log_base;
+	const Dtype input_scale = this->layer_param_.exp_param().scale();
+	const Dtype input_shift = this->layer_param_.exp_param().shift();
+	inner_scale_ = log_base * input_scale;
+	outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ExpLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  if (inner_scale_ == Dtype(1)) {
-    caffe_exp(count, bottom_data, top_data);
-  } else {
-    caffe_cpu_scale(count, inner_scale_, bottom_data, top_data);
-    caffe_exp(count, top_data, top_data);
-  }
-  if (outer_scale_ != Dtype(1)) {
-    caffe_scal(count, outer_scale_, top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const int count = bottom[0]->count();
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	if (inner_scale_ == Dtype(1)) {
+		caffe_exp(count, bottom_data, top_data);
+	} else {
+		caffe_cpu_scale(count, inner_scale_, bottom_data, top_data);
+		caffe_exp(count, top_data, top_data);
+	}
+	if (outer_scale_ != Dtype(1)) {
+		caffe_scal(count, outer_scale_, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  const int count = bottom[0]->count();
-  const Dtype* top_data = top[0]->cpu_data();
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  caffe_mul(count, top_data, top_diff, bottom_diff);
-  if (inner_scale_ != Dtype(1)) {
-    caffe_scal(count, inner_scale_, bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	const int count = bottom[0]->count();
+	const Dtype* top_data = top[0]->cpu_data();
+	const Dtype* top_diff = top[0]->cpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	caffe_mul(count, top_data, top_diff, bottom_diff);
+	if (inner_scale_ != Dtype(1)) {
+		caffe_scal(count, inner_scale_, bottom_diff);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  if (inner_scale_ == Dtype(1)) {
-    caffe_gpu_exp(count, bottom_data, top_data);
-  } else {
-    caffe_gpu_scale(count, inner_scale_, bottom_data, top_data);
-    caffe_gpu_exp(count, top_data, top_data);
-  }
-  if (outer_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, outer_scale_, top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const int count = bottom[0]->count();
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	if (inner_scale_ == Dtype(1)) {
+		caffe_gpu_exp(count, bottom_data, top_data);
+	} else {
+		caffe_gpu_scale(count, inner_scale_, bottom_data, top_data);
+		caffe_gpu_exp(count, top_data, top_data);
+	}
+	if (outer_scale_ != Dtype(1)) {
+		caffe_gpu_scal(count, outer_scale_, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  if (!propagate_down[0]) { return; }
-  const int count = bottom[0]->count();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  caffe_gpu_mul(count, top_data, top_diff, bottom_diff);
-  if (inner_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, inner_scale_, bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	const int count = bottom[0]->count();
+	const Dtype* top_data = top[0]->gpu_data();
+	const Dtype* top_diff = top[0]->gpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+	caffe_gpu_mul(count, top_data, top_diff, bottom_diff);
+	if (inner_scale_ != Dtype(1)) {
+		caffe_gpu_scal(count, inner_scale_, bottom_diff);
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(ExpLayer);
 #endif
 
-INSTANTIATE_CLASS(ExpLayer);
-REGISTER_LAYER_CLASS(Exp);
+INSTANTIATE_CLASS (ExpLayer);
+REGISTER_LAYER_CLASS (Exp);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index c5f5e4dd..9fa26c80 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -7,180 +7,180 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void FilterLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(top.size(), bottom.size() - 1);
-  first_reshape_ = true;
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(top.size(), bottom.size() - 1);
+	first_reshape_ = true;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void FilterLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  // bottom[0...k-1] are the blobs to filter
-  // bottom[last] is the "selector_blob"
-  int selector_index = bottom.size() - 1;
-  for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) {
-    CHECK_EQ(bottom[selector_index]->shape(i), 1)
-        << "Selector blob dimensions must be singletons (1), except the first";
-  }
-  for (int i = 0; i < bottom.size() - 1; ++i) {
-    CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) <<
-        "Each bottom should have the same 0th dimension as the selector blob";
-  }
+	const vector<Blob<Dtype>*>& top) {
+	// bottom[0...k-1] are the blobs to filter
+	// bottom[last] is the "selector_blob"
+	int selector_index = bottom.size() - 1;
+	for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) {
+		CHECK_EQ(bottom[selector_index]->shape(i), 1)
+			<< "Selector blob dimensions must be singletons (1), except the first";
+	}
+	for (int i = 0; i < bottom.size() - 1; ++i) {
+		CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) <<
+			"Each bottom should have the same 0th dimension as the selector blob";
+	}
 
-  const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data();
-  indices_to_forward_.clear();
+	const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data();
+	indices_to_forward_.clear();
 
-  // look for non-zero elements in bottom[0]. Items of each bottom that
-  // have the same index as the items in bottom[0] with value == non-zero
-  // will be forwarded
-  for (int item_id = 0; item_id < bottom[selector_index]->shape(0); ++item_id) {
-    // we don't need an offset because item size == 1
-    const Dtype* tmp_data_selector = bottom_data_selector + item_id;
-    if (*tmp_data_selector) {
-      indices_to_forward_.push_back(item_id);
-    }
-  }
-  // only filtered items will be forwarded
-  int new_tops_num = indices_to_forward_.size();
-  // init
-  if (first_reshape_) {
-    new_tops_num = bottom[0]->shape(0);
-    first_reshape_ = false;
-  }
-  for (int t = 0; t < top.size(); ++t) {
-    int num_axes = bottom[t]->num_axes();
-    vector<int> shape_top(num_axes);
-    shape_top[0] = new_tops_num;
-    for (int ts = 1; ts < num_axes; ++ts)
-      shape_top[ts] = bottom[t]->shape(ts);
-    top[t]->Reshape(shape_top);
-  }
+	// look for non-zero elements in bottom[0]. Items of each bottom that
+	// have the same index as the items in bottom[0] with value == non-zero
+	// will be forwarded
+	for (int item_id = 0; item_id < bottom[selector_index]->shape(0); ++item_id) {
+		// we don't need an offset because item size == 1
+		const Dtype* tmp_data_selector = bottom_data_selector + item_id;
+		if (*tmp_data_selector) {
+			indices_to_forward_.push_back(item_id);
+		}
+	}
+	// only filtered items will be forwarded
+	int new_tops_num = indices_to_forward_.size();
+	// init
+	if (first_reshape_) {
+		new_tops_num = bottom[0]->shape(0);
+		first_reshape_ = false;
+	}
+	for (int t = 0; t < top.size(); ++t) {
+		int num_axes = bottom[t]->num_axes();
+		vector<int> shape_top(num_axes);
+		shape_top[0] = new_tops_num;
+		for (int ts = 1; ts < num_axes; ++ts)
+			shape_top[ts] = bottom[t]->shape(ts);
+		top[t]->Reshape(shape_top);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void FilterLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  int new_tops_num = indices_to_forward_.size();
-  // forward all filtered items for all bottoms but the Selector (bottom[last])
-  for (int t = 0; t < top.size(); ++t) {
-    const Dtype* bottom_data = bottom[t]->cpu_data();
-    Dtype* top_data = top[t]->mutable_cpu_data();
-    int dim = bottom[t]->count() / bottom[t]->shape(0);
-    for (int n = 0; n < new_tops_num; ++n) {
-      int data_offset_top = n * dim;
-      int data_offset_bottom = indices_to_forward_[n] * bottom[t]->count(1);
-      caffe_copy(dim, bottom_data + data_offset_bottom,
-          top_data + data_offset_top);
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	int new_tops_num = indices_to_forward_.size();
+	// forward all filtered items for all bottoms but the Selector (bottom[last])
+	for (int t = 0; t < top.size(); ++t) {
+		const Dtype* bottom_data = bottom[t]->cpu_data();
+		Dtype* top_data = top[t]->mutable_cpu_data();
+		int dim = bottom[t]->count() / bottom[t]->shape(0);
+		for (int n = 0; n < new_tops_num; ++n) {
+			int data_offset_top = n * dim;
+			int data_offset_bottom = indices_to_forward_[n] * bottom[t]->count(1);
+			caffe_copy(dim, bottom_data + data_offset_bottom,
+				top_data + data_offset_top);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[bottom.size() - 1]) {
-    LOG(FATAL) << this->type()
-               << "Layer cannot backpropagate to filter index inputs";
-  }
-  for (int i = 0; i < top.size(); i++) {
-    // bottom[last] is the selector and never needs backpropagation
-    // so we can iterate over top vector because top.size() == bottom.size() -1
-    if (propagate_down[i]) {
-      const int dim = top[i]->count() / top[i]->shape(0);
-      int next_to_backward_offset = 0;
-      int batch_offset = 0;
-      int data_offset_bottom = 0;
-      int data_offset_top = 0;
-      for (int n = 0; n < bottom[i]->shape(0); n++) {
-        data_offset_bottom = n * dim;
-        if (next_to_backward_offset >= indices_to_forward_.size()) {
-          // we already visited all items that were been forwarded, so
-          // just set to zero remaining ones
-          caffe_set(dim, Dtype(0),
-              bottom[i]->mutable_cpu_diff() + data_offset_bottom);
-        } else {
-          batch_offset = indices_to_forward_[next_to_backward_offset];
-          if (n != batch_offset) {  // this data was not been forwarded
-            caffe_set(dim, Dtype(0),
-                bottom[i]->mutable_cpu_diff() + data_offset_bottom);
-          } else {  // this data was been forwarded
-            data_offset_top = next_to_backward_offset * dim;
-            next_to_backward_offset++;  // point to next forwarded item index
-            caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top,
-                bottom[i]->mutable_cpu_diff() + data_offset_bottom);
-          }
-        }
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[bottom.size() - 1]) {
+		LOG(FATAL) << this->type()
+			<< "Layer cannot backpropagate to filter index inputs";
+	}
+	for (int i = 0; i < top.size(); i++) {
+		// bottom[last] is the selector and never needs backpropagation
+		// so we can iterate over top vector because top.size() == bottom.size() -1
+		if (propagate_down[i]) {
+			const int dim = top[i]->count() / top[i]->shape(0);
+			int next_to_backward_offset = 0;
+			int batch_offset = 0;
+			int data_offset_bottom = 0;
+			int data_offset_top = 0;
+			for (int n = 0; n < bottom[i]->shape(0); n++) {
+				data_offset_bottom = n * dim;
+				if (next_to_backward_offset >= indices_to_forward_.size()) {
+					// we already visited all items that were been forwarded, so
+					// just set to zero remaining ones
+					caffe_set(dim, Dtype(0),
+						bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+				} else {
+					batch_offset = indices_to_forward_[next_to_backward_offset];
+					if (n != batch_offset) {  // this data was not been forwarded
+						caffe_set(dim, Dtype(0),
+							bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+					} else {  // this data was been forwarded
+						data_offset_top = next_to_backward_offset * dim;
+						next_to_backward_offset++;  // point to next forwarded item index
+						caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top,
+							bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+					}
+				}
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  int new_tops_num = indices_to_forward_.size();
-  // forward all filtered items for all bottoms but the Selector (bottom[last])
-  for (int t = 0; t < top.size(); ++t) {
-    const Dtype* bottom_data = bottom[t]->gpu_data();
-    Dtype* top_data = top[t]->mutable_gpu_data();
-    int dim = bottom[t]->count() / bottom[t]->shape(0);
-    for (int n = 0; n < new_tops_num; ++n) {
-      int data_offset_top = n * dim;
-      int data_offset_bottom = indices_to_forward_[n] * dim;
-      caffe_copy(dim, bottom_data + data_offset_bottom,
-          top_data + data_offset_top);
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	int new_tops_num = indices_to_forward_.size();
+	// forward all filtered items for all bottoms but the Selector (bottom[last])
+	for (int t = 0; t < top.size(); ++t) {
+		const Dtype* bottom_data = bottom[t]->gpu_data();
+		Dtype* top_data = top[t]->mutable_gpu_data();
+		int dim = bottom[t]->count() / bottom[t]->shape(0);
+		for (int n = 0; n < new_tops_num; ++n) {
+			int data_offset_top = n * dim;
+			int data_offset_bottom = indices_to_forward_[n] * dim;
+			caffe_copy(dim, bottom_data + data_offset_bottom,
+				top_data + data_offset_top);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[bottom.size() - 1]) {
-    LOG(FATAL) << this->type()
-               << "Layer cannot backpropagate to filter index inputs";
-  }
-  for (int i = 0; i < top.size(); ++i) {
-    // bottom[last] is the selector and never needs backpropagation
-    // so we can iterate over top vector because top.size() == bottom.size() -1
-    if (propagate_down[i]) {
-      const int dim = top[i]->count() / top[i]->shape(0);
-      int next_to_backward_offset = 0;
-      int batch_offset = 0;
-      int data_offset_bottom = 0;
-      int data_offset_top = 0;
-      for (int n = 0; n < bottom[i]->shape(0); ++n) {
-        if (next_to_backward_offset >= indices_to_forward_.size()) {
-          // we already visited all items that were been forwarded, so
-          // just set to zero remaining ones
-          data_offset_bottom = n * dim;
-          caffe_gpu_set(dim, Dtype(0),
-              bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-        } else {
-          batch_offset = indices_to_forward_[next_to_backward_offset];
-          data_offset_bottom = n * dim;
-          if (n != batch_offset) {  // this data was not been forwarded
-            caffe_gpu_set(dim, Dtype(0),
-                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-          } else {  // this data was been forwarded
-            data_offset_top = next_to_backward_offset * dim;
-            ++next_to_backward_offset;  // point to next forwarded item index
-            caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top,
-                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-          }
-        }
-      }
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[bottom.size() - 1]) {
+		LOG(FATAL) << this->type()
+			<< "Layer cannot backpropagate to filter index inputs";
+	}
+	for (int i = 0; i < top.size(); ++i) {
+		// bottom[last] is the selector and never needs backpropagation
+		// so we can iterate over top vector because top.size() == bottom.size() -1
+		if (propagate_down[i]) {
+			const int dim = top[i]->count() / top[i]->shape(0);
+			int next_to_backward_offset = 0;
+			int batch_offset = 0;
+			int data_offset_bottom = 0;
+			int data_offset_top = 0;
+			for (int n = 0; n < bottom[i]->shape(0); ++n) {
+				if (next_to_backward_offset >= indices_to_forward_.size()) {
+					// we already visited all items that were been forwarded, so
+					// just set to zero remaining ones
+					data_offset_bottom = n * dim;
+					caffe_gpu_set(dim, Dtype(0),
+						bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+				} else {
+					batch_offset = indices_to_forward_[next_to_backward_offset];
+					data_offset_bottom = n * dim;
+					if (n != batch_offset) {  // this data was not been forwarded
+						caffe_gpu_set(dim, Dtype(0),
+							bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+					} else {  // this data was been forwarded
+						data_offset_top = next_to_backward_offset * dim;
+						++next_to_backward_offset;  // point to next forwarded item index
+						caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top,
+							bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+					}
+				}
+			}
+		}
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(FilterLayer);
 #endif
 
-INSTANTIATE_CLASS(FilterLayer);
-REGISTER_LAYER_CLASS(Filter);
+INSTANTIATE_CLASS (FilterLayer);
+REGISTER_LAYER_CLASS (Filter);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp
index f7e5c9c2..4aaad3a4 100644
--- a/src/caffe/layers/flatten_layer.cpp
+++ b/src/caffe/layers/flatten_layer.cpp
@@ -6,39 +6,39 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void FlattenLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int start_axis = bottom[0]->CanonicalAxisIndex(
-      this->layer_param_.flatten_param().axis());
-  const int end_axis = bottom[0]->CanonicalAxisIndex(
-      this->layer_param_.flatten_param().end_axis());
-  vector<int> top_shape;
-  for (int i = 0; i < start_axis; ++i) {
-    top_shape.push_back(bottom[0]->shape(i));
-  }
-  const int flattened_dim = bottom[0]->count(start_axis, end_axis + 1);
-  top_shape.push_back(flattened_dim);
-  for (int i = end_axis + 1; i < bottom[0]->num_axes(); ++i) {
-    top_shape.push_back(bottom[0]->shape(i));
-  }
-  top[0]->Reshape(top_shape);
-  CHECK_EQ(top[0]->count(), bottom[0]->count());
+	const vector<Blob<Dtype>*>& top) {
+	const int start_axis = bottom[0]->CanonicalAxisIndex(
+		this->layer_param_.flatten_param().axis());
+	const int end_axis = bottom[0]->CanonicalAxisIndex(
+		this->layer_param_.flatten_param().end_axis());
+	vector<int> top_shape;
+	for (int i = 0; i < start_axis; ++i) {
+		top_shape.push_back(bottom[0]->shape(i));
+	}
+	const int flattened_dim = bottom[0]->count(start_axis, end_axis + 1);
+	top_shape.push_back(flattened_dim);
+	for (int i = end_axis + 1; i < bottom[0]->num_axes(); ++i) {
+		top_shape.push_back(bottom[0]->shape(i));
+	}
+	top[0]->Reshape(top_shape);
+	CHECK_EQ(top[0]->count(), bottom[0]->count());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void FlattenLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  top[0]->ShareData(*bottom[0]);
+	const vector<Blob<Dtype>*>& top) {
+	top[0]->ShareData(*bottom[0]);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void FlattenLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  bottom[0]->ShareDiff(*top[0]);
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	bottom[0]->ShareDiff(*top[0]);
 }
 
-INSTANTIATE_CLASS(FlattenLayer);
-REGISTER_LAYER_CLASS(Flatten);
+INSTANTIATE_CLASS (FlattenLayer);
+REGISTER_LAYER_CLASS (Flatten);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index af223c0f..377755b9 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -1,11 +1,11 @@
 /*
-TODO:
-- load file in a separate thread ("prefetch")
-- can be smarter about the memcpy call instead of doing it row-by-row
-  :: use util functions caffe_copy, and Blob->offset()
-  :: don't forget to update hdf5_daa_layer.cu accordingly
-- add ability to shuffle filenames if flag is set
-*/
+ TODO:
+ - load file in a separate thread ("prefetch")
+ - can be smarter about the memcpy call instead of doing it row-by-row
+ :: use util functions caffe_copy, and Blob->offset()
+ :: don't forget to update hdf5_daa_layer.cu accordingly
+ - add ability to shuffle filenames if flag is set
+ */
 #include <fstream>  // NOLINT(readability/streams)
 #include <string>
 #include <vector>
@@ -20,182 +20,187 @@
 
 namespace caffe {
 
-template <typename Dtype>
-HDF5DataLayer<Dtype>::~HDF5DataLayer<Dtype>() { }
+template<typename Dtype>
+HDF5DataLayer<Dtype>::~HDF5DataLayer<Dtype>() {
+}
 
 // Load data and label from HDF5 filename into the class property blobs.
-template <typename Dtype>
+template<typename Dtype>
 void HDF5DataLayer<Dtype>::LoadHDF5FileData(const char* filename) {
-  DLOG(INFO) << "Loading HDF5 file: " << filename;
-  hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
-  if (file_id < 0) {
-    LOG(FATAL) << "Failed opening HDF5 file: " << filename;
-  }
-
-  int top_size = this->layer_param_.top_size();
-  hdf_blobs_.resize(top_size);
-
-  const int MIN_DATA_DIM = 1;
-  const int MAX_DATA_DIM = INT_MAX;
-
-  for (int i = 0; i < top_size; ++i) {
-    hdf_blobs_[i] = shared_ptr<Blob<Dtype> >(new Blob<Dtype>());
-    hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(),
-        MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get());
-  }
-
-  herr_t status = H5Fclose(file_id);
-  CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename;
-
-  // MinTopBlobs==1 guarantees at least one top blob
-  CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis.";
-  const int num = hdf_blobs_[0]->shape(0);
-  for (int i = 1; i < top_size; ++i) {
-    CHECK_EQ(hdf_blobs_[i]->shape(0), num);
-  }
-  // Default to identity permutation.
-  data_permutation_.clear();
-  data_permutation_.resize(hdf_blobs_[0]->shape(0));
-  for (int i = 0; i < hdf_blobs_[0]->shape(0); i++)
-    data_permutation_[i] = i;
-
-  // Shuffle if needed.
-  if (this->layer_param_.hdf5_data_param().shuffle()) {
-    std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-    DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0)
-               << " rows (shuffled)";
-  } else {
-    DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows";
-  }
+	DLOG(INFO) << "Loading HDF5 file: " << filename;
+	hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
+	if (file_id < 0) {
+		LOG(FATAL) << "Failed opening HDF5 file: " << filename;
+	}
+
+	int top_size = this->layer_param_.top_size();
+	hdf_blobs_.resize(top_size);
+
+	const int MIN_DATA_DIM = 1;
+	const int MAX_DATA_DIM = INT_MAX;
+
+	for (int i = 0; i < top_size; ++i) {
+		hdf_blobs_[i] = shared_ptr < Blob<Dtype> > (new Blob<Dtype>());
+		hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(),
+			MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get());
+	}
+
+	herr_t status = H5Fclose(file_id);
+	CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename;
+
+	// MinTopBlobs==1 guarantees at least one top blob
+	CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis.";
+	const int num = hdf_blobs_[0]->shape(0);
+	for (int i = 1; i < top_size; ++i) {
+		CHECK_EQ(hdf_blobs_[i]->shape(0), num);
+	}
+	// Default to identity permutation.
+	data_permutation_.clear();
+	data_permutation_.resize(hdf_blobs_[0]->shape(0));
+	for (int i = 0; i < hdf_blobs_[0]->shape(0); i++)
+		data_permutation_[i] = i;
+
+	// Shuffle if needed.
+	if (this->layer_param_.hdf5_data_param().shuffle()) {
+		std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+		DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0)
+			<< " rows (shuffled)";
+	} else {
+		DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5DataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  // Refuse transformation parameters since HDF5 is totally generic.
-  CHECK(!this->layer_param_.has_transform_param()) <<
-      this->type() << " does not transform data.";
-  // Read the source to parse the filenames.
-  const string& source = this->layer_param_.hdf5_data_param().source();
-  LOG(INFO) << "Loading list of HDF5 filenames from: " << source;
-  hdf_filenames_.clear();
-  std::ifstream source_file(source.c_str());
-  if (source_file.is_open()) {
-    std::string line;
-    while (source_file >> line) {
-      hdf_filenames_.push_back(line);
-    }
-  } else {
-    LOG(FATAL) << "Failed to open source file: " << source;
-  }
-  source_file.close();
-  num_files_ = hdf_filenames_.size();
-  current_file_ = 0;
-  LOG(INFO) << "Number of HDF5 files: " << num_files_;
-  CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in "
-    << source;
-
-  file_permutation_.clear();
-  file_permutation_.resize(num_files_);
-  // Default to identity permutation.
-  for (int i = 0; i < num_files_; i++) {
-    file_permutation_[i] = i;
-  }
-
-  // Shuffle if needed.
-  if (this->layer_param_.hdf5_data_param().shuffle()) {
-    std::random_shuffle(file_permutation_.begin(), file_permutation_.end());
-  }
-
-  // Load the first HDF5 file and initialize the line counter.
-  LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str());
-  current_row_ = 0;
-
-  // Reshape blobs.
-  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-  const int top_size = this->layer_param_.top_size();
-  vector<int> top_shape;
-  for (int i = 0; i < top_size; ++i) {
-    top_shape.resize(hdf_blobs_[i]->num_axes());
-    top_shape[0] = batch_size;
-    for (int j = 1; j < top_shape.size(); ++j) {
-      top_shape[j] = hdf_blobs_[i]->shape(j);
-    }
-    top[i]->Reshape(top_shape);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	// Refuse transformation parameters since HDF5 is totally generic.
+	CHECK(!this->layer_param_.has_transform_param()) <<
+		this->type() << " does not transform data.";
+	// Read the source to parse the filenames.
+	const string& source = this->layer_param_.hdf5_data_param().source();
+	LOG(INFO) << "Loading list of HDF5 filenames from: " << source;
+	hdf_filenames_.clear();
+	std::ifstream source_file(source.c_str());
+	if (source_file.is_open()) {
+		std::string line;
+		while (source_file >> line) {
+			hdf_filenames_.push_back(line);
+		}
+	} else {
+		LOG(FATAL) << "Failed to open source file: " << source;
+	}
+	source_file.close();
+	num_files_ = hdf_filenames_.size();
+	current_file_ = 0;
+	LOG(INFO) << "Number of HDF5 files: " << num_files_;
+	CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in "
+		<< source;
+
+	file_permutation_.clear();
+	file_permutation_.resize(num_files_);
+	// Default to identity permutation.
+	for (int i = 0; i < num_files_; i++) {
+		file_permutation_[i] = i;
+	}
+
+	// Shuffle if needed.
+	if (this->layer_param_.hdf5_data_param().shuffle()) {
+		std::random_shuffle(file_permutation_.begin(), file_permutation_.end());
+	}
+
+	// Load the first HDF5 file and initialize the line counter.
+	LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str());
+	current_row_ = 0;
+
+	// Reshape blobs.
+	const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
+	const int top_size = this->layer_param_.top_size();
+	vector<int> top_shape;
+	for (int i = 0; i < top_size; ++i) {
+		top_shape.resize(hdf_blobs_[i]->num_axes());
+		top_shape[0] = batch_size;
+		for (int j = 1; j < top_shape.size(); ++j) {
+			top_shape[j] = hdf_blobs_[i]->shape(j);
+		}
+		top[i]->Reshape(top_shape);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-  for (int i = 0; i < batch_size; ++i, ++current_row_) {
-    if (current_row_ == hdf_blobs_[0]->shape(0)) {
-      if (num_files_ > 1) {
-        ++current_file_;
-        if (current_file_ == num_files_) {
-          current_file_ = 0;
-          if (this->layer_param_.hdf5_data_param().shuffle()) {
-            std::random_shuffle(file_permutation_.begin(),
-                                file_permutation_.end());
-          }
-          DLOG(INFO) << "Looping around to first file.";
-        }
-        LoadHDF5FileData(
-            hdf_filenames_[file_permutation_[current_file_]].c_str());
-      }
-      current_row_ = 0;
-      if (this->layer_param_.hdf5_data_param().shuffle())
-        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-    }
-    for (int j = 0; j < this->layer_param_.top_size(); ++j) {
-      int data_dim = top[j]->count() / top[j]->shape(0);
-      caffe_copy(data_dim,
-          &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-            * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
+	for (int i = 0; i < batch_size; ++i, ++current_row_) {
+		if (current_row_ == hdf_blobs_[0]->shape(0)) {
+			if (num_files_ > 1) {
+				++current_file_;
+				if (current_file_ == num_files_) {
+					current_file_ = 0;
+					if (this->layer_param_.hdf5_data_param().shuffle()) {
+						std::random_shuffle(file_permutation_.begin(),
+							file_permutation_.end());
+					}
+					DLOG(INFO) << "Looping around to first file.";
+				}
+				LoadHDF5FileData(
+					hdf_filenames_[file_permutation_[current_file_]].c_str());
+			}
+			current_row_ = 0;
+			if (this->layer_param_.hdf5_data_param().shuffle())
+				std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+		}
+		for (int j = 0; j < this->layer_param_.top_size(); ++j) {
+			int data_dim = top[j]->count() / top[j]->shape(0);
+			caffe_copy(data_dim,
+				&hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
+					* data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-  for (int i = 0; i < batch_size; ++i, ++current_row_) {
-    if (current_row_ == hdf_blobs_[0]->shape(0)) {
-      if (num_files_ > 1) {
-        current_file_ += 1;
-        if (current_file_ == num_files_) {
-          current_file_ = 0;
-          if (this->layer_param_.hdf5_data_param().shuffle()) {
-            std::random_shuffle(file_permutation_.begin(),
-                                file_permutation_.end());
-          }
-          DLOG(INFO) << "Looping around to first file.";
-        }
-        LoadHDF5FileData(
-            hdf_filenames_[file_permutation_[current_file_]].c_str());
-      }
-      current_row_ = 0;
-      if (this->layer_param_.hdf5_data_param().shuffle())
-        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-    }
-    for (int j = 0; j < this->layer_param_.top_size(); ++j) {
-      int data_dim = top[j]->count() / top[j]->shape(0);
-      OCL_CHECK( clEnqueueWriteBuffer (amdDevice.CommandQueue, (cl_mem)top[j]->mutable_gpu_data(), CL_TRUE, i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], 0, NULL, NULL) ); 
-      //caffe_copy(data_dim,
-      //    &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-      //      * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
+	for (int i = 0; i < batch_size; ++i, ++current_row_) {
+		if (current_row_ == hdf_blobs_[0]->shape(0)) {
+			if (num_files_ > 1) {
+				current_file_ += 1;
+				if (current_file_ == num_files_) {
+					current_file_ = 0;
+					if (this->layer_param_.hdf5_data_param().shuffle()) {
+						std::random_shuffle(file_permutation_.begin(),
+							file_permutation_.end());
+					}
+					DLOG(INFO) << "Looping around to first file.";
+				}
+				LoadHDF5FileData(
+					hdf_filenames_[file_permutation_[current_file_]].c_str());
+			}
+			current_row_ = 0;
+			if (this->layer_param_.hdf5_data_param().shuffle())
+				std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+		}
+		for (int j = 0; j < this->layer_param_.top_size(); ++j) {
+			int data_dim = top[j]->count() / top[j]->shape(0);
+			OCL_CHECK(
+				clEnqueueWriteBuffer(amdDevice.CommandQueue,
+					(cl_mem) top[j]->mutable_gpu_data(), CL_TRUE,
+					i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim,
+					&hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim],
+					0, NULL, NULL));
+			//caffe_copy(data_dim,
+			//    &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
+			//      * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
+		}
+	}
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU_FORWARD(HDF5DataLayer, Forward);
 #endif
 
-INSTANTIATE_CLASS(HDF5DataLayer);
-REGISTER_LAYER_CLASS(HDF5Data);
+INSTANTIATE_CLASS (HDF5DataLayer);
+REGISTER_LAYER_CLASS (HDF5Data);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index e2bd8e4c..cbb8a6fe 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -11,92 +11,100 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5OutputLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  file_name_ = this->layer_param_.hdf5_output_param().file_name();
-  file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT,
-                       H5P_DEFAULT);
-  CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_;
-  file_opened_ = true;
+	const vector<Blob<Dtype>*>& top) {
+	file_name_ = this->layer_param_.hdf5_output_param().file_name();
+	file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT,
+		H5P_DEFAULT);
+	CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_;
+	file_opened_ = true;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 HDF5OutputLayer<Dtype>::~HDF5OutputLayer<Dtype>() {
-  if (file_opened_) {
-    herr_t status = H5Fclose(file_id_);
-    CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_;
-  }
+	if (file_opened_) {
+		herr_t status = H5Fclose(file_id_);
+		CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5OutputLayer<Dtype>::SaveBlobs() {
-  // TODO: no limit on the number of blobs
-  LOG(INFO) << "Saving HDF5 file " << file_name_;
-  CHECK_EQ(data_blob_.num(), label_blob_.num()) <<
-      "data blob and label blob must have the same batch size";
-  hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_);
-  hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_);
-  LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows";
+	// TODO: no limit on the number of blobs
+	LOG(INFO) << "Saving HDF5 file " << file_name_;
+	CHECK_EQ(data_blob_.num(), label_blob_.num()) <<
+		"data blob and label blob must have the same batch size";
+	hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_);
+	hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_);
+	LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows";
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_GE(bottom.size(), 2);
-  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
-  data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-                     bottom[0]->height(), bottom[0]->width());
-  label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
-                     bottom[1]->height(), bottom[1]->width());
-  const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
-  const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_GE(bottom.size(), 2);
+	CHECK_EQ(bottom[0]->num(), bottom[1]->num());
+	data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+		bottom[0]->height(), bottom[0]->width());
+	label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
+		bottom[1]->height(), bottom[1]->width());
+	const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
+	const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
 
-  for (int i = 0; i < bottom[0]->num(); ++i) {
-    caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim],
-        &data_blob_.mutable_cpu_data()[i * data_datum_dim]);
-    caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim],
-        &label_blob_.mutable_cpu_data()[i * label_datum_dim]);
-  }
-  SaveBlobs();
+	for (int i = 0; i < bottom[0]->num(); ++i) {
+		caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim],
+			&data_blob_.mutable_cpu_data()[i * data_datum_dim]);
+		caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim],
+			&label_blob_.mutable_cpu_data()[i * label_datum_dim]);
+	}
+	SaveBlobs();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5OutputLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  return;
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	return;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  CHECK_GE(bottom.size(), 2);
-  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
-  data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-                     bottom[0]->height(), bottom[0]->width());
-  label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
-                     bottom[1]->height(), bottom[1]->width());
-  const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
-  const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_GE(bottom.size(), 2);
+	CHECK_EQ(bottom[0]->num(), bottom[1]->num());
+	data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+		bottom[0]->height(), bottom[0]->width());
+	label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
+		bottom[1]->height(), bottom[1]->width());
+	const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
+	const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
 
-  for (int i = 0; i < bottom[0]->num(); ++i) {
-    OCL_CHECK (clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)bottom[0]->gpu_data(), CL_TRUE, i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL));
-    OCL_CHECK (clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)bottom[1]->gpu_data(), CL_TRUE, i * label_datum_dim * sizeof(Dtype), sizeof(Dtype) * label_datum_dim, &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, NULL));
-  }
-  SaveBlobs();
+	for (int i = 0; i < bottom[0]->num(); ++i) {
+		OCL_CHECK(
+			clEnqueueReadBuffer(amdDevice.CommandQueue,
+				(cl_mem) bottom[0]->gpu_data(), CL_TRUE,
+				i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim,
+				&data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL));
+		OCL_CHECK(
+			clEnqueueReadBuffer(amdDevice.CommandQueue,
+				(cl_mem) bottom[1]->gpu_data(), CL_TRUE,
+				i * label_datum_dim * sizeof(Dtype), sizeof(Dtype) * label_datum_dim,
+				&label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, NULL));
+	}
+	SaveBlobs();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  return;
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	return;
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(HDF5OutputLayer);
 #endif
 
-INSTANTIATE_CLASS(HDF5OutputLayer);
-REGISTER_LAYER_CLASS(HDF5Output);
+INSTANTIATE_CLASS (HDF5OutputLayer);
+REGISTER_LAYER_CLASS (HDF5Output);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp
index a2fb2a18..e01e1d6a 100644
--- a/src/caffe/layers/hinge_loss_layer.cpp
+++ b/src/caffe/layers/hinge_loss_layer.cpp
@@ -10,73 +10,73 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void HingeLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  const Dtype* label = bottom[1]->cpu_data();
-  int num = bottom[0]->num();
-  int count = bottom[0]->count();
-  int dim = count / num;
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	const Dtype* label = bottom[1]->cpu_data();
+	int num = bottom[0]->num();
+	int count = bottom[0]->count();
+	int dim = count / num;
 
-  caffe_copy(count, bottom_data, bottom_diff);
-  for (int i = 0; i < num; ++i) {
-    bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
-  }
-  for (int i = 0; i < num; ++i) {
-    for (int j = 0; j < dim; ++j) {
-      bottom_diff[i * dim + j] = std::max(
-        Dtype(0), 1 + bottom_diff[i * dim + j]);
-    }
-  }
-  Dtype* loss = top[0]->mutable_cpu_data();
-  switch (this->layer_param_.hinge_loss_param().norm()) {
-  case HingeLossParameter_Norm_L1:
-    loss[0] = caffe_cpu_asum(count, bottom_diff) / num;
-    break;
-  case HingeLossParameter_Norm_L2:
-    loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num;
-    break;
-  default:
-    LOG(FATAL) << "Unknown Norm";
-  }
+	caffe_copy(count, bottom_data, bottom_diff);
+	for (int i = 0; i < num; ++i) {
+		bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
+	}
+	for (int i = 0; i < num; ++i) {
+		for (int j = 0; j < dim; ++j) {
+			bottom_diff[i * dim + j] = std::max(
+				Dtype(0), 1 + bottom_diff[i * dim + j]);
+		}
+	}
+	Dtype* loss = top[0]->mutable_cpu_data();
+	switch (this->layer_param_.hinge_loss_param().norm()) {
+		case HingeLossParameter_Norm_L1:
+			loss[0] = caffe_cpu_asum(count, bottom_diff) / num;
+			break;
+		case HingeLossParameter_Norm_L2:
+			loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num;
+			break;
+		default:
+			LOG(FATAL) << "Unknown Norm";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void HingeLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const Dtype* label = bottom[1]->cpu_data();
-    int num = bottom[0]->num();
-    int count = bottom[0]->count();
-    int dim = count / num;
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[1]) {
+		LOG(FATAL) << this->type()
+			<< " Layer cannot backpropagate to label inputs.";
+	}
+	if (propagate_down[0]) {
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		const Dtype* label = bottom[1]->cpu_data();
+		int num = bottom[0]->num();
+		int count = bottom[0]->count();
+		int dim = count / num;
 
-    for (int i = 0; i < num; ++i) {
-      bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
-    }
+		for (int i = 0; i < num; ++i) {
+			bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
+		}
 
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    switch (this->layer_param_.hinge_loss_param().norm()) {
-    case HingeLossParameter_Norm_L1:
-      caffe_cpu_sign(count, bottom_diff, bottom_diff);
-      caffe_scal(count, loss_weight / num, bottom_diff);
-      break;
-    case HingeLossParameter_Norm_L2:
-      caffe_scal(count, loss_weight * 2 / num, bottom_diff);
-      break;
-    default:
-      LOG(FATAL) << "Unknown Norm";
-    }
-  }
+		const Dtype loss_weight = top[0]->cpu_diff()[0];
+		switch (this->layer_param_.hinge_loss_param().norm()) {
+			case HingeLossParameter_Norm_L1:
+				caffe_cpu_sign(count, bottom_diff, bottom_diff);
+				caffe_scal(count, loss_weight / num, bottom_diff);
+				break;
+			case HingeLossParameter_Norm_L2:
+				caffe_scal(count, loss_weight * 2 / num, bottom_diff);
+				break;
+			default:
+				LOG(FATAL) << "Unknown Norm";
+		}
+	}
 }
 
-INSTANTIATE_CLASS(HingeLossLayer);
-REGISTER_LAYER_CLASS(HingeLoss);
+INSTANTIATE_CLASS (HingeLossLayer);
+REGISTER_LAYER_CLASS (HingeLoss);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index 7b667172..b29e47e2 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -7,115 +7,113 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void Im2colLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
-  CHECK(!conv_param.has_kernel_size() !=
-      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-  CHECK(conv_param.has_kernel_size() ||
-      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-      << "For non-square filters both kernel_h and kernel_w are required.";
-  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
-      && conv_param.has_pad_w())
-      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
-      << "pad is pad OR pad_h and pad_w are required.";
-  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
-      && conv_param.has_stride_w())
-      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
-      << "Stride is stride OR stride_h and stride_w are required.";
-  if (conv_param.has_kernel_size()) {
-    kernel_h_ = kernel_w_ = conv_param.kernel_size();
-  } else {
-    kernel_h_ = conv_param.kernel_h();
-    kernel_w_ = conv_param.kernel_w();
-  }
-  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-  if (!conv_param.has_pad_h()) {
-    pad_h_ = pad_w_ = conv_param.pad();
-  } else {
-    pad_h_ = conv_param.pad_h();
-    pad_w_ = conv_param.pad_w();
-  }
-  if (!conv_param.has_stride_h()) {
-    stride_h_ = stride_w_ = conv_param.stride();
-  } else {
-    stride_h_ = conv_param.stride_h();
-    stride_w_ = conv_param.stride_w();
-  }
+	const vector<Blob<Dtype>*>& top) {
+	ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+	CHECK(!conv_param.has_kernel_size() !=
+		!(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+		<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+	CHECK(conv_param.has_kernel_size() ||
+		(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+		<< "For non-square filters both kernel_h and kernel_w are required.";
+	CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
+		&& conv_param.has_pad_w())
+		|| (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+		<< "pad is pad OR pad_h and pad_w are required.";
+	CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
+		&& conv_param.has_stride_w())
+		|| (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+		<< "Stride is stride OR stride_h and stride_w are required.";
+	if (conv_param.has_kernel_size()) {
+		kernel_h_ = kernel_w_ = conv_param.kernel_size();
+	} else {
+		kernel_h_ = conv_param.kernel_h();
+		kernel_w_ = conv_param.kernel_w();
+	}
+	CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+	CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+	if (!conv_param.has_pad_h()) {
+		pad_h_ = pad_w_ = conv_param.pad();
+	} else {
+		pad_h_ = conv_param.pad_h();
+		pad_w_ = conv_param.pad_w();
+	}
+	if (!conv_param.has_stride_h()) {
+		stride_h_ = stride_w_ = conv_param.stride();
+	} else {
+		stride_h_ = conv_param.stride_h();
+		stride_w_ = conv_param.stride_w();
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Im2colLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  channels_ = bottom[0]->channels();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
-  top[0]->Reshape(
-      bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
-      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
-      (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+		<< "corresponding to (num, channels, height, width)";
+	channels_ = bottom[0]->channels();
+	height_ = bottom[0]->height();
+	width_ = bottom[0]->width();
+	top[0]->Reshape(
+		bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
+		(height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
+		(width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  for (int n = 0; n < bottom[0]->num(); ++n) {
-    im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, top_data + top[0]->offset(n));
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	for (int n = 0; n < bottom[0]->num(); ++n) {
+		im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
+			width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+			stride_h_, stride_w_, top_data + top[0]->offset(n));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  for (int n = 0; n < top[0]->num(); ++n) {
-    col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* top_diff = top[0]->cpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	for (int n = 0; n < top[0]->num(); ++n) {
+		col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
+			kernel_h_, kernel_w_, pad_h_, pad_w_,
+			stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  for (int n = 0; n < bottom[0]->num(); ++n) {
-    im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_,
-        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, top_data, top[0]->offset(n));
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	for (int n = 0; n < bottom[0]->num(); ++n) {
+		im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_,
+			width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+			stride_h_, stride_w_, top_data, top[0]->offset(n));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  for (int n = 0; n < top[0]->num(); ++n) {
-    col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n));
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* top_diff = top[0]->gpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+	for (int n = 0; n < top[0]->num(); ++n) {
+		col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_,
+			kernel_h_, kernel_w_, pad_h_, pad_w_,
+			stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n));
+	}
 }
 
-
-
 #ifdef CPU_ONLY
 STUB_GPU(Im2colLayer);
 #endif
 
-INSTANTIATE_CLASS(Im2colLayer);
-REGISTER_LAYER_CLASS(Im2col);
+INSTANTIATE_CLASS (Im2colLayer);
+REGISTER_LAYER_CLASS (Im2col);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp
index 18c035cb..846bcc34 100644
--- a/src/caffe/layers/image_data_layer.cpp
+++ b/src/caffe/layers/image_data_layer.cpp
@@ -15,145 +15,145 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 ImageDataLayer<Dtype>::~ImageDataLayer<Dtype>() {
-  this->JoinPrefetchThread();
+	this->JoinPrefetchThread();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int new_height = this->layer_param_.image_data_param().new_height();
-  const int new_width  = this->layer_param_.image_data_param().new_width();
-  const bool is_color  = this->layer_param_.image_data_param().is_color();
-  string root_folder = this->layer_param_.image_data_param().root_folder();
-
-  CHECK((new_height == 0 && new_width == 0) ||
-      (new_height > 0 && new_width > 0)) << "Current implementation requires "
-      "new_height and new_width to be set at the same time.";
-  // Read the file with filenames and labels
-  const string& source = this->layer_param_.image_data_param().source();
-  LOG(INFO) << "Opening file " << source;
-  std::ifstream infile(source.c_str());
-  string filename;
-  int label;
-  while (infile >> filename >> label) {
-    lines_.push_back(std::make_pair(filename, label));
-  }
-
-  if (this->layer_param_.image_data_param().shuffle()) {
-    // randomly shuffle data
-    LOG(INFO) << "Shuffling data";
-    const unsigned int prefetch_rng_seed = caffe_rng_rand();
-    prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
-    ShuffleImages();
-  }
-  LOG(INFO) << "A total of " << lines_.size() << " images.";
-
-  lines_id_ = 0;
-  // Check if we would need to randomly skip a few data points
-  if (this->layer_param_.image_data_param().rand_skip()) {
-    unsigned int skip = caffe_rng_rand() %
-        this->layer_param_.image_data_param().rand_skip();
-    LOG(INFO) << "Skipping first " << skip << " data points.";
-    CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
-    lines_id_ = skip;
-  }
-  // Read an image, and use it to initialize the top blob.
-  cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-                                    new_height, new_width, is_color);
-  // Use data_transformer to infer the expected blob shape from a cv_image.
-  vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
-  this->transformed_data_.Reshape(top_shape);
-  // Reshape prefetch_data and top[0] according to the batch_size.
-  const int batch_size = this->layer_param_.image_data_param().batch_size();
-  top_shape[0] = batch_size;
-  this->prefetch_data_.Reshape(top_shape);
-  top[0]->ReshapeLike(this->prefetch_data_);
-
-  LOG(INFO) << "output data size: " << top[0]->num() << ","
-      << top[0]->channels() << "," << top[0]->height() << ","
-      << top[0]->width();
-  // label
-  vector<int> label_shape(1, batch_size);
-  top[1]->Reshape(label_shape);
-  this->prefetch_label_.Reshape(label_shape);
+	const vector<Blob<Dtype>*>& top) {
+	const int new_height = this->layer_param_.image_data_param().new_height();
+	const int new_width = this->layer_param_.image_data_param().new_width();
+	const bool is_color = this->layer_param_.image_data_param().is_color();
+	string root_folder = this->layer_param_.image_data_param().root_folder();
+
+	CHECK((new_height == 0 && new_width == 0) ||
+		(new_height > 0 && new_width > 0)) << "Current implementation requires "
+		"new_height and new_width to be set at the same time.";
+	// Read the file with filenames and labels
+	const string& source = this->layer_param_.image_data_param().source();
+	LOG(INFO) << "Opening file " << source;
+	std::ifstream infile(source.c_str());
+	string filename;
+	int label;
+	while (infile >> filename >> label) {
+		lines_.push_back(std::make_pair(filename, label));
+	}
+
+	if (this->layer_param_.image_data_param().shuffle()) {
+		// randomly shuffle data
+		LOG(INFO) << "Shuffling data";
+		const unsigned int prefetch_rng_seed = caffe_rng_rand();
+		prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
+		ShuffleImages();
+	}
+	LOG(INFO) << "A total of " << lines_.size() << " images.";
+
+	lines_id_ = 0;
+	// Check if we would need to randomly skip a few data points
+	if (this->layer_param_.image_data_param().rand_skip()) {
+		unsigned int skip = caffe_rng_rand() %
+			this->layer_param_.image_data_param().rand_skip();
+		LOG(INFO) << "Skipping first " << skip << " data points.";
+		CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
+		lines_id_ = skip;
+	}
+	// Read an image, and use it to initialize the top blob.
+	cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
+		new_height, new_width, is_color);
+	// Use data_transformer to infer the expected blob shape from a cv_image.
+	vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
+	this->transformed_data_.Reshape(top_shape);
+	// Reshape prefetch_data and top[0] according to the batch_size.
+	const int batch_size = this->layer_param_.image_data_param().batch_size();
+	top_shape[0] = batch_size;
+	this->prefetch_data_.Reshape(top_shape);
+	top[0]->ReshapeLike(this->prefetch_data_);
+
+	LOG(INFO) << "output data size: " << top[0]->num() << ","
+		<< top[0]->channels() << "," << top[0]->height() << ","
+		<< top[0]->width();
+	// label
+	vector<int> label_shape(1, batch_size);
+	top[1]->Reshape(label_shape);
+	this->prefetch_label_.Reshape(label_shape);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ImageDataLayer<Dtype>::ShuffleImages() {
-  caffe::rng_t* prefetch_rng =
-      static_cast<caffe::rng_t*>(prefetch_rng_->generator());
-  shuffle(lines_.begin(), lines_.end(), prefetch_rng);
+	caffe::rng_t* prefetch_rng =
+		static_cast<caffe::rng_t*>(prefetch_rng_->generator());
+	shuffle(lines_.begin(), lines_.end(), prefetch_rng);
 }
 
 // This function is used to create a thread that prefetches the data.
-template <typename Dtype>
+template<typename Dtype>
 void ImageDataLayer<Dtype>::InternalThreadEntry() {
-  CPUTimer batch_timer;
-  batch_timer.Start();
-  double read_time = 0;
-  double trans_time = 0;
-  CPUTimer timer;
-  CHECK(this->prefetch_data_.count());
-  CHECK(this->transformed_data_.count());
-  ImageDataParameter image_data_param = this->layer_param_.image_data_param();
-  const int batch_size = image_data_param.batch_size();
-  const int new_height = image_data_param.new_height();
-  const int new_width = image_data_param.new_width();
-  const bool is_color = image_data_param.is_color();
-  string root_folder = image_data_param.root_folder();
-
-  // Reshape according to the first image of each batch
-  // on single input batches allows for inputs of varying dimension.
-  cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-      new_height, new_width, is_color);
-  // Use data_transformer to infer the expected blob shape from a cv_img.
-  vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
-  this->transformed_data_.Reshape(top_shape);
-  // Reshape prefetch_data according to the batch_size.
-  top_shape[0] = batch_size;
-  this->prefetch_data_.Reshape(top_shape);
-
-  Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data();
-  Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data();
-
-  // datum scales
-  const int lines_size = lines_.size();
-  for (int item_id = 0; item_id < batch_size; ++item_id) {
-    // get a blob
-    timer.Start();
-    CHECK_GT(lines_size, lines_id_);
-    cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-        new_height, new_width, is_color);
-    CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
-    read_time += timer.MicroSeconds();
-    timer.Start();
-    // Apply transformations (mirror, crop...) to the image
-    int offset = this->prefetch_data_.offset(item_id);
-    this->transformed_data_.set_cpu_data(prefetch_data + offset);
-    this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
-    trans_time += timer.MicroSeconds();
-
-    prefetch_label[item_id] = lines_[lines_id_].second;
-    // go to the next iter
-    lines_id_++;
-    if (lines_id_ >= lines_size) {
-      // We have reached the end. Restart from the first.
-      DLOG(INFO) << "Restarting data prefetching from start.";
-      lines_id_ = 0;
-      if (this->layer_param_.image_data_param().shuffle()) {
-        ShuffleImages();
-      }
-    }
-  }
-  batch_timer.Stop();
-  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
-  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
-  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
+	CPUTimer batch_timer;
+	batch_timer.Start();
+	double read_time = 0;
+	double trans_time = 0;
+	CPUTimer timer;
+	CHECK(this->prefetch_data_.count());
+	CHECK(this->transformed_data_.count());
+	ImageDataParameter image_data_param = this->layer_param_.image_data_param();
+	const int batch_size = image_data_param.batch_size();
+	const int new_height = image_data_param.new_height();
+	const int new_width = image_data_param.new_width();
+	const bool is_color = image_data_param.is_color();
+	string root_folder = image_data_param.root_folder();
+
+	// Reshape according to the first image of each batch
+	// on single input batches allows for inputs of varying dimension.
+	cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
+		new_height, new_width, is_color);
+	// Use data_transformer to infer the expected blob shape from a cv_img.
+	vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
+	this->transformed_data_.Reshape(top_shape);
+	// Reshape prefetch_data according to the batch_size.
+	top_shape[0] = batch_size;
+	this->prefetch_data_.Reshape(top_shape);
+
+	Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data();
+	Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data();
+
+	// datum scales
+	const int lines_size = lines_.size();
+	for (int item_id = 0; item_id < batch_size; ++item_id) {
+		// get a blob
+		timer.Start();
+		CHECK_GT(lines_size, lines_id_);
+		cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
+			new_height, new_width, is_color);
+		CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
+		read_time += timer.MicroSeconds();
+		timer.Start();
+		// Apply transformations (mirror, crop...) to the image
+		int offset = this->prefetch_data_.offset(item_id);
+		this->transformed_data_.set_cpu_data(prefetch_data + offset);
+		this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
+		trans_time += timer.MicroSeconds();
+
+		prefetch_label[item_id] = lines_[lines_id_].second;
+		// go to the next iter
+		lines_id_++;
+		if (lines_id_ >= lines_size) {
+			// We have reached the end. Restart from the first.
+			DLOG(INFO) << "Restarting data prefetching from start.";
+			lines_id_ = 0;
+			if (this->layer_param_.image_data_param().shuffle()) {
+				ShuffleImages();
+			}
+		}
+	}
+	batch_timer.Stop();
+	DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
+	DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
+	DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
 }
 
-INSTANTIATE_CLASS(ImageDataLayer);
-REGISTER_LAYER_CLASS(ImageData);
+INSTANTIATE_CLASS (ImageDataLayer);
+REGISTER_LAYER_CLASS (ImageData);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp
index a1e0b40d..e5294a7e 100644
--- a/src/caffe/layers/infogain_loss_layer.cpp
+++ b/src/caffe/layers/infogain_loss_layer.cpp
@@ -10,101 +10,100 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void InfogainLossLayer<Dtype>::LayerSetUp(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::LayerSetUp(bottom, top);
-  if (bottom.size() < 3) {
-    CHECK(this->layer_param_.infogain_loss_param().has_source())
-        << "Infogain matrix source must be specified.";
-    BlobProto blob_proto;
-    ReadProtoFromBinaryFile(
-      this->layer_param_.infogain_loss_param().source(), &blob_proto);
-    infogain_.FromProto(blob_proto);
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::LayerSetUp(bottom, top);
+	if (bottom.size() < 3) {
+		CHECK(this->layer_param_.infogain_loss_param().has_source())
+			<< "Infogain matrix source must be specified.";
+		BlobProto blob_proto;
+		ReadProtoFromBinaryFile(
+			this->layer_param_.infogain_loss_param().source(), &blob_proto);
+		infogain_.FromProto(blob_proto);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void InfogainLossLayer<Dtype>::Reshape(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::Reshape(bottom, top);
-  Blob<Dtype>* infogain = NULL;
-  if (bottom.size() < 3) {
-    infogain = &infogain_;
-  } else {
-    infogain = bottom[2];
-  }
-  CHECK_EQ(bottom[1]->channels(), 1);
-  CHECK_EQ(bottom[1]->height(), 1);
-  CHECK_EQ(bottom[1]->width(), 1);
-  const int num = bottom[0]->num();
-  const int dim = bottom[0]->count() / num;
-  CHECK_EQ(infogain->num(), 1);
-  CHECK_EQ(infogain->channels(), 1);
-  CHECK_EQ(infogain->height(), dim);
-  CHECK_EQ(infogain->width(), dim);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::Reshape(bottom, top);
+	Blob < Dtype > *infogain = NULL;
+	if (bottom.size() < 3) {
+		infogain = &infogain_;
+	} else {
+		infogain = bottom[2];
+	}
+	CHECK_EQ(bottom[1]->channels(), 1);
+	CHECK_EQ(bottom[1]->height(), 1);
+	CHECK_EQ(bottom[1]->width(), 1);
+	const int num = bottom[0]->num();
+	const int dim = bottom[0]->count() / num;
+	CHECK_EQ(infogain->num(), 1);
+	CHECK_EQ(infogain->channels(), 1);
+	CHECK_EQ(infogain->height(), dim);
+	CHECK_EQ(infogain->width(), dim);
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const Dtype* bottom_label = bottom[1]->cpu_data();
-  const Dtype* infogain_mat = NULL;
-  if (bottom.size() < 3) {
-    infogain_mat = infogain_.cpu_data();
-  } else {
-    infogain_mat = bottom[2]->cpu_data();
-  }
-  int num = bottom[0]->num();
-  int dim = bottom[0]->count() / bottom[0]->num();
-  Dtype loss = 0;
-  for (int i = 0; i < num; ++i) {
-    int label = static_cast<int>(bottom_label[i]);
-    for (int j = 0; j < dim; ++j) {
-      Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-      loss -= infogain_mat[label * dim + j] * log(prob);
-    }
-  }
-  top[0]->mutable_cpu_data()[0] = loss / num;
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	const Dtype* bottom_label = bottom[1]->cpu_data();
+	const Dtype* infogain_mat = NULL;
+	if (bottom.size() < 3) {
+		infogain_mat = infogain_.cpu_data();
+	} else {
+		infogain_mat = bottom[2]->cpu_data();
+	}
+	int num = bottom[0]->num();
+	int dim = bottom[0]->count() / bottom[0]->num();
+	Dtype loss = 0;
+	for (int i = 0; i < num; ++i) {
+		int label = static_cast<int>(bottom_label[i]);
+		for (int j = 0; j < dim; ++j) {
+			Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
+			loss -= infogain_mat[label * dim + j] * log(prob);
+		}
+	}
+	top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void InfogainLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down.size() > 2 && propagate_down[2]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to infogain inputs.";
-  }
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->cpu_data();
-    const Dtype* bottom_label = bottom[1]->cpu_data();
-    const Dtype* infogain_mat = NULL;
-    if (bottom.size() < 3) {
-      infogain_mat = infogain_.cpu_data();
-    } else {
-      infogain_mat = bottom[2]->cpu_data();
-    }
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    int num = bottom[0]->num();
-    int dim = bottom[0]->count() / bottom[0]->num();
-    const Dtype scale = - top[0]->cpu_diff()[0] / num;
-    for (int i = 0; i < num; ++i) {
-      const int label = static_cast<int>(bottom_label[i]);
-      for (int j = 0; j < dim; ++j) {
-        Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-        bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob;
-      }
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[1]) {
+		LOG(FATAL) << this->type()
+			<< " Layer cannot backpropagate to label inputs.";
+	}
+	if (propagate_down.size() > 2 && propagate_down[2]) {
+		LOG(FATAL) << this->type()
+			<< " Layer cannot backpropagate to infogain inputs.";
+	}
+	if (propagate_down[0]) {
+		const Dtype* bottom_data = bottom[0]->cpu_data();
+		const Dtype* bottom_label = bottom[1]->cpu_data();
+		const Dtype* infogain_mat = NULL;
+		if (bottom.size() < 3) {
+			infogain_mat = infogain_.cpu_data();
+		} else {
+			infogain_mat = bottom[2]->cpu_data();
+		}
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		int num = bottom[0]->num();
+		int dim = bottom[0]->count() / bottom[0]->num();
+		const Dtype scale = -top[0]->cpu_diff()[0] / num;
+		for (int i = 0; i < num; ++i) {
+			const int label = static_cast<int>(bottom_label[i]);
+			for (int j = 0; j < dim; ++j) {
+				Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
+				bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob;
+			}
+		}
+	}
 }
 
-INSTANTIATE_CLASS(InfogainLossLayer);
-REGISTER_LAYER_CLASS(InfogainLoss);
+INSTANTIATE_CLASS (InfogainLossLayer);
+REGISTER_LAYER_CLASS (InfogainLoss);
 }  // namespace caffe
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 676650c2..e563aa21 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -9,164 +9,168 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int num_output = this->layer_param_.inner_product_param().num_output();
-  bias_term_ = this->layer_param_.inner_product_param().bias_term();
-  N_ = num_output;
-  const int axis = bottom[0]->CanonicalAxisIndex(
-      this->layer_param_.inner_product_param().axis());
-  // Dimensions starting from "axis" are "flattened" into a single
-  // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
-  // and axis == 1, N inner products with dimension CHW are performed.
-  K_ = bottom[0]->count(axis);
-  // Check if we need to set up the weights
-  if (this->blobs_.size() > 0) {
-    LOG(INFO) << "Skipping parameter initialization";
-  } else {
-    if (bias_term_) {
-      this->blobs_.resize(2);
-    } else {
-      this->blobs_.resize(1);
-    }
-    // Intialize the weight
-    vector<int> weight_shape(2);
-    weight_shape[0] = N_;
-    weight_shape[1] = K_;
-    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
-    // fill the weights
-    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
-        this->layer_param_.inner_product_param().weight_filler()));
-    weight_filler->Fill(this->blobs_[0].get());
-    // If necessary, intiialize and fill the bias term
-    if (bias_term_) {
-      vector<int> bias_shape(1, N_);
-      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
-      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
-          this->layer_param_.inner_product_param().bias_filler()));
-      bias_filler->Fill(this->blobs_[1].get());
-    }
-  }  // parameter initialization
-  this->param_propagate_down_.resize(this->blobs_.size(), true);
+	const vector<Blob<Dtype>*>& top) {
+	const int num_output = this->layer_param_.inner_product_param().num_output();
+	bias_term_ = this->layer_param_.inner_product_param().bias_term();
+	N_ = num_output;
+	const int axis = bottom[0]->CanonicalAxisIndex(
+		this->layer_param_.inner_product_param().axis());
+	// Dimensions starting from "axis" are "flattened" into a single
+	// length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
+	// and axis == 1, N inner products with dimension CHW are performed.
+	K_ = bottom[0]->count(axis);
+	// Check if we need to set up the weights
+	if (this->blobs_.size() > 0) {
+		LOG(INFO) << "Skipping parameter initialization";
+	} else {
+		if (bias_term_) {
+			this->blobs_.resize(2);
+		} else {
+			this->blobs_.resize(1);
+		}
+		// Intialize the weight
+		vector<int> weight_shape(2);
+		weight_shape[0] = N_;
+		weight_shape[1] = K_;
+		this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
+		// fill the weights
+		shared_ptr < Filler<Dtype> > weight_filler(GetFiller < Dtype > (
+			this->layer_param_.inner_product_param().weight_filler()));
+		weight_filler->Fill(this->blobs_[0].get());
+		// If necessary, intiialize and fill the bias term
+		if (bias_term_) {
+			vector<int> bias_shape(1, N_);
+			this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
+			shared_ptr < Filler<Dtype> > bias_filler(GetFiller < Dtype > (
+				this->layer_param_.inner_product_param().bias_filler()));
+			bias_filler->Fill(this->blobs_[1].get());
+		}
+	}  // parameter initialization
+	this->param_propagate_down_.resize(this->blobs_.size(), true);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void InnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  // Figure out the dimensions
-  const int axis = bottom[0]->CanonicalAxisIndex(
-      this->layer_param_.inner_product_param().axis());
-  const int new_K = bottom[0]->count(axis);
-  CHECK_EQ(K_, new_K)
-      << "Input size incompatible with inner product parameters.";
-  // The first "axis" dimensions are independent inner products; the total
-  // number of these is M_, the product over these dimensions.
-  M_ = bottom[0]->count(0, axis);
-  // The top shape will be the bottom shape with the flattened axes dropped,
-  // and replaced by a single axis with dimension num_output (N_).
-  vector<int> top_shape = bottom[0]->shape();
-  top_shape.resize(axis + 1);
-  top_shape[axis] = N_;
-  top[0]->Reshape(top_shape);
-  // Set up the bias multiplier
-  if (bias_term_) {
-    vector<int> bias_shape(1, M_);
-    bias_multiplier_.Reshape(bias_shape);
-    caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data());
-  }
+	const vector<Blob<Dtype>*>& top) {
+	// Figure out the dimensions
+	const int axis = bottom[0]->CanonicalAxisIndex(
+		this->layer_param_.inner_product_param().axis());
+	const int new_K = bottom[0]->count(axis);
+	CHECK_EQ(K_, new_K)
+		<< "Input size incompatible with inner product parameters.";
+	// The first "axis" dimensions are independent inner products; the total
+	// number of these is M_, the product over these dimensions.
+	M_ = bottom[0]->count(0, axis);
+	// The top shape will be the bottom shape with the flattened axes dropped,
+	// and replaced by a single axis with dimension num_output (N_).
+	vector<int> top_shape = bottom[0]->shape();
+	top_shape.resize(axis + 1);
+	top_shape[axis] = N_;
+	top[0]->Reshape(top_shape);
+	// Set up the bias multiplier
+	if (bias_term_) {
+		vector<int> bias_shape(1, M_);
+		bias_multiplier_.Reshape(bias_shape);
+		caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data());
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void InnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const Dtype* weight = this->blobs_[0]->cpu_data();
-  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
-      bottom_data, weight, (Dtype)0., top_data);
-  if (bias_term_) {
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
-        bias_multiplier_.cpu_data(),
-        this->blobs_[1]->cpu_data(), (Dtype)1., top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const Dtype* weight = this->blobs_[0]->cpu_data();
+	caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1.,
+		bottom_data, weight, (Dtype) 0., top_data);
+	if (bias_term_) {
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1.,
+			bias_multiplier_.cpu_data(),
+			this->blobs_[1]->cpu_data(), (Dtype) 1., top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (this->param_propagate_down_[0]) {
-    const Dtype* top_diff = top[0]->cpu_diff();
-    const Dtype* bottom_data = bottom[0]->cpu_data();
-    // Gradient with respect to weight
-    caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
-  }
-  if (bias_term_ && this->param_propagate_down_[1]) {
-    const Dtype* top_diff = top[0]->cpu_diff();
-    // Gradient with respect to bias
-    caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.cpu_data(), (Dtype)1.,
-        this->blobs_[1]->mutable_cpu_diff());
-  }
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->cpu_diff();
-    // Gradient with respect to bottom data
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
-        top_diff, this->blobs_[0]->cpu_data(), (Dtype)0.,
-        bottom[0]->mutable_cpu_diff());
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (this->param_propagate_down_[0]) {
+		const Dtype* top_diff = top[0]->cpu_diff();
+		const Dtype* bottom_data = bottom[0]->cpu_data();
+		// Gradient with respect to weight
+		caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1.,
+			top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff());
+	}
+	if (bias_term_ && this->param_propagate_down_[1]) {
+		const Dtype* top_diff = top[0]->cpu_diff();
+		// Gradient with respect to bias
+		caffe_cpu_gemv < Dtype > (CblasTrans, M_, N_, (Dtype) 1., top_diff,
+			bias_multiplier_.cpu_data(), (Dtype) 1.,
+			this->blobs_[1]->mutable_cpu_diff());
+	}
+	if (propagate_down[0]) {
+		const Dtype* top_diff = top[0]->cpu_diff();
+		// Gradient with respect to bottom data
+		caffe_cpu_gemm < Dtype
+			> (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1.,
+				top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0.,
+				bottom[0]->mutable_cpu_diff());
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const Dtype* weight = this->blobs_[0]->gpu_data();
-  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_,(Dtype)1.,
-      bottom_data, 0, weight, 0, (Dtype)0., top_data, 0);
-  if (bias_term_) {
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype)1.,
-        bias_multiplier_.gpu_data(),0,
-        this->blobs_[1]->gpu_data(), 0, (Dtype)1., top_data, 0);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const Dtype* weight = this->blobs_[0]->gpu_data();
+	caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1.,
+		bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0);
+	if (bias_term_) {
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1.,
+			bias_multiplier_.gpu_data(), 0,
+			this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (this->param_propagate_down_[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    // Gradient with respect to weight
-    caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, 0, bottom_data, 0, (Dtype)1., this->blobs_[0]->mutable_gpu_diff(), 0);
-  }
-  if (bias_term_ && this->param_propagate_down_[1]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    // Gradient with respect to bias
-    caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., (Dtype*)top_diff,
-        (size_t)0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()),
-         (size_t)0, (Dtype)0., 1,
-        this->blobs_[1]->mutable_gpu_diff(), (size_t)0, 1);
-  }
-  if (propagate_down[0]) {
-    const Dtype* top_diff = top[0]->gpu_diff();
-    // Gradient with respect to bottom data
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
-        top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype)0.,
-        bottom[0]->mutable_gpu_diff(), 0);
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (this->param_propagate_down_[0]) {
+		const Dtype* top_diff = top[0]->gpu_diff();
+		const Dtype* bottom_data = bottom[0]->gpu_data();
+		// Gradient with respect to weight
+		caffe_gpu_gemm < Dtype
+			> (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1.,
+				top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0);
+	}
+	if (bias_term_ && this->param_propagate_down_[1]) {
+		const Dtype* top_diff = top[0]->gpu_diff();
+		// Gradient with respect to bias
+		caffe_gpu_gemv < Dtype
+			> (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff,
+				(size_t) 0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()),
+				(size_t) 0, (Dtype) 0., 1,
+				this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1);
+	}
+	if (propagate_down[0]) {
+		const Dtype* top_diff = top[0]->gpu_diff();
+		// Gradient with respect to bottom data
+		caffe_gpu_gemm < Dtype
+			> (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1.,
+				top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0.,
+				bottom[0]->mutable_gpu_diff(), 0);
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(InnerProductLayer);
 #endif
 
-INSTANTIATE_CLASS(InnerProductLayer);
-REGISTER_LAYER_CLASS(InnerProduct);
+INSTANTIATE_CLASS (InnerProductLayer);
+REGISTER_LAYER_CLASS (InnerProduct);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index 268c5f5b..e388dfef 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -7,128 +7,130 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void LogLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
-  const Dtype base = this->layer_param_.log_param().base();
-  if (base != Dtype(-1)) {
-    CHECK_GT(base, 0) << "base must be strictly positive.";
-  }
-  // If base == -1, interpret the base as e and set log_base = 1 exactly.
-  // Otherwise, calculate its log explicitly.
-  const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
-  CHECK(!isnan(log_base))
-      << "NaN result: log(base) = log(" << base << ") = " << log_base;
-  CHECK(!isinf(log_base))
-      << "Inf result: log(base) = log(" << base << ") = " << log_base;
-  base_scale_ = Dtype(1) / log_base;
-  CHECK(!isnan(base_scale_))
-      << "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
-  CHECK(!isinf(base_scale_))
-      << "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
-  input_scale_ = this->layer_param_.log_param().scale();
-  input_shift_ = this->layer_param_.log_param().shift();
-  backward_num_scale_ = input_scale_ / log_base;
+	const vector<Blob<Dtype>*>& top) {
+	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+	const Dtype base = this->layer_param_.log_param().base();
+	if (base != Dtype(-1)) {
+		CHECK_GT(base, 0) << "base must be strictly positive.";
+	}
+	// If base == -1, interpret the base as e and set log_base = 1 exactly.
+	// Otherwise, calculate its log explicitly.
+	const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
+	CHECK(!isnan(log_base))
+		<< "NaN result: log(base) = log(" << base << ") = " << log_base;
+	CHECK(!isinf(log_base))
+		<< "Inf result: log(base) = log(" << base << ") = " << log_base;
+	base_scale_ = Dtype(1) / log_base;
+	CHECK(!isnan(base_scale_))
+		<< "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
+	CHECK(!isinf(base_scale_))
+		<< "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
+	input_scale_ = this->layer_param_.log_param().scale();
+	input_shift_ = this->layer_param_.log_param().shift();
+	backward_num_scale_ = input_scale_ / log_base;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LogLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
-    caffe_log(count, bottom_data, top_data);
-  } else {
-    caffe_copy(count, bottom_data, top_data);
-    if (input_scale_ != Dtype(1)) {
-      caffe_scal(count, input_scale_, top_data);
-    }
-    if (input_shift_ != Dtype(0)) {
-      caffe_add_scalar(count, input_shift_, top_data);
-    }
-    caffe_log(count, top_data, top_data);
-  }
-  if (base_scale_ != Dtype(1)) {
-    caffe_scal(count, base_scale_, top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const int count = bottom[0]->count();
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
+		caffe_log(count, bottom_data, top_data);
+	} else {
+		caffe_copy(count, bottom_data, top_data);
+		if (input_scale_ != Dtype(1)) {
+			caffe_scal(count, input_scale_, top_data);
+		}
+		if (input_shift_ != Dtype(0)) {
+			caffe_add_scalar(count, input_shift_, top_data);
+		}
+		caffe_log(count, top_data, top_data);
+	}
+	if (base_scale_ != Dtype(1)) {
+		caffe_scal(count, base_scale_, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  caffe_copy(count, bottom_data, bottom_diff);
-  if (input_scale_ != Dtype(1)) {
-    caffe_scal(count, input_scale_, bottom_diff);
-  }
-  if (input_shift_ != Dtype(0)) {
-    caffe_add_scalar(count, input_shift_, bottom_diff);
-  }
-  caffe_powx(count, bottom_diff, Dtype(-1), bottom_diff);
-  if (backward_num_scale_ != Dtype(1)) {
-    caffe_scal(count, backward_num_scale_, bottom_diff);
-  }
-  caffe_mul(count, top_diff, bottom_diff, bottom_diff);
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	const int count = bottom[0]->count();
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	const Dtype* top_diff = top[0]->cpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	caffe_copy(count, bottom_data, bottom_diff);
+	if (input_scale_ != Dtype(1)) {
+		caffe_scal(count, input_scale_, bottom_diff);
+	}
+	if (input_shift_ != Dtype(0)) {
+		caffe_add_scalar(count, input_shift_, bottom_diff);
+	}
+	caffe_powx(count, bottom_diff, Dtype(-1), bottom_diff);
+	if (backward_num_scale_ != Dtype(1)) {
+		caffe_scal(count, backward_num_scale_, bottom_diff);
+	}
+	caffe_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const int count = bottom[0]->count();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
-    caffe_gpu_log(count, bottom_data, top_data);
-  } else {
-    caffe_gpu_copy(count, bottom_data, top_data);
-    if (input_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, input_scale_, top_data);
-    }
-    if (input_shift_ != Dtype(0)) {
-      caffe_gpu_add_scalar(count, input_shift_, top_data);
-    }
-    caffe_gpu_log(count, top_data, top_data);
-  }
-  if (base_scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, base_scale_, top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const int count = bottom[0]->count();
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
+		caffe_gpu_log(count, bottom_data, top_data);
+	} else {
+		caffe_gpu_copy(count, bottom_data, top_data);
+		if (input_scale_ != Dtype(1)) {
+			caffe_gpu_scal(count, input_scale_, top_data);
+		}
+		if (input_shift_ != Dtype(0)) {
+			caffe_gpu_add_scalar(count, input_shift_, top_data);
+		}
+		caffe_gpu_log(count, top_data, top_data);
+	}
+	if (base_scale_ != Dtype(1)) {
+		caffe_gpu_scal(count, base_scale_, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  if (!propagate_down[0]) { return; }
-    const int count = bottom[0]->count();
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_gpu_copy(count, bottom_data, bottom_diff);
-    if (input_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, input_scale_, bottom_diff);
-    }
-    if (input_shift_ != Dtype(0)) {
-      caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
-    }
-    caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
-    if (backward_num_scale_ != Dtype(1)) {
-      caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
-    }
-    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	const int count = bottom[0]->count();
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	const Dtype* top_diff = top[0]->gpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+	caffe_gpu_copy(count, bottom_data, bottom_diff);
+	if (input_scale_ != Dtype(1)) {
+		caffe_gpu_scal(count, input_scale_, bottom_diff);
+	}
+	if (input_shift_ != Dtype(0)) {
+		caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
+	}
+	caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
+	if (backward_num_scale_ != Dtype(1)) {
+		caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
+	}
+	caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
-
-
 #ifdef CPU_ONLY
 STUB_GPU(LogLayer);
 #endif
 
-INSTANTIATE_CLASS(LogLayer);
-REGISTER_LAYER_CLASS(Log);
+INSTANTIATE_CLASS (LogLayer);
+REGISTER_LAYER_CLASS (Log);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp
index 3496a5c2..503014f5 100644
--- a/src/caffe/layers/loss_layer.cpp
+++ b/src/caffe/layers/loss_layer.cpp
@@ -10,24 +10,24 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void LossLayer<Dtype>::LayerSetUp(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // LossLayers have a non-zero (1) loss by default.
-  if (this->layer_param_.loss_weight_size() == 0) {
-    this->layer_param_.add_loss_weight(Dtype(1));
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	// LossLayers have a non-zero (1) loss by default.
+	if (this->layer_param_.loss_weight_size() == 0) {
+		this->layer_param_.add_loss_weight(Dtype(1));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LossLayer<Dtype>::Reshape(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(bottom[0]->num(), bottom[1]->num())
-      << "The data and label should have the same number.";
-  vector<int> loss_shape(0);  // Loss layers output a scalar; 0 axes.
-  top[0]->Reshape(loss_shape);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(bottom[0]->num(), bottom[1]->num())
+		<< "The data and label should have the same number.";
+	vector<int> loss_shape(0);  // Loss layers output a scalar; 0 axes.
+	top[0]->Reshape(loss_shape);
 }
 
-INSTANTIATE_CLASS(LossLayer);
+INSTANTIATE_CLASS (LossLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index d2f1c247..0f936f22 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -8,311 +8,311 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  size_ = this->layer_param_.lrn_param().local_size();
-  CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size";
-  pre_pad_ = (size_ - 1) / 2;
-  alpha_ = this->layer_param_.lrn_param().alpha();
-  beta_ = this->layer_param_.lrn_param().beta();
-  k_ = this->layer_param_.lrn_param().k();
-  if (this->layer_param_.lrn_param().norm_region() ==
-      LRNParameter_NormRegion_WITHIN_CHANNEL) {
-    // Set up split_layer_ to use inputs in the numerator and denominator.
-    split_top_vec_.clear();
-    split_top_vec_.push_back(&product_input_);
-    split_top_vec_.push_back(&square_input_);
-    LayerParameter split_param;
-    split_layer_.reset(new SplitLayer<Dtype>(split_param));
-    split_layer_->SetUp(bottom, split_top_vec_);
-    // Set up square_layer_ to square the inputs.
-    square_bottom_vec_.clear();
-    square_top_vec_.clear();
-    square_bottom_vec_.push_back(&square_input_);
-    square_top_vec_.push_back(&square_output_);
-    LayerParameter square_param;
-    square_param.mutable_power_param()->set_power(Dtype(2));
-    square_layer_.reset(new PowerLayer<Dtype>(square_param));
-    square_layer_->SetUp(square_bottom_vec_, square_top_vec_);
-    // Set up pool_layer_ to sum over square neighborhoods of the input.
-    pool_top_vec_.clear();
-    pool_top_vec_.push_back(&pool_output_);
-    LayerParameter pool_param;
-    pool_param.mutable_pooling_param()->set_pool(
-        PoolingParameter_PoolMethod_AVE);
-    pool_param.mutable_pooling_param()->set_pad(pre_pad_);
-    pool_param.mutable_pooling_param()->set_kernel_size(size_);
-    pool_layer_.reset(new PoolingLayer<Dtype>(pool_param));
-    pool_layer_->SetUp(square_top_vec_, pool_top_vec_);
-    // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is
-    // the sum of a squared neighborhood (the output of pool_layer_).
-    power_top_vec_.clear();
-    power_top_vec_.push_back(&power_output_);
-    LayerParameter power_param;
-    power_param.mutable_power_param()->set_power(-beta_);
-    power_param.mutable_power_param()->set_scale(alpha_);
-    power_param.mutable_power_param()->set_shift(Dtype(1));
-    power_layer_.reset(new PowerLayer<Dtype>(power_param));
-    power_layer_->SetUp(pool_top_vec_, power_top_vec_);
-    // Set up a product_layer_ to compute outputs by multiplying inputs by the
-    // inverse demoninator computed by the power layer.
-    product_bottom_vec_.clear();
-    product_bottom_vec_.push_back(&product_input_);
-    product_bottom_vec_.push_back(&power_output_);
-    LayerParameter product_param;
-    EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param();
-    eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD);
-    product_layer_.reset(new EltwiseLayer<Dtype>(product_param));
-    product_layer_->SetUp(product_bottom_vec_, top);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	size_ = this->layer_param_.lrn_param().local_size();
+	CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size";
+	pre_pad_ = (size_ - 1) / 2;
+	alpha_ = this->layer_param_.lrn_param().alpha();
+	beta_ = this->layer_param_.lrn_param().beta();
+	k_ = this->layer_param_.lrn_param().k();
+	if (this->layer_param_.lrn_param().norm_region() ==
+		LRNParameter_NormRegion_WITHIN_CHANNEL) {
+		// Set up split_layer_ to use inputs in the numerator and denominator.
+		split_top_vec_.clear();
+		split_top_vec_.push_back(&product_input_);
+		split_top_vec_.push_back(&square_input_);
+		LayerParameter split_param;
+		split_layer_.reset(new SplitLayer<Dtype>(split_param));
+		split_layer_->SetUp(bottom, split_top_vec_);
+		// Set up square_layer_ to square the inputs.
+		square_bottom_vec_.clear();
+		square_top_vec_.clear();
+		square_bottom_vec_.push_back(&square_input_);
+		square_top_vec_.push_back(&square_output_);
+		LayerParameter square_param;
+		square_param.mutable_power_param()->set_power(Dtype(2));
+		square_layer_.reset(new PowerLayer<Dtype>(square_param));
+		square_layer_->SetUp(square_bottom_vec_, square_top_vec_);
+		// Set up pool_layer_ to sum over square neighborhoods of the input.
+		pool_top_vec_.clear();
+		pool_top_vec_.push_back(&pool_output_);
+		LayerParameter pool_param;
+		pool_param.mutable_pooling_param()->set_pool(
+			PoolingParameter_PoolMethod_AVE);
+		pool_param.mutable_pooling_param()->set_pad(pre_pad_);
+		pool_param.mutable_pooling_param()->set_kernel_size(size_);
+		pool_layer_.reset(new PoolingLayer<Dtype>(pool_param));
+		pool_layer_->SetUp(square_top_vec_, pool_top_vec_);
+		// Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is
+		// the sum of a squared neighborhood (the output of pool_layer_).
+		power_top_vec_.clear();
+		power_top_vec_.push_back(&power_output_);
+		LayerParameter power_param;
+		power_param.mutable_power_param()->set_power(-beta_);
+		power_param.mutable_power_param()->set_scale(alpha_);
+		power_param.mutable_power_param()->set_shift(Dtype(1));
+		power_layer_.reset(new PowerLayer<Dtype>(power_param));
+		power_layer_->SetUp(pool_top_vec_, power_top_vec_);
+		// Set up a product_layer_ to compute outputs by multiplying inputs by the
+		// inverse demoninator computed by the power layer.
+		product_bottom_vec_.clear();
+		product_bottom_vec_.push_back(&product_input_);
+		product_bottom_vec_.push_back(&power_output_);
+		LayerParameter product_param;
+		EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param();
+		eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD);
+		product_layer_.reset(new EltwiseLayer<Dtype>(product_param));
+		product_layer_->SetUp(product_bottom_vec_, top);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  num_ = bottom[0]->num();
-  channels_ = bottom[0]->channels();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    top[0]->Reshape(num_, channels_, height_, width_);
-    scale_.Reshape(num_, channels_, height_, width_);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    split_layer_->Reshape(bottom, split_top_vec_);
-    square_layer_->Reshape(square_bottom_vec_, square_top_vec_);
-    pool_layer_->Reshape(square_top_vec_, pool_top_vec_);
-    power_layer_->Reshape(pool_top_vec_, power_top_vec_);
-    product_layer_->Reshape(product_bottom_vec_, top);
-    break;
-  }
-    LFSkernel = clCreateKernel(amdDevice.Program,"LRNFillScalefloat",NULL);
-    LCDkernel = clCreateKernel(amdDevice.Program,"LRNComputeDifffloat",NULL);
-    LCOkernel = clCreateKernel(amdDevice.Program,"LRNComputeOutputfloat",NULL);
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+		<< "corresponding to (num, channels, height, width)";
+	num_ = bottom[0]->num();
+	channels_ = bottom[0]->channels();
+	height_ = bottom[0]->height();
+	width_ = bottom[0]->width();
+	switch (this->layer_param_.lrn_param().norm_region()) {
+		case LRNParameter_NormRegion_ACROSS_CHANNELS:
+			top[0]->Reshape(num_, channels_, height_, width_);
+			scale_.Reshape(num_, channels_, height_, width_);
+			break;
+		case LRNParameter_NormRegion_WITHIN_CHANNEL:
+			split_layer_->Reshape(bottom, split_top_vec_);
+			square_layer_->Reshape(square_bottom_vec_, square_top_vec_);
+			pool_layer_->Reshape(square_top_vec_, pool_top_vec_);
+			power_layer_->Reshape(pool_top_vec_, power_top_vec_);
+			product_layer_->Reshape(product_bottom_vec_, top);
+			break;
+	}
+	LFSkernel = clCreateKernel(amdDevice.Program, "LRNFillScalefloat", NULL);
+	LCDkernel = clCreateKernel(amdDevice.Program, "LRNComputeDifffloat", NULL);
+	LCOkernel = clCreateKernel(amdDevice.Program, "LRNComputeOutputfloat", NULL);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelForward_cpu(bottom, top);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelForward(bottom, top);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
+	const vector<Blob<Dtype>*>& top) {
+	switch (this->layer_param_.lrn_param().norm_region()) {
+		case LRNParameter_NormRegion_ACROSS_CHANNELS:
+			CrossChannelForward_cpu(bottom, top);
+			break;
+		case LRNParameter_NormRegion_WITHIN_CHANNEL:
+			WithinChannelForward(bottom, top);
+			break;
+		default:
+			LOG(FATAL) << "Unknown normalization region.";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  Dtype* scale_data = scale_.mutable_cpu_data();
-  // start with the constant value
-  for (int i = 0; i < scale_.count(); ++i) {
-    scale_data[i] = k_;
-  }
-  Blob<Dtype> padded_square(1, channels_ + size_ - 1, height_, width_);
-  Dtype* padded_square_data = padded_square.mutable_cpu_data();
-  caffe_set(padded_square.count(), Dtype(0), padded_square_data);
-  Dtype alpha_over_size = alpha_ / size_;
-  // go through the images
-  for (int n = 0; n < num_; ++n) {
-    // compute the padded square
-    caffe_sqr(channels_ * height_ * width_,
-        bottom_data + bottom[0]->offset(n),
-        padded_square_data + padded_square.offset(0, pre_pad_));
-    // Create the first channel scale
-    for (int c = 0; c < size_; ++c) {
-      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
-          padded_square_data + padded_square.offset(0, c),
-          scale_data + scale_.offset(n, 0));
-    }
-    for (int c = 1; c < channels_; ++c) {
-      // copy previous scale
-      caffe_copy<Dtype>(height_ * width_,
-          scale_data + scale_.offset(n, c - 1),
-          scale_data + scale_.offset(n, c));
-      // add head
-      caffe_axpy<Dtype>(height_ * width_, alpha_over_size,
-          padded_square_data + padded_square.offset(0, c + size_ - 1),
-          scale_data + scale_.offset(n, c));
-      // subtract tail
-      caffe_axpy<Dtype>(height_ * width_, -alpha_over_size,
-          padded_square_data + padded_square.offset(0, c - 1),
-          scale_data + scale_.offset(n, c));
-    }
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	Dtype* scale_data = scale_.mutable_cpu_data();
+	// start with the constant value
+	for (int i = 0; i < scale_.count(); ++i) {
+		scale_data[i] = k_;
+	}
+	Blob < Dtype > padded_square(1, channels_ + size_ - 1, height_, width_);
+	Dtype* padded_square_data = padded_square.mutable_cpu_data();
+	caffe_set(padded_square.count(), Dtype(0), padded_square_data);
+	Dtype alpha_over_size = alpha_ / size_;
+	// go through the images
+	for (int n = 0; n < num_; ++n) {
+		// compute the padded square
+		caffe_sqr(channels_ * height_ * width_,
+			bottom_data + bottom[0]->offset(n),
+			padded_square_data + padded_square.offset(0, pre_pad_));
+		// Create the first channel scale
+		for (int c = 0; c < size_; ++c) {
+			caffe_axpy < Dtype > (height_ * width_, alpha_over_size,
+				padded_square_data + padded_square.offset(0, c),
+				scale_data + scale_.offset(n, 0));
+		}
+		for (int c = 1; c < channels_; ++c) {
+			// copy previous scale
+			caffe_copy < Dtype > (height_ * width_,
+				scale_data + scale_.offset(n, c - 1),
+				scale_data + scale_.offset(n, c));
+			// add head
+			caffe_axpy < Dtype > (height_ * width_, alpha_over_size,
+				padded_square_data + padded_square.offset(0, c + size_ - 1),
+				scale_data + scale_.offset(n, c));
+			// subtract tail
+			caffe_axpy < Dtype > (height_ * width_, -alpha_over_size,
+				padded_square_data + padded_square.offset(0, c - 1),
+				scale_data + scale_.offset(n, c));
+		}
+	}
 
-  // In the end, compute output
-  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, top_data);
-  caffe_mul<Dtype>(scale_.count(), top_data, bottom_data, top_data);
+	// In the end, compute output
+	caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, top_data);
+	caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::WithinChannelForward(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  split_layer_->Forward(bottom, split_top_vec_);
-  square_layer_->Forward(square_bottom_vec_, square_top_vec_);
-  pool_layer_->Forward(square_top_vec_, pool_top_vec_);
-  power_layer_->Forward(pool_top_vec_, power_top_vec_);
-  product_layer_->Forward(product_bottom_vec_, top);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	split_layer_->Forward(bottom, split_top_vec_);
+	square_layer_->Forward(square_bottom_vec_, square_top_vec_);
+	pool_layer_->Forward(square_top_vec_, pool_top_vec_);
+	power_layer_->Forward(pool_top_vec_, power_top_vec_);
+	product_layer_->Forward(product_bottom_vec_, top);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelBackward_cpu(top, propagate_down, bottom);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelBackward(top, propagate_down, bottom);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	switch (this->layer_param_.lrn_param().norm_region()) {
+		case LRNParameter_NormRegion_ACROSS_CHANNELS:
+			CrossChannelBackward_cpu(top, propagate_down, bottom);
+			break;
+		case LRNParameter_NormRegion_WITHIN_CHANNEL:
+			WithinChannelBackward(top, propagate_down, bottom);
+			break;
+		default:
+			LOG(FATAL) << "Unknown normalization region.";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::CrossChannelBackward_cpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->cpu_diff();
-  const Dtype* top_data = top[0]->cpu_data();
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const Dtype* scale_data = scale_.cpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  Blob<Dtype> padded_ratio(1, channels_ + size_ - 1, height_, width_);
-  Blob<Dtype> accum_ratio(1, 1, height_, width_);
-  Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data();
-  Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data();
-  // We hack a little bit by using the diff() to store an additional result
-  Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();
-  caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);
-  Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;
+	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* top_diff = top[0]->cpu_diff();
+	const Dtype* top_data = top[0]->cpu_data();
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	const Dtype* scale_data = scale_.cpu_data();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	Blob < Dtype > padded_ratio(1, channels_ + size_ - 1, height_, width_);
+	Blob < Dtype > accum_ratio(1, 1, height_, width_);
+	Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data();
+	Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data();
+	// We hack a little bit by using the diff() to store an additional result
+	Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();
+	caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);
+	Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;
 
-  caffe_powx<Dtype>(scale_.count(), scale_data, -beta_, bottom_diff);
-  caffe_mul<Dtype>(scale_.count(), top_diff, bottom_diff, bottom_diff);
+	caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, bottom_diff);
+	caffe_mul < Dtype > (scale_.count(), top_diff, bottom_diff, bottom_diff);
 
-  // go through individual data
-  int inverse_pre_pad = size_ - (size_ + 1) / 2;
-  for (int n = 0; n < num_; ++n) {
-    int block_offset = scale_.offset(n);
-    // first, compute diff_i * y_i / s_i
-    caffe_mul<Dtype>(channels_ * height_ * width_,
-        top_diff + block_offset, top_data + block_offset,
-        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
-    caffe_div<Dtype>(channels_ * height_ * width_,
-        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),
-        scale_data + block_offset,
-        padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
-    // Now, compute the accumulated ratios and the bottom diff
-    caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);
-    for (int c = 0; c < size_ - 1; ++c) {
-      caffe_axpy<Dtype>(height_ * width_, 1.,
-          padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
-    }
-    for (int c = 0; c < channels_; ++c) {
-      caffe_axpy<Dtype>(height_ * width_, 1.,
-          padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),
-          accum_ratio_data);
-      // compute bottom diff
-      caffe_mul<Dtype>(height_ * width_,
-          bottom_data + top[0]->offset(n, c),
-          accum_ratio_data, accum_ratio_times_bottom);
-      caffe_axpy<Dtype>(height_ * width_, -cache_ratio_value,
-          accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));
-      caffe_axpy<Dtype>(height_ * width_, -1.,
-          padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
-    }
-  }
+	// go through individual data
+	int inverse_pre_pad = size_ - (size_ + 1) / 2;
+	for (int n = 0; n < num_; ++n) {
+		int block_offset = scale_.offset(n);
+		// first, compute diff_i * y_i / s_i
+		caffe_mul < Dtype > (channels_ * height_ * width_,
+			top_diff + block_offset, top_data + block_offset,
+			padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
+		caffe_div < Dtype > (channels_ * height_ * width_,
+			padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),
+			scale_data + block_offset,
+			padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
+		// Now, compute the accumulated ratios and the bottom diff
+		caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);
+		for (int c = 0; c < size_ - 1; ++c) {
+			caffe_axpy < Dtype > (height_ * width_, 1.,
+				padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
+		}
+		for (int c = 0; c < channels_; ++c) {
+			caffe_axpy < Dtype > (height_ * width_, 1.,
+				padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),
+				accum_ratio_data);
+			// compute bottom diff
+			caffe_mul < Dtype > (height_ * width_,
+				bottom_data + top[0]->offset(n, c),
+				accum_ratio_data, accum_ratio_times_bottom);
+			caffe_axpy < Dtype > (height_ * width_, -cache_ratio_value,
+				accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));
+			caffe_axpy < Dtype > (height_ * width_, -1.,
+				padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::WithinChannelBackward(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    vector<bool> product_propagate_down(2, true);
-    product_layer_->Backward(top, product_propagate_down, product_bottom_vec_);
-    power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_);
-    pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_);
-    square_layer_->Backward(square_top_vec_, propagate_down,
-                            square_bottom_vec_);
-    split_layer_->Backward(split_top_vec_, propagate_down, bottom);
-  }
+	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		vector<bool> product_propagate_down(2, true);
+		product_layer_->Backward(top, product_propagate_down, product_bottom_vec_);
+		power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_);
+		pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_);
+		square_layer_->Backward(square_top_vec_, propagate_down,
+			square_bottom_vec_);
+		split_layer_->Backward(split_top_vec_, propagate_down, bottom);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, compute scale
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  // We will launch one kernel for each pixel location, and have the kernel
-  // go through all the channels.
-  int n_threads = num_ * height_ * width_;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNFillScale(LFSkernel,
-      n_threads, bottom_data, num_, channels_, height_, width_, size_,
-      alpha_ / size_, k_, scale_data);
-  n_threads = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNComputeOutput(LCOkernel,
-      n_threads, bottom_data, scale_data, -beta_, top_data);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	// First, compute scale
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	Dtype* scale_data = scale_.mutable_gpu_data();
+	// We will launch one kernel for each pixel location, and have the kernel
+	// go through all the channels.
+	int n_threads = num_ * height_ * width_;
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	LRNFillScale(LFSkernel,
+		n_threads, bottom_data, num_, channels_, height_, width_, size_,
+		alpha_ / size_, k_, scale_data);
+	n_threads = bottom[0]->count();
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	LRNComputeOutput(LCOkernel,
+		n_threads, bottom_data, scale_data, -beta_, top_data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::CrossChannelBackward_gpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  int n_threads = num_ * height_ * width_;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNComputeDiff(LCDkernel,
-      n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
-      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
-      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
-      bottom[0]->mutable_gpu_diff());
+	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	int n_threads = num_ * height_ * width_;
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	LRNComputeDiff(LCDkernel,
+		n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
+		scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
+		size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+		bottom[0]->mutable_gpu_diff());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelForward_gpu(bottom, top);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelForward(bottom, top);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
+	const vector<Blob<Dtype>*>& top) {
+	switch (this->layer_param_.lrn_param().norm_region()) {
+		case LRNParameter_NormRegion_ACROSS_CHANNELS:
+			CrossChannelForward_gpu(bottom, top);
+			break;
+		case LRNParameter_NormRegion_WITHIN_CHANNEL:
+			WithinChannelForward(bottom, top);
+			break;
+		default:
+			LOG(FATAL) << "Unknown normalization region.";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    CrossChannelBackward_gpu(top, propagate_down, bottom);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    WithinChannelBackward(top, propagate_down, bottom);
-    break;
-  default:
-    LOG(FATAL) << "Unknown normalization region.";
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	switch (this->layer_param_.lrn_param().norm_region()) {
+		case LRNParameter_NormRegion_ACROSS_CHANNELS:
+			CrossChannelBackward_gpu(top, propagate_down, bottom);
+			break;
+		case LRNParameter_NormRegion_WITHIN_CHANNEL:
+			WithinChannelBackward(top, propagate_down, bottom);
+			break;
+		default:
+			LOG(FATAL) << "Unknown normalization region.";
+	}
 }
 #ifdef CPU_ONLY
 STUB_GPU(LRNLayer);
@@ -320,7 +320,7 @@ STUB_GPU_FORWARD(LRNLayer, CrossChannelForward);
 STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward);
 #endif
 
-INSTANTIATE_CLASS(LRNLayer);
-REGISTER_LAYER_CLASS(LRN);
+INSTANTIATE_CLASS (LRNLayer);
+REGISTER_LAYER_CLASS (LRN);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp
index 42de4198..2cd04f93 100644
--- a/src/caffe/layers/memory_data_layer.cpp
+++ b/src/caffe/layers/memory_data_layer.cpp
@@ -8,114 +8,114 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void MemoryDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-     const vector<Blob<Dtype>*>& top) {
-  batch_size_ = this->layer_param_.memory_data_param().batch_size();
-  channels_ = this->layer_param_.memory_data_param().channels();
-  height_ = this->layer_param_.memory_data_param().height();
-  width_ = this->layer_param_.memory_data_param().width();
-  size_ = channels_ * height_ * width_;
-  CHECK_GT(batch_size_ * size_, 0) <<
-      "batch_size, channels, height, and width must be specified and"
-      " positive in memory_data_param";
-  vector<int> label_shape(1, batch_size_);
-  top[0]->Reshape(batch_size_, channels_, height_, width_);
-  top[1]->Reshape(label_shape);
-  added_data_.Reshape(batch_size_, channels_, height_, width_);
-  added_label_.Reshape(label_shape);
-  data_ = NULL;
-  labels_ = NULL;
-  added_data_.cpu_data();
-  added_label_.cpu_data();
+	const vector<Blob<Dtype>*>& top) {
+	batch_size_ = this->layer_param_.memory_data_param().batch_size();
+	channels_ = this->layer_param_.memory_data_param().channels();
+	height_ = this->layer_param_.memory_data_param().height();
+	width_ = this->layer_param_.memory_data_param().width();
+	size_ = channels_ * height_ * width_;
+	CHECK_GT(batch_size_ * size_, 0) <<
+		"batch_size, channels, height, and width must be specified and"
+			" positive in memory_data_param";
+	vector<int> label_shape(1, batch_size_);
+	top[0]->Reshape(batch_size_, channels_, height_, width_);
+	top[1]->Reshape(label_shape);
+	added_data_.Reshape(batch_size_, channels_, height_, width_);
+	added_label_.Reshape(label_shape);
+	data_ = NULL;
+	labels_ = NULL;
+	added_data_.cpu_data();
+	added_label_.cpu_data();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MemoryDataLayer<Dtype>::AddDatumVector(const vector<Datum>& datum_vector) {
-  CHECK(!has_new_data_) <<
-      "Can't add data until current data has been consumed.";
-  size_t num = datum_vector.size();
-  CHECK_GT(num, 0) << "There is no datum to add.";
-  CHECK_EQ(num % batch_size_, 0) <<
-      "The added data must be a multiple of the batch size.";
-  added_data_.Reshape(num, channels_, height_, width_);
-  added_label_.Reshape(num, 1, 1, 1);
-  // Apply data transformations (mirror, scale, crop...)
-  this->data_transformer_->Transform(datum_vector, &added_data_);
-  // Copy Labels
-  Dtype* top_label = added_label_.mutable_cpu_data();
-  for (int item_id = 0; item_id < num; ++item_id) {
-    top_label[item_id] = datum_vector[item_id].label();
-  }
-  // num_images == batch_size_
-  Dtype* top_data = added_data_.mutable_cpu_data();
-  Reset(top_data, top_label, num);
-  has_new_data_ = true;
+	CHECK(!has_new_data_) <<
+		"Can't add data until current data has been consumed.";
+	size_t num = datum_vector.size();
+	CHECK_GT(num, 0) << "There is no datum to add.";
+	CHECK_EQ(num % batch_size_, 0) <<
+		"The added data must be a multiple of the batch size.";
+	added_data_.Reshape(num, channels_, height_, width_);
+	added_label_.Reshape(num, 1, 1, 1);
+	// Apply data transformations (mirror, scale, crop...)
+	this->data_transformer_->Transform(datum_vector, &added_data_);
+	// Copy Labels
+	Dtype* top_label = added_label_.mutable_cpu_data();
+	for (int item_id = 0; item_id < num; ++item_id) {
+		top_label[item_id] = datum_vector[item_id].label();
+	}
+	// num_images == batch_size_
+	Dtype* top_data = added_data_.mutable_cpu_data();
+	Reset(top_data, top_label, num);
+	has_new_data_ = true;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MemoryDataLayer<Dtype>::AddMatVector(const vector<cv::Mat>& mat_vector,
-    const vector<int>& labels) {
-  size_t num = mat_vector.size();
-  CHECK(!has_new_data_) <<
-      "Can't add mat until current data has been consumed.";
-  CHECK_GT(num, 0) << "There is no mat to add";
-  CHECK_EQ(num % batch_size_, 0) <<
-      "The added data must be a multiple of the batch size.";
-  added_data_.Reshape(num, channels_, height_, width_);
-  added_label_.Reshape(num, 1, 1, 1);
-  // Apply data transformations (mirror, scale, crop...)
-  this->data_transformer_->Transform(mat_vector, &added_data_);
-  // Copy Labels
-  Dtype* top_label = added_label_.mutable_cpu_data();
-  for (int item_id = 0; item_id < num; ++item_id) {
-    top_label[item_id] = labels[item_id];
-  }
-  // num_images == batch_size_
-  Dtype* top_data = added_data_.mutable_cpu_data();
-  Reset(top_data, top_label, num);
-  has_new_data_ = true;
+	const vector<int>& labels) {
+	size_t num = mat_vector.size();
+	CHECK(!has_new_data_) <<
+		"Can't add mat until current data has been consumed.";
+	CHECK_GT(num, 0) << "There is no mat to add";
+	CHECK_EQ(num % batch_size_, 0) <<
+		"The added data must be a multiple of the batch size.";
+	added_data_.Reshape(num, channels_, height_, width_);
+	added_label_.Reshape(num, 1, 1, 1);
+	// Apply data transformations (mirror, scale, crop...)
+	this->data_transformer_->Transform(mat_vector, &added_data_);
+	// Copy Labels
+	Dtype* top_label = added_label_.mutable_cpu_data();
+	for (int item_id = 0; item_id < num; ++item_id) {
+		top_label[item_id] = labels[item_id];
+	}
+	// num_images == batch_size_
+	Dtype* top_data = added_data_.mutable_cpu_data();
+	Reset(top_data, top_label, num);
+	has_new_data_ = true;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MemoryDataLayer<Dtype>::Reset(Dtype* data, Dtype* labels, int n) {
-  CHECK(data);
-  CHECK(labels);
-  CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size";
-  // Warn with transformation parameters since a memory array is meant to
-  // be generic and no transformations are done with Reset().
-  if (this->layer_param_.has_transform_param()) {
-    LOG(WARNING) << this->type() << " does not transform array data on Reset()";
-  }
-  data_ = data;
-  labels_ = labels;
-  n_ = n;
-  pos_ = 0;
+	CHECK(data);
+	CHECK(labels);
+	CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size";
+	// Warn with transformation parameters since a memory array is meant to
+	// be generic and no transformations are done with Reset().
+	if (this->layer_param_.has_transform_param()) {
+		LOG(WARNING) << this->type() << " does not transform array data on Reset()";
+	}
+	data_ = data;
+	labels_ = labels;
+	n_ = n;
+	pos_ = 0;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MemoryDataLayer<Dtype>::set_batch_size(int new_size) {
-  CHECK(!has_new_data_) <<
-      "Can't change batch_size until current data has been consumed.";
-  batch_size_ = new_size;
-  added_data_.Reshape(batch_size_, channels_, height_, width_);
-  added_label_.Reshape(batch_size_, 1, 1, 1);
+	CHECK(!has_new_data_) <<
+		"Can't change batch_size until current data has been consumed.";
+	batch_size_ = new_size;
+	added_data_.Reshape(batch_size_, channels_, height_, width_);
+	added_label_.Reshape(batch_size_, 1, 1, 1);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MemoryDataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset";
-  top[0]->Reshape(batch_size_, channels_, height_, width_);
-  top[1]->Reshape(batch_size_, 1, 1, 1);
-  top[0]->set_cpu_data(data_ + pos_ * size_);
-  top[1]->set_cpu_data(labels_ + pos_);
-  pos_ = (pos_ + batch_size_) % n_;
-  if (pos_ == 0)
-    has_new_data_ = false;
+	const vector<Blob<Dtype>*>& top) {
+	CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset";
+	top[0]->Reshape(batch_size_, channels_, height_, width_);
+	top[1]->Reshape(batch_size_, 1, 1, 1);
+	top[0]->set_cpu_data(data_ + pos_ * size_);
+	top[1]->set_cpu_data(labels_ + pos_);
+	pos_ = (pos_ + batch_size_) % n_;
+	if (pos_ == 0)
+		has_new_data_ = false;
 }
 
-INSTANTIATE_CLASS(MemoryDataLayer);
-REGISTER_LAYER_CLASS(MemoryData);
+INSTANTIATE_CLASS (MemoryDataLayer);
+REGISTER_LAYER_CLASS (MemoryData);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp
index 4267a594..5e57cf85 100644
--- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp
+++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp
@@ -10,58 +10,58 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Reshape(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::Reshape(bottom, top);
-  CHECK_EQ(bottom[1]->channels(), 1);
-  CHECK_EQ(bottom[1]->height(), 1);
-  CHECK_EQ(bottom[1]->width(), 1);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::Reshape(bottom, top);
+	CHECK_EQ(bottom[1]->channels(), 1);
+	CHECK_EQ(bottom[1]->height(), 1);
+	CHECK_EQ(bottom[1]->width(), 1);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const Dtype* bottom_label = bottom[1]->cpu_data();
-  int num = bottom[0]->num();
-  int dim = bottom[0]->count() / bottom[0]->num();
-  Dtype loss = 0;
-  for (int i = 0; i < num; ++i) {
-    int label = static_cast<int>(bottom_label[i]);
-    Dtype prob = std::max(
-        bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
-    loss -= log(prob);
-  }
-  top[0]->mutable_cpu_data()[0] = loss / num;
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	const Dtype* bottom_label = bottom[1]->cpu_data();
+	int num = bottom[0]->num();
+	int dim = bottom[0]->count() / bottom[0]->num();
+	Dtype loss = 0;
+	for (int i = 0; i < num; ++i) {
+		int label = static_cast<int>(bottom_label[i]);
+		Dtype prob = std::max(
+			bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
+		loss -= log(prob);
+	}
+	top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Backward_cpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->cpu_data();
-    const Dtype* bottom_label = bottom[1]->cpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    int num = bottom[0]->num();
-    int dim = bottom[0]->count() / bottom[0]->num();
-    caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
-    const Dtype scale = - top[0]->cpu_diff()[0] / num;
-    for (int i = 0; i < num; ++i) {
-      int label = static_cast<int>(bottom_label[i]);
-      Dtype prob = std::max(
-          bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
-      bottom_diff[i * dim + label] = scale / prob;
-    }
-  }
+	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[1]) {
+		LOG(FATAL) << this->type()
+			<< " Layer cannot backpropagate to label inputs.";
+	}
+	if (propagate_down[0]) {
+		const Dtype* bottom_data = bottom[0]->cpu_data();
+		const Dtype* bottom_label = bottom[1]->cpu_data();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		int num = bottom[0]->num();
+		int dim = bottom[0]->count() / bottom[0]->num();
+		caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
+		const Dtype scale = -top[0]->cpu_diff()[0] / num;
+		for (int i = 0; i < num; ++i) {
+			int label = static_cast<int>(bottom_label[i]);
+			Dtype prob = std::max(
+				bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
+			bottom_diff[i * dim + label] = scale / prob;
+		}
+	}
 }
 
-INSTANTIATE_CLASS(MultinomialLogisticLossLayer);
-REGISTER_LAYER_CLASS(MultinomialLogisticLoss);
+INSTANTIATE_CLASS (MultinomialLogisticLossLayer);
+REGISTER_LAYER_CLASS (MultinomialLogisticLoss);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index cbeeb150..0bd4e989 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -7,253 +7,254 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void MVNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
-      bottom[0]->height(), bottom[0]->width());
-  mean_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-      1, 1);
-  variance_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-      1, 1);
-  temp_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-      bottom[0]->height(), bottom[0]->width());
-  sum_multiplier_.Reshape(1, 1,
-      bottom[0]->height(), bottom[0]->width());
-  Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
-  caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
-  eps_ = this->layer_param_.mvn_param().eps();
+	const vector<Blob<Dtype>*>& top) {
+	top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
+		bottom[0]->height(), bottom[0]->width());
+	mean_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+		1, 1);
+	variance_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+		1, 1);
+	temp_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+		bottom[0]->height(), bottom[0]->width());
+	sum_multiplier_.Reshape(1, 1,
+		bottom[0]->height(), bottom[0]->width());
+	Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
+	caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
+	eps_ = this->layer_param_.mvn_param().eps();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MVNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  int num;
-  if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
-  else
-    num = bottom[0]->num() * bottom[0]->channels();
-
-  int dim = bottom[0]->count() / num;
-
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    // put the squares of bottom into temp_
-    caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
-        temp_.mutable_cpu_data());
-
-    // computes variance using var(X) = E(X^2) - (EX)^2
-    caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-        sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
-    caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(),
-        sum_multiplier_.cpu_data(), 0.,
-        variance_.mutable_cpu_data());  // E(X^2)
-    caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
-        temp_.mutable_cpu_data());  // (EX)^2
-    caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
-        variance_.mutable_cpu_data());  // variance
-
-    // do mean and variance normalization
-    // subtract mean
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-            temp_.mutable_cpu_data());
-
-    caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
-
-    // normalize variance
-    caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
-          variance_.mutable_cpu_data());
-
-    caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
-
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-          variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-          temp_.mutable_cpu_data());
-
-    caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
-  } else {
-    caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-            sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
-
-    // subtract mean
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-            temp_.mutable_cpu_data());
-
-    caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	int num;
+	if (this->layer_param_.mvn_param().across_channels())
+		num = bottom[0]->num();
+	else
+		num = bottom[0]->num() * bottom[0]->channels();
+
+	int dim = bottom[0]->count() / num;
+
+	if (this->layer_param_.mvn_param().normalize_variance()) {
+		// put the squares of bottom into temp_
+		caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
+			temp_.mutable_cpu_data());
+
+		// computes variance using var(X) = E(X^2) - (EX)^2
+		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
+			sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
+		caffe_cpu_gemv < Dtype
+			> (CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(),
+				sum_multiplier_.cpu_data(), 0.,
+				variance_.mutable_cpu_data());  // E(X^2)
+		caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
+			temp_.mutable_cpu_data());  // (EX)^2
+		caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
+			variance_.mutable_cpu_data());  // variance
+
+		// do mean and variance normalization
+		// subtract mean
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+			mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+			temp_.mutable_cpu_data());
+
+		caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
+
+		// normalize variance
+		caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
+			variance_.mutable_cpu_data());
+
+		caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
+
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+			variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+			temp_.mutable_cpu_data());
+
+		caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
+	} else {
+		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
+			sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
+
+		// subtract mean
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+			mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+			temp_.mutable_cpu_data());
+
+		caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->cpu_diff();
-  const Dtype* top_data = top[0]->cpu_data();
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-
-  int num;
-  if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
-  else
-    num = bottom[0]->num() * bottom[0]->channels();
-
-  int dim = bottom[0]->count() / num;
-
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
-    caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., bottom_diff,
-          sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-          mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-          bottom_diff);
-    caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
-
-    caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., top_diff,
-            sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-            mean_.cpu_data(), sum_multiplier_.cpu_data(), 1.,
-            bottom_diff);
-
-    caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
-        bottom_diff);
-
-    // put the squares of bottom into temp_
-    caffe_powx(temp_.count(), bottom_data, Dtype(2),
-        temp_.mutable_cpu_data());
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-        variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-        temp_.mutable_cpu_data());
-
-    caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
-  } else {
-    caffe_copy(temp_.count(), top_diff, bottom_diff);
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* top_diff = top[0]->cpu_diff();
+	const Dtype* top_data = top[0]->cpu_data();
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+
+	int num;
+	if (this->layer_param_.mvn_param().across_channels())
+		num = bottom[0]->num();
+	else
+		num = bottom[0]->num() * bottom[0]->channels();
+
+	int dim = bottom[0]->count() / num;
+
+	if (this->layer_param_.mvn_param().normalize_variance()) {
+		caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
+		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff,
+			sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+			mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+			bottom_diff);
+		caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
+
+		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff,
+			sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+			mean_.cpu_data(), sum_multiplier_.cpu_data(), 1.,
+			bottom_diff);
+
+		caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
+			bottom_diff);
+
+		// put the squares of bottom into temp_
+		caffe_powx(temp_.count(), bottom_data, Dtype(2),
+			temp_.mutable_cpu_data());
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+			variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+			temp_.mutable_cpu_data());
+
+		caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
+	} else {
+		caffe_copy(temp_.count(), top_diff, bottom_diff);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int num;
-  if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
-  else
-    num = bottom[0]->num() * bottom[0]->channels();
-
-  int dim = bottom[0]->count() / num;
-
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    // put the squares of bottom into temp_
-    caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
-        temp_.mutable_gpu_data());
-
-    // computes variance using var(X) = E(X^2) - (EX)^2
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-        sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
-        sum_multiplier_.gpu_data(), 0.,
-        variance_.mutable_gpu_data());  // E(X^2)
-    caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
-        temp_.mutable_gpu_data());  // (EX)^2
-    caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
-        variance_.mutable_gpu_data());  // variance
-
-    // do mean and variance normalization
-    // subtract mean
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-
-    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
-
-    // normalize variance
-    caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
-          variance_.mutable_gpu_data());
-
-    caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
-
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-          variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-          temp_.mutable_gpu_data());
-
-    caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
-  } else {
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, bottom_data,
-            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-
-    // subtract mean
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-
-    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	int num;
+	if (this->layer_param_.mvn_param().across_channels())
+		num = bottom[0]->num();
+	else
+		num = bottom[0]->num() * bottom[0]->channels();
+
+	int dim = bottom[0]->count() / num;
+
+	if (this->layer_param_.mvn_param().normalize_variance()) {
+		// put the squares of bottom into temp_
+		caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
+			temp_.mutable_gpu_data());
+
+		// computes variance using var(X) = E(X^2) - (EX)^2
+		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
+			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
+		caffe_gpu_gemv < Dtype
+			> (CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
+				sum_multiplier_.gpu_data(), 0.,
+				variance_.mutable_gpu_data());  // E(X^2)
+		caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
+			temp_.mutable_gpu_data());  // (EX)^2
+		caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
+			variance_.mutable_gpu_data());  // variance
+
+		// do mean and variance normalization
+		// subtract mean
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+			mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+			temp_.mutable_gpu_data());
+
+		caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
+
+		// normalize variance
+		caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
+			variance_.mutable_gpu_data());
+
+		caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
+
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+			variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+			temp_.mutable_gpu_data());
+
+		caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
+	} else {
+		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
+			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
+
+		// subtract mean
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+			mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+			temp_.mutable_gpu_data());
+
+		caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-  int num;
-  if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
-  else
-    num = bottom[0]->num() * bottom[0]->channels();
-
-  int dim = bottom[0]->count() / num;
-
-  if (this->layer_param_.mvn_param().normalize_variance()) {
-    caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., bottom_diff,
-          sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-          mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-          bottom_diff);
-    caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
-
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1., top_diff,
-            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 1.,
-            bottom_diff);
-
-    caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
-        bottom_diff);
-
-    // put the squares of bottom into temp_
-    caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2),
-        temp_.mutable_gpu_data());
-
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-        variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-        temp_.mutable_gpu_data());
-
-    caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
-  } else {
-    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, top_diff,
-            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-            temp_.mutable_gpu_data());
-    caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* top_diff = top[0]->gpu_diff();
+	const Dtype* top_data = top[0]->gpu_data();
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+
+	int num;
+	if (this->layer_param_.mvn_param().across_channels())
+		num = bottom[0]->num();
+	else
+		num = bottom[0]->num() * bottom[0]->channels();
+
+	int dim = bottom[0]->count() / num;
+
+	if (this->layer_param_.mvn_param().normalize_variance()) {
+		caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
+		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff,
+			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+			mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+			bottom_diff);
+		caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
+
+		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff,
+			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+			mean_.gpu_data(), sum_multiplier_.gpu_data(), 1.,
+			bottom_diff);
+
+		caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
+			bottom_diff);
+
+		// put the squares of bottom into temp_
+		caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2),
+			temp_.mutable_gpu_data());
+
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
+			variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+			temp_.mutable_gpu_data());
+
+		caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
+	} else {
+		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, top_diff,
+			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+			mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+			temp_.mutable_gpu_data());
+		caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff);
+	}
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU(MVNLayer);
 #endif
 
-INSTANTIATE_CLASS(MVNLayer);
-REGISTER_LAYER_CLASS(MVN);
+INSTANTIATE_CLASS (MVNLayer);
+REGISTER_LAYER_CLASS (MVN);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/neuron_layer.cpp b/src/caffe/layers/neuron_layer.cpp
index ba67b438..2a0a2088 100644
--- a/src/caffe/layers/neuron_layer.cpp
+++ b/src/caffe/layers/neuron_layer.cpp
@@ -5,12 +5,12 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void NeuronLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  top[0]->ReshapeLike(*bottom[0]);
+	const vector<Blob<Dtype>*>& top) {
+	top[0]->ReshapeLike(*bottom[0]);
 }
 
-INSTANTIATE_CLASS(NeuronLayer);
+INSTANTIATE_CLASS (NeuronLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index ff86400b..d66a24f6 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -13,405 +13,404 @@ namespace caffe {
 using std::min;
 using std::max;
 
-template <typename Dtype>
+template<typename Dtype>
 void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  PoolingParameter pool_param = this->layer_param_.pooling_param();
-  if (pool_param.global_pooling()) {
-    CHECK(!(pool_param.has_kernel_size() ||
-      pool_param.has_kernel_h() || pool_param.has_kernel_w()))
-      << "With Global_pooling: true Filter size cannot specified";
-  } else {
-    CHECK(!pool_param.has_kernel_size() !=
-      !(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
-      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-    CHECK(pool_param.has_kernel_size() ||
-      (pool_param.has_kernel_h() && pool_param.has_kernel_w()))
-      << "For non-square filters both kernel_h and kernel_w are required.";
-  }
-  CHECK((!pool_param.has_pad() && pool_param.has_pad_h()
-      && pool_param.has_pad_w())
-      || (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
-      << "pad is pad OR pad_h and pad_w are required.";
-  CHECK((!pool_param.has_stride() && pool_param.has_stride_h()
-      && pool_param.has_stride_w())
-      || (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
-      << "Stride is stride OR stride_h and stride_w are required.";
-  global_pooling_ = pool_param.global_pooling();
-  if (global_pooling_) {
-    kernel_h_ = bottom[0]->height();
-    kernel_w_ = bottom[0]->width();
-  } else {
-    if (pool_param.has_kernel_size()) {
-      kernel_h_ = kernel_w_ = pool_param.kernel_size();
-    } else {
-      kernel_h_ = pool_param.kernel_h();
-      kernel_w_ = pool_param.kernel_w();
-    }
-  }
-  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-  if (!pool_param.has_pad_h()) {
-    pad_h_ = pad_w_ = pool_param.pad();
-  } else {
-    pad_h_ = pool_param.pad_h();
-    pad_w_ = pool_param.pad_w();
-  }
-  if (!pool_param.has_stride_h()) {
-    stride_h_ = stride_w_ = pool_param.stride();
-  } else {
-    stride_h_ = pool_param.stride_h();
-    stride_w_ = pool_param.stride_w();
-  }
-  if (global_pooling_) {
-    CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
-      << "With Global_pooling: true; only pad = 0 and stride = 1";
-  }
-  if (pad_h_ != 0 || pad_w_ != 0) {
-    CHECK(this->layer_param_.pooling_param().pool()
-        == PoolingParameter_PoolMethod_AVE
-        || this->layer_param_.pooling_param().pool()
-        == PoolingParameter_PoolMethod_MAX)
-        << "Padding implemented only for average and max pooling.";
-    CHECK_LT(pad_h_, kernel_h_);
-    CHECK_LT(pad_w_, kernel_w_);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	PoolingParameter pool_param = this->layer_param_.pooling_param();
+	if (pool_param.global_pooling()) {
+		CHECK(!(pool_param.has_kernel_size() ||
+			pool_param.has_kernel_h() || pool_param.has_kernel_w()))
+			<< "With Global_pooling: true Filter size cannot specified";
+	} else {
+		CHECK(!pool_param.has_kernel_size() !=
+			!(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+			<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+		CHECK(pool_param.has_kernel_size() ||
+			(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+			<< "For non-square filters both kernel_h and kernel_w are required.";
+	}
+	CHECK((!pool_param.has_pad() && pool_param.has_pad_h()
+		&& pool_param.has_pad_w())
+		|| (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
+		<< "pad is pad OR pad_h and pad_w are required.";
+	CHECK((!pool_param.has_stride() && pool_param.has_stride_h()
+		&& pool_param.has_stride_w())
+		|| (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
+		<< "Stride is stride OR stride_h and stride_w are required.";
+	global_pooling_ = pool_param.global_pooling();
+	if (global_pooling_) {
+		kernel_h_ = bottom[0]->height();
+		kernel_w_ = bottom[0]->width();
+	} else {
+		if (pool_param.has_kernel_size()) {
+			kernel_h_ = kernel_w_ = pool_param.kernel_size();
+		} else {
+			kernel_h_ = pool_param.kernel_h();
+			kernel_w_ = pool_param.kernel_w();
+		}
+	}
+	CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+	CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+	if (!pool_param.has_pad_h()) {
+		pad_h_ = pad_w_ = pool_param.pad();
+	} else {
+		pad_h_ = pool_param.pad_h();
+		pad_w_ = pool_param.pad_w();
+	}
+	if (!pool_param.has_stride_h()) {
+		stride_h_ = stride_w_ = pool_param.stride();
+	} else {
+		stride_h_ = pool_param.stride_h();
+		stride_w_ = pool_param.stride_w();
+	}
+	if (global_pooling_) {
+		CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+			<< "With Global_pooling: true; only pad = 0 and stride = 1";
+	}
+	if (pad_h_ != 0 || pad_w_ != 0) {
+		CHECK(this->layer_param_.pooling_param().pool()
+			== PoolingParameter_PoolMethod_AVE
+			|| this->layer_param_.pooling_param().pool()
+				== PoolingParameter_PoolMethod_MAX)
+			<< "Padding implemented only for average and max pooling.";
+		CHECK_LT(pad_h_, kernel_h_);
+		CHECK_LT(pad_w_, kernel_w_);
+	}
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  channels_ = bottom[0]->channels();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
-  if (global_pooling_) {
-    kernel_h_ = bottom[0]->height();
-    kernel_w_ = bottom[0]->width();
-  }
-  pooled_height_ = static_cast<int>(ceil(static_cast<float>(
-      height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
-  pooled_width_ = static_cast<int>(ceil(static_cast<float>(
-      width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
-  if (pad_h_ || pad_w_) {
-    // If we have padding, ensure that the last pooling starts strictly
-    // inside the image (instead of at the padding); otherwise clip the last.
-    if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) {
-      --pooled_height_;
-    }
-    if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) {
-      --pooled_width_;
-    }
-    CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_);
-    CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_);
-  }
-  top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_,
-      pooled_width_);
-  if (top.size() > 1) {
-    top[1]->ReshapeLike(*top[0]);
-  }
-  // If max pooling, we will initialize the vector index part.
-  if (this->layer_param_.pooling_param().pool() ==
-      PoolingParameter_PoolMethod_MAX && top.size() == 1) {
-    max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-        pooled_width_);
-  }
-  // If stochastic pooling, we will initialize the random index part.
-  if (this->layer_param_.pooling_param().pool() ==
-      PoolingParameter_PoolMethod_STOCHASTIC) {
-    rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-      pooled_width_);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+		<< "corresponding to (num, channels, height, width)";
+	channels_ = bottom[0]->channels();
+	height_ = bottom[0]->height();
+	width_ = bottom[0]->width();
+	if (global_pooling_) {
+		kernel_h_ = bottom[0]->height();
+		kernel_w_ = bottom[0]->width();
+	}
+	pooled_height_ = static_cast<int>(ceil(static_cast<float>(
+		height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
+	pooled_width_ = static_cast<int>(ceil(static_cast<float>(
+		width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
+	if (pad_h_ || pad_w_) {
+		// If we have padding, ensure that the last pooling starts strictly
+		// inside the image (instead of at the padding); otherwise clip the last.
+		if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) {
+			--pooled_height_;
+		}
+		if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) {
+			--pooled_width_;
+		}
+		CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_);
+		CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_);
+	}
+	top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_,
+		pooled_width_);
+	if (top.size() > 1) {
+		top[1]->ReshapeLike(*top[0]);
+	}
+	// If max pooling, we will initialize the vector index part.
+	if (this->layer_param_.pooling_param().pool() ==
+		PoolingParameter_PoolMethod_MAX && top.size() == 1) {
+		max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
+			pooled_width_);
+	}
+	// If stochastic pooling, we will initialize the random index part.
+	if (this->layer_param_.pooling_param().pool() ==
+		PoolingParameter_PoolMethod_STOCHASTIC) {
+		rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
+			pooled_width_);
+	}
 }
 
 // TODO(Yangqing): Is there a faster way to do pooling in the channel-first
 // case?
-template <typename Dtype>
+template<typename Dtype>
 void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int top_count = top[0]->count();
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  int* mask = NULL;  // suppress warnings about uninitalized variables
-  Dtype* top_mask = NULL;
-  // Different pooling methods. We explicitly do the switch outside the for
-  // loop to save time, although this results in more code.
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    // Initialize
-    if (use_top_mask) {
-      top_mask = top[1]->mutable_cpu_data();
-      caffe_set(top_count, Dtype(-1), top_mask);
-    } else {
-      mask = max_idx_.mutable_cpu_data();
-      caffe_set(top_count, -1, mask);
-    }
-    caffe_set(top_count, Dtype(-FLT_MAX), top_data);
-    // The main loop
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      for (int c = 0; c < channels_; ++c) {
-        for (int ph = 0; ph < pooled_height_; ++ph) {
-          for (int pw = 0; pw < pooled_width_; ++pw) {
-            int hstart = ph * stride_h_ - pad_h_;
-            int wstart = pw * stride_w_ - pad_w_;
-            int hend = min(hstart + kernel_h_, height_);
-            int wend = min(wstart + kernel_w_, width_);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            const int pool_index = ph * pooled_width_ + pw;
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                const int index = h * width_ + w;
-                if (bottom_data[index] > top_data[pool_index]) {
-                  top_data[pool_index] = bottom_data[index];
-                  if (use_top_mask) {
-                    top_mask[pool_index] = static_cast<Dtype>(index);
-                  } else {
-                    mask[pool_index] = index;
-                  }
-                }
-              }
-            }
-          }
-        }
-        // compute offset
-        bottom_data += bottom[0]->offset(0, 1);
-        top_data += top[0]->offset(0, 1);
-        if (use_top_mask) {
-          top_mask += top[0]->offset(0, 1);
-        } else {
-          mask += top[0]->offset(0, 1);
-        }
-      }
-    }
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    for (int i = 0; i < top_count; ++i) {
-      top_data[i] = 0;
-    }
-    // The main loop
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      for (int c = 0; c < channels_; ++c) {
-        for (int ph = 0; ph < pooled_height_; ++ph) {
-          for (int pw = 0; pw < pooled_width_; ++pw) {
-            int hstart = ph * stride_h_ - pad_h_;
-            int wstart = pw * stride_w_ - pad_w_;
-            int hend = min(hstart + kernel_h_, height_ + pad_h_);
-            int wend = min(wstart + kernel_w_, width_ + pad_w_);
-            int pool_size = (hend - hstart) * (wend - wstart);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            hend = min(hend, height_);
-            wend = min(wend, width_);
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                top_data[ph * pooled_width_ + pw] +=
-                    bottom_data[h * width_ + w];
-              }
-            }
-            top_data[ph * pooled_width_ + pw] /= pool_size;
-          }
-        }
-        // compute offset
-        bottom_data += bottom[0]->offset(0, 1);
-        top_data += top[0]->offset(0, 1);
-      }
-    }
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    NOT_IMPLEMENTED;
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const int top_count = top[0]->count();
+	// We'll output the mask to top[1] if it's of size >1.
+	const bool use_top_mask = top.size() > 1;
+	int* mask = NULL;  // suppress warnings about uninitalized variables
+	Dtype* top_mask = NULL;
+	// Different pooling methods. We explicitly do the switch outside the for
+	// loop to save time, although this results in more code.
+	switch (this->layer_param_.pooling_param().pool()) {
+		case PoolingParameter_PoolMethod_MAX:
+			// Initialize
+			if (use_top_mask) {
+				top_mask = top[1]->mutable_cpu_data();
+				caffe_set(top_count, Dtype(-1), top_mask);
+			} else {
+				mask = max_idx_.mutable_cpu_data();
+				caffe_set(top_count, -1, mask);
+			}
+			caffe_set(top_count, Dtype(-FLT_MAX), top_data);
+			// The main loop
+			for (int n = 0; n < bottom[0]->num(); ++n) {
+				for (int c = 0; c < channels_; ++c) {
+					for (int ph = 0; ph < pooled_height_; ++ph) {
+						for (int pw = 0; pw < pooled_width_; ++pw) {
+							int hstart = ph * stride_h_ - pad_h_;
+							int wstart = pw * stride_w_ - pad_w_;
+							int hend = min(hstart + kernel_h_, height_);
+							int wend = min(wstart + kernel_w_, width_);
+							hstart = max(hstart, 0);
+							wstart = max(wstart, 0);
+							const int pool_index = ph * pooled_width_ + pw;
+							for (int h = hstart; h < hend; ++h) {
+								for (int w = wstart; w < wend; ++w) {
+									const int index = h * width_ + w;
+									if (bottom_data[index] > top_data[pool_index]) {
+										top_data[pool_index] = bottom_data[index];
+										if (use_top_mask) {
+											top_mask[pool_index] = static_cast<Dtype>(index);
+										} else {
+											mask[pool_index] = index;
+										}
+									}
+								}
+							}
+						}
+					}
+					// compute offset
+					bottom_data += bottom[0]->offset(0, 1);
+					top_data += top[0]->offset(0, 1);
+					if (use_top_mask) {
+						top_mask += top[0]->offset(0, 1);
+					} else {
+						mask += top[0]->offset(0, 1);
+					}
+				}
+			}
+			break;
+		case PoolingParameter_PoolMethod_AVE:
+			for (int i = 0; i < top_count; ++i) {
+				top_data[i] = 0;
+			}
+			// The main loop
+			for (int n = 0; n < bottom[0]->num(); ++n) {
+				for (int c = 0; c < channels_; ++c) {
+					for (int ph = 0; ph < pooled_height_; ++ph) {
+						for (int pw = 0; pw < pooled_width_; ++pw) {
+							int hstart = ph * stride_h_ - pad_h_;
+							int wstart = pw * stride_w_ - pad_w_;
+							int hend = min(hstart + kernel_h_, height_ + pad_h_);
+							int wend = min(wstart + kernel_w_, width_ + pad_w_);
+							int pool_size = (hend - hstart) * (wend - wstart);
+							hstart = max(hstart, 0);
+							wstart = max(wstart, 0);
+							hend = min(hend, height_);
+							wend = min(wend, width_);
+							for (int h = hstart; h < hend; ++h) {
+								for (int w = wstart; w < wend; ++w) {
+									top_data[ph * pooled_width_ + pw] +=
+										bottom_data[h * width_ + w];
+								}
+							}
+							top_data[ph * pooled_width_ + pw] /= pool_size;
+						}
+					}
+					// compute offset
+					bottom_data += bottom[0]->offset(0, 1);
+					top_data += top[0]->offset(0, 1);
+				}
+			}
+			break;
+		case PoolingParameter_PoolMethod_STOCHASTIC:
+			NOT_IMPLEMENTED;
+			break;
+		default:
+			LOG(FATAL) << "Unknown pooling method.";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  // Different pooling methods. We explicitly do the switch outside the for
-  // loop to save time, although this results in more codes.
-  caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  const int* mask = NULL;  // suppress warnings about uninitialized variables
-  const Dtype* top_mask = NULL;
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    // The main loop
-    if (use_top_mask) {
-      top_mask = top[1]->cpu_data();
-    } else {
-      mask = max_idx_.cpu_data();
-    }
-    for (int n = 0; n < top[0]->num(); ++n) {
-      for (int c = 0; c < channels_; ++c) {
-        for (int ph = 0; ph < pooled_height_; ++ph) {
-          for (int pw = 0; pw < pooled_width_; ++pw) {
-            const int index = ph * pooled_width_ + pw;
-            const int bottom_index =
-                use_top_mask ? top_mask[index] : mask[index];
-            bottom_diff[bottom_index] += top_diff[index];
-          }
-        }
-        bottom_diff += bottom[0]->offset(0, 1);
-        top_diff += top[0]->offset(0, 1);
-        if (use_top_mask) {
-          top_mask += top[0]->offset(0, 1);
-        } else {
-          mask += top[0]->offset(0, 1);
-        }
-      }
-    }
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    // The main loop
-    for (int n = 0; n < top[0]->num(); ++n) {
-      for (int c = 0; c < channels_; ++c) {
-        for (int ph = 0; ph < pooled_height_; ++ph) {
-          for (int pw = 0; pw < pooled_width_; ++pw) {
-            int hstart = ph * stride_h_ - pad_h_;
-            int wstart = pw * stride_w_ - pad_w_;
-            int hend = min(hstart + kernel_h_, height_ + pad_h_);
-            int wend = min(wstart + kernel_w_, width_ + pad_w_);
-            int pool_size = (hend - hstart) * (wend - wstart);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            hend = min(hend, height_);
-            wend = min(wend, width_);
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                bottom_diff[h * width_ + w] +=
-                  top_diff[ph * pooled_width_ + pw] / pool_size;
-              }
-            }
-          }
-        }
-        // offset
-        bottom_diff += bottom[0]->offset(0, 1);
-        top_diff += top[0]->offset(0, 1);
-      }
-    }
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    NOT_IMPLEMENTED;
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	const Dtype* top_diff = top[0]->cpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	// Different pooling methods. We explicitly do the switch outside the for
+	// loop to save time, although this results in more codes.
+	caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
+	// We'll output the mask to top[1] if it's of size >1.
+	const bool use_top_mask = top.size() > 1;
+	const int* mask = NULL;  // suppress warnings about uninitialized variables
+	const Dtype* top_mask = NULL;
+	switch (this->layer_param_.pooling_param().pool()) {
+		case PoolingParameter_PoolMethod_MAX:
+			// The main loop
+			if (use_top_mask) {
+				top_mask = top[1]->cpu_data();
+			} else {
+				mask = max_idx_.cpu_data();
+			}
+			for (int n = 0; n < top[0]->num(); ++n) {
+				for (int c = 0; c < channels_; ++c) {
+					for (int ph = 0; ph < pooled_height_; ++ph) {
+						for (int pw = 0; pw < pooled_width_; ++pw) {
+							const int index = ph * pooled_width_ + pw;
+							const int bottom_index =
+								use_top_mask ? top_mask[index] : mask[index];
+							bottom_diff[bottom_index] += top_diff[index];
+						}
+					}
+					bottom_diff += bottom[0]->offset(0, 1);
+					top_diff += top[0]->offset(0, 1);
+					if (use_top_mask) {
+						top_mask += top[0]->offset(0, 1);
+					} else {
+						mask += top[0]->offset(0, 1);
+					}
+				}
+			}
+			break;
+		case PoolingParameter_PoolMethod_AVE:
+			// The main loop
+			for (int n = 0; n < top[0]->num(); ++n) {
+				for (int c = 0; c < channels_; ++c) {
+					for (int ph = 0; ph < pooled_height_; ++ph) {
+						for (int pw = 0; pw < pooled_width_; ++pw) {
+							int hstart = ph * stride_h_ - pad_h_;
+							int wstart = pw * stride_w_ - pad_w_;
+							int hend = min(hstart + kernel_h_, height_ + pad_h_);
+							int wend = min(wstart + kernel_w_, width_ + pad_w_);
+							int pool_size = (hend - hstart) * (wend - wstart);
+							hstart = max(hstart, 0);
+							wstart = max(wstart, 0);
+							hend = min(hend, height_);
+							wend = min(wend, width_);
+							for (int h = hstart; h < hend; ++h) {
+								for (int w = wstart; w < wend; ++w) {
+									bottom_diff[h * width_ + w] +=
+										top_diff[ph * pooled_width_ + pw] / pool_size;
+								}
+							}
+						}
+					}
+					// offset
+					bottom_diff += bottom[0]->offset(0, 1);
+					top_diff += top[0]->offset(0, 1);
+				}
+			}
+			break;
+		case PoolingParameter_PoolMethod_STOCHASTIC:
+			NOT_IMPLEMENTED;
+			break;
+		default:
+			LOG(FATAL) << "Unknown pooling method.";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-    //Forward_cpu(bottom, top);
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  int count = top[0]->count();
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  int* mask = NULL;
-  Dtype* top_mask = NULL;
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    if (use_top_mask) {
-      top_mask = top[1]->mutable_gpu_data();
-    } else {
-      mask = max_idx_.mutable_gpu_data();
-    }
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
-        mask, top_mask);
-    break;
- case PoolingParameter_PoolMethod_AVE:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolForward(count, bottom_data, bottom[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    if (this->phase_ == TRAIN) {
-      // We need to create the random index as well.
-      caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
-                            rand_idx_.mutable_gpu_data());
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_,
-          height_, width_, pooled_height_, pooled_width_, kernel_h_,
-          kernel_w_, stride_h_, stride_w_,
-          rand_idx_.mutable_gpu_data(), top_data);
-    } else {
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_,
-          height_, width_, pooled_height_, pooled_width_, kernel_h_,
-          kernel_w_, stride_h_, stride_w_, top_data);
-    }
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
+	const vector<Blob<Dtype>*>& top) {
+	//Forward_cpu(bottom, top);
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	int count = top[0]->count();
+	// We'll output the mask to top[1] if it's of size >1.
+	const bool use_top_mask = top.size() > 1;
+	int* mask = NULL;
+	Dtype* top_mask = NULL;
+	switch (this->layer_param_.pooling_param().pool()) {
+		case PoolingParameter_PoolMethod_MAX:
+			if (use_top_mask) {
+				top_mask = top[1]->mutable_gpu_data();
+			} else {
+				mask = max_idx_.mutable_gpu_data();
+			}
+			// NOLINT_NEXT_LINE(whitespace/operators)
+			MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_,
+				height_, width_, pooled_height_, pooled_width_, kernel_h_,
+				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
+				mask, top_mask);
+			break;
+		case PoolingParameter_PoolMethod_AVE:
+			// NOLINT_NEXT_LINE(whitespace/operators)
+			AvePoolForward(count, bottom_data, bottom[0]->num(), channels_,
+				height_, width_, pooled_height_, pooled_width_, kernel_h_,
+				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
+			break;
+		case PoolingParameter_PoolMethod_STOCHASTIC:
+			if (this->phase_ == TRAIN) {
+				// We need to create the random index as well.
+				caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
+					rand_idx_.mutable_gpu_data());
+				// NOLINT_NEXT_LINE(whitespace/operators)
+				StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_,
+					height_, width_, pooled_height_, pooled_width_, kernel_h_,
+					kernel_w_, stride_h_, stride_w_,
+					rand_idx_.mutable_gpu_data(), top_data);
+			} else {
+				// NOLINT_NEXT_LINE(whitespace/operators)
+				StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_,
+					height_, width_, pooled_height_, pooled_width_, kernel_h_,
+					kernel_w_, stride_h_, stride_w_, top_data);
+			}
+			break;
+		default:
+			LOG(FATAL) << "Unknown pooling method.";
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-    //Backward_cpu(top, propagate_down, bottom);
-  if (!propagate_down[0]) {
-    return;
-  }
-  const Dtype* top_diff = top[0]->gpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  const int count = bottom[0]->count();
-  caffe_gpu_set(count, Dtype(0.), bottom_diff);
-  // We'll output the mask to top[1] if it's of size >1.
-  const bool use_top_mask = top.size() > 1;
-  const int* mask = NULL;
-  const Dtype* top_mask = NULL;
-  switch (this->layer_param_.pooling_param().pool()) {
-  case PoolingParameter_PoolMethod_MAX:
-    if (use_top_mask) {
-      top_mask = top[1]->gpu_data();
-    } else {
-      mask = max_idx_.gpu_data();
-    }
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_,
-        kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
-        bottom_diff);
-    break;
-  case PoolingParameter_PoolMethod_AVE:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    AvePoolBackward(count, top_diff, top[0]->num(), channels_,
-        height_, width_, pooled_height_, pooled_width_, kernel_h_,
-        kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
-    break;
-  case PoolingParameter_PoolMethod_STOCHASTIC:
-    // NOLINT_NEXT_LINE(whitespace/operators)
-     StoPoolBackward(count, rand_idx_.gpu_data(), top_diff,
-        top[0]->num(), channels_, height_, width_, pooled_height_,
-        pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
-        bottom_diff);
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	//Backward_cpu(top, propagate_down, bottom);
+	if (!propagate_down[0]) {
+		return;
+	}
+	const Dtype* top_diff = top[0]->gpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+	const int count = bottom[0]->count();
+	caffe_gpu_set(count, Dtype(0.), bottom_diff);
+	// We'll output the mask to top[1] if it's of size >1.
+	const bool use_top_mask = top.size() > 1;
+	const int* mask = NULL;
+	const Dtype* top_mask = NULL;
+	switch (this->layer_param_.pooling_param().pool()) {
+		case PoolingParameter_PoolMethod_MAX:
+			if (use_top_mask) {
+				top_mask = top[1]->gpu_data();
+			} else {
+				mask = max_idx_.gpu_data();
+			}
+			// NOLINT_NEXT_LINE(whitespace/operators)
+			MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_,
+				height_, width_, pooled_height_, pooled_width_,
+				kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
+				bottom_diff);
+			break;
+		case PoolingParameter_PoolMethod_AVE:
+			// NOLINT_NEXT_LINE(whitespace/operators)
+			AvePoolBackward(count, top_diff, top[0]->num(), channels_,
+				height_, width_, pooled_height_, pooled_width_, kernel_h_,
+				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
+			break;
+		case PoolingParameter_PoolMethod_STOCHASTIC:
+			// NOLINT_NEXT_LINE(whitespace/operators)
+			StoPoolBackward(count, rand_idx_.gpu_data(), top_diff,
+				top[0]->num(), channels_, height_, width_, pooled_height_,
+				pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
+				bottom_diff);
+			break;
+		default:
+			LOG(FATAL) << "Unknown pooling method.";
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(PoolingLayer);
 #endif
 
-INSTANTIATE_CLASS(PoolingLayer);
+INSTANTIATE_CLASS (PoolingLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index d3c374f1..e4a3e456 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -7,175 +7,173 @@
 #include "caffe/util/ocl_util.hpp"
 #include "caffe/util/ocl_wrapper.hpp"
 
-
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void PowerLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
-  power_ = this->layer_param_.power_param().power();
-  scale_ = this->layer_param_.power_param().scale();
-  shift_ = this->layer_param_.power_param().shift();
-  diff_scale_ = power_  * scale_;
+	const vector<Blob<Dtype>*>& top) {
+	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+	power_ = this->layer_param_.power_param().power();
+	scale_ = this->layer_param_.power_param().scale();
+	shift_ = this->layer_param_.power_param().shift();
+	diff_scale_ = power_ * scale_;
 }
 
-
 // Compute y = (shift + scale * x)^power
-template <typename Dtype>
+template<typename Dtype>
 void PowerLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int count = bottom[0]->count();
-  // Special case where we can ignore the input: scale or power is 0.
-  if (diff_scale_ == Dtype(0)) {
-    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
-    caffe_set(count, value, top_data);
-    return;
-  }
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  caffe_copy(count, bottom_data, top_data);
-  if (scale_ != Dtype(1)) {
-    caffe_scal(count, scale_, top_data);
-  }
-  if (shift_ != Dtype(0)) {
-    caffe_add_scalar(count, shift_, top_data);
-  }
-  if (power_ != Dtype(1)) {
-    caffe_powx(count, top_data, power_, top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const int count = bottom[0]->count();
+	// Special case where we can ignore the input: scale or power is 0.
+	if (diff_scale_ == Dtype(0)) {
+		Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
+		caffe_set(count, value, top_data);
+		return;
+	}
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	caffe_copy(count, bottom_data, top_data);
+	if (scale_ != Dtype(1)) {
+		caffe_scal(count, scale_, top_data);
+	}
+	if (shift_ != Dtype(0)) {
+		caffe_add_scalar(count, shift_, top_data);
+	}
+	if (power_ != Dtype(1)) {
+		caffe_powx(count, top_data, power_, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const int count = bottom[0]->count();
-    const Dtype* top_diff = top[0]->cpu_diff();
-    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
-      caffe_set(count, diff_scale_, bottom_diff);
-    } else {
-      const Dtype* bottom_data = bottom[0]->cpu_data();
-      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
-      //               = diff_scale * y / (shift + scale * x)
-      if (power_ == Dtype(2)) {
-        // Special case for y = (shift + scale * x)^2
-        //     -> dy/dx = 2 * scale * (shift + scale * x)
-        //              = diff_scale * shift + diff_scale * scale * x
-        caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data,
-            Dtype(0), bottom_diff);
-        if (shift_ != Dtype(0)) {
-          caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff);
-        }
-      } else if (shift_ == Dtype(0)) {
-        // Special case for y = (scale * x)^power
-        //     -> dy/dx = scale * power * (scale * x)^(power - 1)
-        //              = scale * power * (scale * x)^power * (scale * x)^(-1)
-        //              = power * y / x
-        const Dtype* top_data = top[0]->cpu_data();
-        caffe_div(count, top_data, bottom_data, bottom_diff);
-        caffe_scal(count, power_, bottom_diff);
-      } else {
-        caffe_copy(count, bottom_data, bottom_diff);
-        if (scale_ != Dtype(1)) {
-          caffe_scal(count, scale_, bottom_diff);
-        }
-        if (shift_ != Dtype(0)) {
-          caffe_add_scalar(count, shift_, bottom_diff);
-        }
-        const Dtype* top_data = top[0]->cpu_data();
-        caffe_div<Dtype>(count, top_data, bottom_diff, bottom_diff);
-        if (diff_scale_ != Dtype(1)) {
-          caffe_scal(count, diff_scale_, bottom_diff);
-        }
-      }
-    }
-    if (diff_scale_ != Dtype(0)) {
-      caffe_mul(count, top_diff, bottom_diff, bottom_diff);
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		const int count = bottom[0]->count();
+		const Dtype* top_diff = top[0]->cpu_diff();
+		if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
+			caffe_set(count, diff_scale_, bottom_diff);
+		} else {
+			const Dtype* bottom_data = bottom[0]->cpu_data();
+			// Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
+			//               = diff_scale * y / (shift + scale * x)
+			if (power_ == Dtype(2)) {
+				// Special case for y = (shift + scale * x)^2
+				//     -> dy/dx = 2 * scale * (shift + scale * x)
+				//              = diff_scale * shift + diff_scale * scale * x
+				caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data,
+					Dtype(0), bottom_diff);
+				if (shift_ != Dtype(0)) {
+					caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff);
+				}
+			} else if (shift_ == Dtype(0)) {
+				// Special case for y = (scale * x)^power
+				//     -> dy/dx = scale * power * (scale * x)^(power - 1)
+				//              = scale * power * (scale * x)^power * (scale * x)^(-1)
+				//              = power * y / x
+				const Dtype* top_data = top[0]->cpu_data();
+				caffe_div(count, top_data, bottom_data, bottom_diff);
+				caffe_scal(count, power_, bottom_diff);
+			} else {
+				caffe_copy(count, bottom_data, bottom_diff);
+				if (scale_ != Dtype(1)) {
+					caffe_scal(count, scale_, bottom_diff);
+				}
+				if (shift_ != Dtype(0)) {
+					caffe_add_scalar(count, shift_, bottom_diff);
+				}
+				const Dtype* top_data = top[0]->cpu_data();
+				caffe_div < Dtype > (count, top_data, bottom_diff, bottom_diff);
+				if (diff_scale_ != Dtype(1)) {
+					caffe_scal(count, diff_scale_, bottom_diff);
+				}
+			}
+		}
+		if (diff_scale_ != Dtype(0)) {
+			caffe_mul(count, top_diff, bottom_diff, bottom_diff);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // Special case where we can ignore the input: scale or power is 0.
-  if (diff_scale_ == Dtype(0)) {
-    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
-    ocl_memset(top_data, value, count);
-    return;
-  }
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  caffe_gpu_copy(count, bottom_data, top_data);
-  if (scale_ != Dtype(1)) {
-    caffe_gpu_scal(count, scale_, top_data);
-  }
-  if (shift_ != Dtype(0)) {
-    caffe_gpu_add_scalar(count, shift_, top_data);
-  }
-  if (power_ != Dtype(1)) {
-    caffe_gpu_powx(count, top_data, power_, top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const int count = bottom[0]->count();
+	// Special case where we can ignore the input: scale or power is 0.
+	if (diff_scale_ == Dtype(0)) {
+		Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
+		ocl_memset(top_data, value, count);
+		return;
+	}
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	caffe_gpu_copy(count, bottom_data, top_data);
+	if (scale_ != Dtype(1)) {
+		caffe_gpu_scal(count, scale_, top_data);
+	}
+	if (shift_ != Dtype(0)) {
+		caffe_gpu_add_scalar(count, shift_, top_data);
+	}
+	if (power_ != Dtype(1)) {
+		caffe_gpu_powx(count, top_data, power_, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
-      ocl_memset(bottom_diff, diff_scale_,count);
-    } else {
-      const Dtype* bottom_data = bottom[0]->gpu_data();
-      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
-      //               = diff_scale * y / (shift + scale * x)
-      if (power_ == Dtype(2)) {
-        // Special case for y = (shift + scale * x)^2
-        //     -> dy/dx = 2 * scale * (shift + scale * x)
-        //              = diff_scale * shift + diff_scale * scale * x
-        caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
-            Dtype(0), bottom_diff);
-        if (shift_ != Dtype(0)) {
-          caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
-        }
-      } else if (shift_ == Dtype(0)) {
-        // Special case for y = (scale * x)^power
-        //     -> dy/dx = scale * power * (scale * x)^(power - 1)
-        //              = scale * power * (scale * x)^power * (scale * x)^(-1)
-        //              = power * y / x
-        const Dtype* top_data = top[0]->gpu_data();
-        caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
-        caffe_gpu_scal(count, power_, bottom_diff);
-      } else {
-        caffe_gpu_copy(count, bottom_data, bottom_diff);
-        if (scale_ != Dtype(1)) {
-          caffe_gpu_scal(count, scale_, bottom_diff);
-        }
-        if (shift_ != Dtype(0)) {
-            caffe_gpu_add_scalar(count, shift_, bottom_diff);
-        }
-        const Dtype* top_data = top[0]->gpu_data();
-        caffe_gpu_div(count, top_data, bottom_diff, bottom_diff);
-        if (diff_scale_ != Dtype(1)) {
-          caffe_gpu_scal(count, diff_scale_, bottom_diff);
-        }
-      }
-    }
-    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		const int count = bottom[0]->count();
+		const Dtype* top_diff = top[0]->gpu_diff();
+		if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
+			ocl_memset(bottom_diff, diff_scale_, count);
+		} else {
+			const Dtype* bottom_data = bottom[0]->gpu_data();
+			// Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
+			//               = diff_scale * y / (shift + scale * x)
+			if (power_ == Dtype(2)) {
+				// Special case for y = (shift + scale * x)^2
+				//     -> dy/dx = 2 * scale * (shift + scale * x)
+				//              = diff_scale * shift + diff_scale * scale * x
+				caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
+					Dtype(0), bottom_diff);
+				if (shift_ != Dtype(0)) {
+					caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
+				}
+			} else if (shift_ == Dtype(0)) {
+				// Special case for y = (scale * x)^power
+				//     -> dy/dx = scale * power * (scale * x)^(power - 1)
+				//              = scale * power * (scale * x)^power * (scale * x)^(-1)
+				//              = power * y / x
+				const Dtype* top_data = top[0]->gpu_data();
+				caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
+				caffe_gpu_scal(count, power_, bottom_diff);
+			} else {
+				caffe_gpu_copy(count, bottom_data, bottom_diff);
+				if (scale_ != Dtype(1)) {
+					caffe_gpu_scal(count, scale_, bottom_diff);
+				}
+				if (shift_ != Dtype(0)) {
+					caffe_gpu_add_scalar(count, shift_, bottom_diff);
+				}
+				const Dtype* top_data = top[0]->gpu_data();
+				caffe_gpu_div(count, top_data, bottom_diff, bottom_diff);
+				if (diff_scale_ != Dtype(1)) {
+					caffe_gpu_scal(count, diff_scale_, bottom_diff);
+				}
+			}
+		}
+		caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(PowerLayer);
 #endif
 
-INSTANTIATE_CLASS(PowerLayer);
-REGISTER_LAYER_CLASS(Power);
+INSTANTIATE_CLASS (PowerLayer);
+REGISTER_LAYER_CLASS (Power);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp
index 426a0cad..5332a178 100644
--- a/src/caffe/layers/prelu_layer.cpp
+++ b/src/caffe/layers/prelu_layer.cpp
@@ -7,203 +7,205 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void PReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  CHECK_GE(bottom[0]->num_axes(), 2)
-      << "Number of axes of bottom blob must be >=2.";
-  PReLUParameter prelu_param = this->layer_param().prelu_param();
-  int channels = bottom[0]->channels();
-  channel_shared_ = prelu_param.channel_shared();
-  if (this->blobs_.size() > 0) {
-    LOG(INFO) << "Skipping parameter initialization";
-  } else {
-    this->blobs_.resize(1);
-    if (channel_shared_) {
-      this->blobs_[0].reset(new Blob<Dtype>(vector<int>(0)));
-    } else {
-      this->blobs_[0].reset(new Blob<Dtype>(vector<int>(1, channels)));
-    }
-    shared_ptr<Filler<Dtype> > filler;
-    if (prelu_param.has_filler()) {
-      filler.reset(GetFiller<Dtype>(prelu_param.filler()));
-    } else {
-      FillerParameter filler_param;
-      filler_param.set_type("constant");
-      filler_param.set_value(0.25);
-      filler.reset(GetFiller<Dtype>(filler_param));
-    }
-    filler->Fill(this->blobs_[0].get());
-  }
-  if (channel_shared_) {
-    CHECK_EQ(this->blobs_[0]->count(), 1)
-        << "Negative slope size is inconsistent with prototxt config";
-  } else {
-    CHECK_EQ(this->blobs_[0]->count(), channels)
-        << "Negative slope size is inconsistent with prototxt config";
-  }
-
-  // Propagate gradients to the parameters (as directed by backward pass).
-  this->param_propagate_down_.resize(this->blobs_.size(), true);
-  multiplier_.Reshape(vector<int>(1, bottom[0]->count(1)));
-  backward_buff_.Reshape(vector<int>(1, bottom[0]->count(1)));
-  caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data());
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_GE(bottom[0]->num_axes(), 2)
+		<< "Number of axes of bottom blob must be >=2.";
+	PReLUParameter prelu_param = this->layer_param().prelu_param();
+	int channels = bottom[0]->channels();
+	channel_shared_ = prelu_param.channel_shared();
+	if (this->blobs_.size() > 0) {
+		LOG(INFO) << "Skipping parameter initialization";
+	} else {
+		this->blobs_.resize(1);
+		if (channel_shared_) {
+			this->blobs_[0].reset(new Blob<Dtype>(vector<int>(0)));
+		} else {
+			this->blobs_[0].reset(new Blob<Dtype>(vector<int>(1, channels)));
+		}
+		shared_ptr < Filler<Dtype> > filler;
+		if (prelu_param.has_filler()) {
+			filler.reset(GetFiller < Dtype > (prelu_param.filler()));
+		} else {
+			FillerParameter filler_param;
+			filler_param.set_type("constant");
+			filler_param.set_value(0.25);
+			filler.reset(GetFiller < Dtype > (filler_param));
+		}
+		filler->Fill(this->blobs_[0].get());
+	}
+	if (channel_shared_) {
+		CHECK_EQ(this->blobs_[0]->count(), 1)
+			<< "Negative slope size is inconsistent with prototxt config";
+	} else {
+		CHECK_EQ(this->blobs_[0]->count(), channels)
+			<< "Negative slope size is inconsistent with prototxt config";
+	}
+
+	// Propagate gradients to the parameters (as directed by backward pass).
+	this->param_propagate_down_.resize(this->blobs_.size(), true);
+	multiplier_.Reshape(vector<int>(1, bottom[0]->count(1)));
+	backward_buff_.Reshape(vector<int>(1, bottom[0]->count(1)));
+	caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  CHECK_GE(bottom[0]->num_axes(), 2)
-      << "Number of axes of bottom blob must be >=2.";
-  top[0]->ReshapeLike(*bottom[0]);
-  if (bottom[0] == top[0]) {
-    // For in-place computation
-    bottom_memory_.ReshapeLike(*bottom[0]);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_GE(bottom[0]->num_axes(), 2)
+		<< "Number of axes of bottom blob must be >=2.";
+	top[0]->ReshapeLike(*bottom[0]);
+	if (bottom[0] == top[0]) {
+		// For in-place computation
+		bottom_memory_.ReshapeLike(*bottom[0]);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-  const Dtype* slope_data = this->blobs_[0]->cpu_data();
-
-  // For in-place computation
-  if (bottom[0] == top[0]) {
-    caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data());
-  }
-
-  // if channel_shared, channel index in the following computation becomes
-  // always zero.
-  const int div_factor = channel_shared_ ? channels : 1;
-  for (int i = 0; i < count; ++i) {
-    int c = (i / dim) % channels / div_factor;
-    top_data[i] = std::max(bottom_data[i], Dtype(0))
-        + slope_data[c] * std::min(bottom_data[i], Dtype(0));
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const int count = bottom[0]->count();
+	const int dim = bottom[0]->count(2);
+	const int channels = bottom[0]->channels();
+	const Dtype* slope_data = this->blobs_[0]->cpu_data();
+
+	// For in-place computation
+	if (bottom[0] == top[0]) {
+		caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data());
+	}
+
+	// if channel_shared, channel index in the following computation becomes
+	// always zero.
+	const int div_factor = channel_shared_ ? channels : 1;
+	for (int i = 0; i < count; ++i) {
+		int c = (i / dim) % channels / div_factor;
+		top_data[i] = std::max(bottom_data[i], Dtype(0))
+			+ slope_data[c] * std::min(bottom_data[i], Dtype(0));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const Dtype* slope_data = this->blobs_[0]->cpu_data();
-  const Dtype* top_diff = top[0]->cpu_diff();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-
-  // For in-place computation
-  if (top[0] == bottom[0]) {
-    bottom_data = bottom_memory_.cpu_data();
-  }
-
-  // if channel_shared, channel index in the following computation becomes
-  // always zero.
-  const int div_factor = channel_shared_ ? channels : 1;
-
-  // Propagte to param
-  // Since to write bottom diff will affect top diff if top and bottom blobs
-  // are identical (in-place computaion), we first compute param backward to
-  // keep top_diff unchanged.
-  if (this->param_propagate_down_[0]) {
-    Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff();
-    for (int i = 0; i < count; ++i) {
-      int c = (i / dim) % channels / div_factor;
-      slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0);
-    }
-  }
-  // Propagate to bottom
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    for (int i = 0; i < count; ++i) {
-      int c = (i / dim) % channels / div_factor;
-      bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
-          + slope_data[c] * (bottom_data[i] <= 0));
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	const Dtype* slope_data = this->blobs_[0]->cpu_data();
+	const Dtype* top_diff = top[0]->cpu_diff();
+	const int count = bottom[0]->count();
+	const int dim = bottom[0]->count(2);
+	const int channels = bottom[0]->channels();
+
+	// For in-place computation
+	if (top[0] == bottom[0]) {
+		bottom_data = bottom_memory_.cpu_data();
+	}
+
+	// if channel_shared, channel index in the following computation becomes
+	// always zero.
+	const int div_factor = channel_shared_ ? channels : 1;
+
+	// Propagte to param
+	// Since to write bottom diff will affect top diff if top and bottom blobs
+	// are identical (in-place computaion), we first compute param backward to
+	// keep top_diff unchanged.
+	if (this->param_propagate_down_[0]) {
+		Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff();
+		for (int i = 0; i < count; ++i) {
+			int c = (i / dim) % channels / div_factor;
+			slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0);
+		}
+	}
+	// Propagate to bottom
+	if (propagate_down[0]) {
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		for (int i = 0; i < count; ++i) {
+			int c = (i / dim) % channels / div_factor;
+			bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
+				+ slope_data[c] * (bottom_data[i] <= 0));
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-  const Dtype* slope_data = this->blobs_[0]->gpu_data();
-  const int div_factor = channel_shared_ ? channels : 1;
-  
-  if (top[0] == bottom[0]) {
-    caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
-  }
-  PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, div_factor);
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const int count = bottom[0]->count();
+	const int dim = bottom[0]->count(2);
+	const int channels = bottom[0]->channels();
+	const Dtype* slope_data = this->blobs_[0]->gpu_data();
+	const int div_factor = channel_shared_ ? channels : 1;
+
+	if (top[0] == bottom[0]) {
+		caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
+	}
+	PReLUForward(count, channels, dim, bottom_data, top_data, slope_data,
+		div_factor);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const int count = bottom[0]->count();
-  const int dim = bottom[0]->count(2);
-  const int channels = bottom[0]->channels();
-
-  if (top[0] == bottom[0]) {
-    bottom_data = bottom_memory_.gpu_data();
-  }
-
-  // Propagate to param
-  // Since to write bottom diff will affect top diff if top and bottom blobs
-  // are identical (in-place computaion), we first compute param backward to
-  // keep top_diff unchanged.
-  if (this->param_propagate_down_[0]) {
-    Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
-    int cdim = channels * dim;
-    Dtype dsum = 0.;
-    for (int n = 0; n < bottom[0]->num(); ++n) {
-      // compute element-wise diff
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      PReLUParamBackward(
-          cdim, top_diff, top[0]->offset(n),
-          bottom_data, bottom[0]->offset(n),
-          backward_buff_.mutable_gpu_diff());
-      if (channel_shared_) {
-        Dtype d;
-        caffe_gpu_dot<Dtype>(channels * dim, backward_buff_.gpu_diff(),
-            multiplier_.gpu_data(), &d);
-        dsum += d;
-      } else {
-        caffe_gpu_gemv<Dtype>(CblasNoTrans, channels, dim, 1.,
-            backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
-            slope_diff);
-      }
-    }
-    if (channel_shared_) {
-      caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
-    }
-  }
-  // Propagate to bottom
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const Dtype* slope_data = this->blobs_[0]->gpu_data();
-    int div_factor = channel_shared_ ? channels : 1;
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data,
-        div_factor);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	const Dtype* top_diff = top[0]->gpu_diff();
+	const int count = bottom[0]->count();
+	const int dim = bottom[0]->count(2);
+	const int channels = bottom[0]->channels();
+
+	if (top[0] == bottom[0]) {
+		bottom_data = bottom_memory_.gpu_data();
+	}
+
+	// Propagate to param
+	// Since to write bottom diff will affect top diff if top and bottom blobs
+	// are identical (in-place computaion), we first compute param backward to
+	// keep top_diff unchanged.
+	if (this->param_propagate_down_[0]) {
+		Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
+		int cdim = channels * dim;
+		Dtype dsum = 0.;
+		for (int n = 0; n < bottom[0]->num(); ++n) {
+			// compute element-wise diff
+			// NOLINT_NEXT_LINE(whitespace/operators)
+			PReLUParamBackward(
+				cdim, top_diff, top[0]->offset(n),
+				bottom_data, bottom[0]->offset(n),
+				backward_buff_.mutable_gpu_diff());
+			if (channel_shared_) {
+				Dtype d;
+				caffe_gpu_dot < Dtype > (channels * dim, backward_buff_.gpu_diff(),
+					multiplier_.gpu_data(), &d);
+				dsum += d;
+			} else {
+				caffe_gpu_gemv < Dtype > (CblasNoTrans, channels, dim, 1.,
+					backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
+					slope_diff);
+			}
+		}
+		if (channel_shared_) {
+			caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
+		}
+	}
+	// Propagate to bottom
+	if (propagate_down[0]) {
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		const Dtype* slope_data = this->blobs_[0]->gpu_data();
+		int div_factor = channel_shared_ ? channels : 1;
+		// NOLINT_NEXT_LINE(whitespace/operators)
+		PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff,
+			slope_data,
+			div_factor);
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(PReLULayer);
 #endif
 
-INSTANTIATE_CLASS(PReLULayer);
-REGISTER_LAYER_CLASS(PReLU);
+INSTANTIATE_CLASS (PReLULayer);
+REGISTER_LAYER_CLASS (PReLU);
 }  // namespace caffe
diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp
index 4003ddd1..32ea4bc0 100644
--- a/src/caffe/layers/reduction_layer.cpp
+++ b/src/caffe/layers/reduction_layer.cpp
@@ -8,206 +8,210 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ReductionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  op_ = this->layer_param_.reduction_param().operation();
+	const vector<Blob<Dtype>*>& top) {
+	op_ = this->layer_param_.reduction_param().operation();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ReductionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  axis_ = bottom[0]->CanonicalAxisIndex(
-      this->layer_param_.reduction_param().axis());
-  // In the output, we'll keep all axes up to the reduction axis, but
-  // throw away any after that.
-  // Note: currently reducing along non-tail axes is not supported; otherwise,
-  // we'd need to also copy any axes following an "end_axis".
-  vector<int> top_shape(bottom[0]->shape().begin(),
-                        bottom[0]->shape().begin() + axis_);
-  top[0]->Reshape(top_shape);
-  num_ = bottom[0]->count(0, axis_);
-  dim_ = bottom[0]->count(axis_);
-  CHECK_EQ(num_, top[0]->count());
-  if (op_ == ReductionParameter_ReductionOp_SUM ||
-      op_ == ReductionParameter_ReductionOp_MEAN) {
-    vector<int> sum_mult_shape(1, dim_);
-    sum_multiplier_.Reshape(sum_mult_shape);
-    caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data());
-  }
-  coeff_ = this->layer_param().reduction_param().coeff();
-  if (op_ == ReductionParameter_ReductionOp_MEAN) {
-    coeff_ /= dim_;
-  }
+	const vector<Blob<Dtype>*>& top) {
+	axis_ = bottom[0]->CanonicalAxisIndex(
+		this->layer_param_.reduction_param().axis());
+	// In the output, we'll keep all axes up to the reduction axis, but
+	// throw away any after that.
+	// Note: currently reducing along non-tail axes is not supported; otherwise,
+	// we'd need to also copy any axes following an "end_axis".
+	vector<int> top_shape(bottom[0]->shape().begin(),
+		bottom[0]->shape().begin() + axis_);
+	top[0]->Reshape(top_shape);
+	num_ = bottom[0]->count(0, axis_);
+	dim_ = bottom[0]->count(axis_);
+	CHECK_EQ(num_, top[0]->count());
+	if (op_ == ReductionParameter_ReductionOp_SUM ||
+		op_ == ReductionParameter_ReductionOp_MEAN) {
+		vector<int> sum_mult_shape(1, dim_);
+		sum_multiplier_.Reshape(sum_mult_shape);
+		caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data());
+	}
+	coeff_ = this->layer_param().reduction_param().coeff();
+	if (op_ == ReductionParameter_ReductionOp_MEAN) {
+		coeff_ /= dim_;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ReductionLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const Dtype* mult_data = NULL;
-  if (sum_multiplier_.count() > 0) {
-    mult_data = sum_multiplier_.cpu_data();
-  }
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  for (int i = 0; i < num_; ++i) {
-    switch (op_) {
-    case ReductionParameter_ReductionOp_SUM:
-    case ReductionParameter_ReductionOp_MEAN:
-      *top_data = caffe_cpu_dot(dim_, mult_data, bottom_data);
-      break;
-    case ReductionParameter_ReductionOp_ASUM:
-      *top_data = caffe_cpu_asum(dim_, bottom_data);
-      break;
-    case ReductionParameter_ReductionOp_SUMSQ:
-      *top_data = caffe_cpu_dot(dim_, bottom_data, bottom_data);
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduction op: "
-          << ReductionParameter_ReductionOp_Name(op_);
-    }
-    bottom_data += dim_;
-    ++top_data;
-  }
-  if (coeff_ != Dtype(1)) {
-    // Reset the top_data pointer.
-    top_data = top[0]->mutable_cpu_data();
-    caffe_scal(num_, coeff_, top_data);
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	const Dtype* mult_data = NULL;
+	if (sum_multiplier_.count() > 0) {
+		mult_data = sum_multiplier_.cpu_data();
+	}
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	for (int i = 0; i < num_; ++i) {
+		switch (op_) {
+			case ReductionParameter_ReductionOp_SUM:
+				case ReductionParameter_ReductionOp_MEAN:
+				*top_data = caffe_cpu_dot(dim_, mult_data, bottom_data);
+				break;
+			case ReductionParameter_ReductionOp_ASUM:
+				*top_data = caffe_cpu_asum(dim_, bottom_data);
+				break;
+			case ReductionParameter_ReductionOp_SUMSQ:
+				*top_data = caffe_cpu_dot(dim_, bottom_data, bottom_data);
+				break;
+			default:
+				LOG(FATAL) << "Unknown reduction op: "
+					<< ReductionParameter_ReductionOp_Name(op_);
+		}
+		bottom_data += dim_;
+		++top_data;
+	}
+	if (coeff_ != Dtype(1)) {
+		// Reset the top_data pointer.
+		top_data = top[0]->mutable_cpu_data();
+		caffe_scal(num_, coeff_, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  // Get bottom_data, if needed.
-  const Dtype* bottom_data = NULL;
-  switch (op_) {
-  // Operations that don't need bottom_data
-  case ReductionParameter_ReductionOp_SUM:
-  case ReductionParameter_ReductionOp_MEAN:
-    break;
-  // Operations that need bottom_data
-  case ReductionParameter_ReductionOp_ASUM:
-  case ReductionParameter_ReductionOp_SUMSQ:
-    bottom_data = bottom[0]->cpu_data();
-    break;
-  default:
-    LOG(FATAL) << "Unknown reduction op: "
-        << ReductionParameter_ReductionOp_Name(op_);
-  }
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  for (int i = 0; i < num_; ++i) {
-    const Dtype bottom_coeff = (*top_diff) * coeff_;
-    switch (op_) {
-    case ReductionParameter_ReductionOp_SUM:
-    case ReductionParameter_ReductionOp_MEAN:
-      caffe_set(dim_, bottom_coeff, bottom_diff);
-      break;
-    case ReductionParameter_ReductionOp_ASUM:
-      caffe_cpu_sign(dim_, bottom_data, bottom_diff);
-      caffe_scal(dim_, bottom_coeff, bottom_diff);
-      break;
-    case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_cpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduction op: "
-          << ReductionParameter_ReductionOp_Name(op_);
-    }
-    bottom_data += dim_;
-    bottom_diff += dim_;
-    ++top_diff;
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	// Get bottom_data, if needed.
+	const Dtype* bottom_data = NULL;
+	switch (op_) {
+		// Operations that don't need bottom_data
+		case ReductionParameter_ReductionOp_SUM:
+			case ReductionParameter_ReductionOp_MEAN:
+			break;
+			// Operations that need bottom_data
+		case ReductionParameter_ReductionOp_ASUM:
+			case ReductionParameter_ReductionOp_SUMSQ:
+			bottom_data = bottom[0]->cpu_data();
+			break;
+		default:
+			LOG(FATAL) << "Unknown reduction op: "
+				<< ReductionParameter_ReductionOp_Name(op_);
+	}
+	const Dtype* top_diff = top[0]->cpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	for (int i = 0; i < num_; ++i) {
+		const Dtype bottom_coeff = (*top_diff) * coeff_;
+		switch (op_) {
+			case ReductionParameter_ReductionOp_SUM:
+				case ReductionParameter_ReductionOp_MEAN:
+				caffe_set(dim_, bottom_coeff, bottom_diff);
+				break;
+			case ReductionParameter_ReductionOp_ASUM:
+				caffe_cpu_sign(dim_, bottom_data, bottom_diff);
+				caffe_scal(dim_, bottom_coeff, bottom_diff);
+				break;
+			case ReductionParameter_ReductionOp_SUMSQ:
+				caffe_cpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
+				break;
+			default:
+				LOG(FATAL) << "Unknown reduction op: "
+					<< ReductionParameter_ReductionOp_Name(op_);
+		}
+		bottom_data += dim_;
+		bottom_diff += dim_;
+		++top_diff;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  const Dtype* mult_data = NULL;
-  if (sum_multiplier_.count() > 0) {
-    mult_data = sum_multiplier_.gpu_data();
-  }
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  for (int i = 0; i < num_; ++i) {
-    switch (op_) {
-    case ReductionParameter_ReductionOp_SUM:
-    case ReductionParameter_ReductionOp_MEAN:
-      caffe_gpu_dot(dim_, mult_data, bottom_data, top_data);
-      break;
-    case ReductionParameter_ReductionOp_ASUM:
-      caffe_gpu_asum(dim_, bottom_data, top_data);
-      break;
-    case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data);
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduction op: "
-          << ReductionParameter_ReductionOp_Name(op_);
-    }
-    bottom_data += dim_;
-    ++top_data;
-  }
-  if (coeff_ != Dtype(1)) {
-    // Reset the top_data pointer.
-    top_data = top[0]->mutable_gpu_data();
-    caffe_gpu_scal(num_, coeff_, top_data);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	const Dtype* mult_data = NULL;
+	if (sum_multiplier_.count() > 0) {
+		mult_data = sum_multiplier_.gpu_data();
+	}
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	for (int i = 0; i < num_; ++i) {
+		switch (op_) {
+			case ReductionParameter_ReductionOp_SUM:
+				case ReductionParameter_ReductionOp_MEAN:
+				caffe_gpu_dot(dim_, mult_data, bottom_data, top_data);
+				break;
+			case ReductionParameter_ReductionOp_ASUM:
+				caffe_gpu_asum(dim_, bottom_data, top_data);
+				break;
+			case ReductionParameter_ReductionOp_SUMSQ:
+				caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data);
+				break;
+			default:
+				LOG(FATAL) << "Unknown reduction op: "
+					<< ReductionParameter_ReductionOp_Name(op_);
+		}
+		bottom_data += dim_;
+		++top_data;
+	}
+	if (coeff_ != Dtype(1)) {
+		// Reset the top_data pointer.
+		top_data = top[0]->mutable_gpu_data();
+		caffe_gpu_scal(num_, coeff_, top_data);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  if (!propagate_down[0]) { return; }
-  // Get bottom_data, if needed.
-  const Dtype* bottom_data = NULL;
-  switch (op_) {
-  // Operations that don't need bottom_data
-  case ReductionParameter_ReductionOp_SUM:
-  case ReductionParameter_ReductionOp_MEAN:
-    break;
-  // Operations that need bottom_data
-  case ReductionParameter_ReductionOp_ASUM:
-  case ReductionParameter_ReductionOp_SUMSQ:
-    bottom_data = bottom[0]->gpu_data();
-    break;
-  default:
-    LOG(FATAL) << "Unknown reduction op: "
-        << ReductionParameter_ReductionOp_Name(op_);
-  }
-  const Dtype* top_diff = top[0]->cpu_diff();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  for (int i = 0; i < num_; ++i) {
-    const Dtype bottom_coeff = (*top_diff) * coeff_;
-    switch (op_) {
-    case ReductionParameter_ReductionOp_SUM:
-    case ReductionParameter_ReductionOp_MEAN:
-      caffe_gpu_set(dim_, bottom_coeff, bottom_diff);
-      break;
-    case ReductionParameter_ReductionOp_ASUM:
-      caffe_gpu_sign(dim_, bottom_data, bottom_diff);
-      caffe_gpu_scal(dim_, bottom_coeff, bottom_diff);
-      break;
-    case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
-      break;
-    default:
-      LOG(FATAL) << "Unknown reduction op: "
-          << ReductionParameter_ReductionOp_Name(op_);
-    }
-    bottom_data += dim_;
-    bottom_diff += dim_;
-    ++top_diff;
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	// Get bottom_data, if needed.
+	const Dtype* bottom_data = NULL;
+	switch (op_) {
+		// Operations that don't need bottom_data
+		case ReductionParameter_ReductionOp_SUM:
+			case ReductionParameter_ReductionOp_MEAN:
+			break;
+			// Operations that need bottom_data
+		case ReductionParameter_ReductionOp_ASUM:
+			case ReductionParameter_ReductionOp_SUMSQ:
+			bottom_data = bottom[0]->gpu_data();
+			break;
+		default:
+			LOG(FATAL) << "Unknown reduction op: "
+				<< ReductionParameter_ReductionOp_Name(op_);
+	}
+	const Dtype* top_diff = top[0]->cpu_diff();
+	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+	for (int i = 0; i < num_; ++i) {
+		const Dtype bottom_coeff = (*top_diff) * coeff_;
+		switch (op_) {
+			case ReductionParameter_ReductionOp_SUM:
+				case ReductionParameter_ReductionOp_MEAN:
+				caffe_gpu_set(dim_, bottom_coeff, bottom_diff);
+				break;
+			case ReductionParameter_ReductionOp_ASUM:
+				caffe_gpu_sign(dim_, bottom_data, bottom_diff);
+				caffe_gpu_scal(dim_, bottom_coeff, bottom_diff);
+				break;
+			case ReductionParameter_ReductionOp_SUMSQ:
+				caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
+				break;
+			default:
+				LOG(FATAL) << "Unknown reduction op: "
+					<< ReductionParameter_ReductionOp_Name(op_);
+		}
+		bottom_data += dim_;
+		bottom_diff += dim_;
+		++top_diff;
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(ReductionLayer);
 #endif
 
-INSTANTIATE_CLASS(ReductionLayer);
-REGISTER_LAYER_CLASS(Reduction);
+INSTANTIATE_CLASS (ReductionLayer);
+REGISTER_LAYER_CLASS (Reduction);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index c29d5baa..7f3b2729 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -5,67 +5,64 @@
 #include "caffe/vision_layers.hpp"
 
 namespace caffe {
-template <typename Dtype>
+template<typename Dtype>
 void ReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int count = bottom[0]->count();
-  Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-  for (int i = 0; i < count; ++i) {
-    top_data[i] = std::max(bottom_data[i], Dtype(0))
-        + negative_slope * std::min(bottom_data[i], Dtype(0));
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const int count = bottom[0]->count();
+	Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+	for (int i = 0; i < count; ++i) {
+		top_data[i] = std::max(bottom_data[i], Dtype(0))
+			+ negative_slope * std::min(bottom_data[i], Dtype(0));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->cpu_data();
-    const Dtype* top_diff = top[0]->cpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const int count = bottom[0]->count();
-    Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-    for (int i = 0; i < count; ++i) {
-      bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
-          + negative_slope * (bottom_data[i] <= 0));
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* bottom_data = bottom[0]->cpu_data();
+		const Dtype* top_diff = top[0]->cpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		const int count = bottom[0]->count();
+		Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+		for (int i = 0; i < count; ++i) {
+			bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
+				+ negative_slope * (bottom_data[i] <= 0));
+		}
+	}
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-  ReLUForward(count,bottom_data,top_data,negative_slope);
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const int count = bottom[0]->count();
+	Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+	ReLUForward(count, bottom_data, top_data, negative_slope);
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-    ReLUBackward(count,top_diff,bottom_data,bottom_diff,negative_slope);
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* bottom_data = bottom[0]->gpu_data();
+		const Dtype* top_diff = top[0]->gpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		const int count = bottom[0]->count();
+		Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+		ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope);
+	}
 }
 
-
 #ifdef CPU_ONLY
 STUB_GPU(ReLULayer);
 #endif
 
-INSTANTIATE_CLASS(ReLULayer);
+INSTANTIATE_CLASS (ReLULayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp
index ffe970f2..8dbbbcb0 100644
--- a/src/caffe/layers/reshape_layer.cpp
+++ b/src/caffe/layers/reshape_layer.cpp
@@ -5,91 +5,92 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ReshapeLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  inferred_axis_ = -1;
-  copy_axes_.clear();
-  const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
-  const int top_num_axes = top_blob_shape.dim_size();
-  constant_count_ = 1;
-  for (int i = 0; i < top_num_axes; ++i) {
-    const int top_dim = top_blob_shape.dim(i);
-    if (top_dim == 0) {
-      copy_axes_.push_back(i);
-    } else if (top_dim == -1) {
-      CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple "
-          << "-1 dims; at most a single (1) value of -1 may be specified";
-      inferred_axis_ = i;
-    } else {
-      constant_count_ *= top_dim;
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	inferred_axis_ = -1;
+	copy_axes_.clear();
+	const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
+	const int top_num_axes = top_blob_shape.dim_size();
+	constant_count_ = 1;
+	for (int i = 0; i < top_num_axes; ++i) {
+		const int top_dim = top_blob_shape.dim(i);
+		if (top_dim == 0) {
+			copy_axes_.push_back(i);
+		} else if (top_dim == -1) {
+			CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple "
+				<< "-1 dims; at most a single (1) value of -1 may be specified";
+			inferred_axis_ = i;
+		} else {
+			constant_count_ *= top_dim;
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ReshapeLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const int input_start_axis = this->layer_param_.reshape_param().axis();
-  const int start_axis = (input_start_axis >= 0) ? input_start_axis :
-      bottom[0]->num_axes() + input_start_axis + 1;
-  CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range";
-  CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis
-      << " out of range for " << bottom[0]->num_axes() << "-D input blob";
-  const int num_axes = this->layer_param_.reshape_param().num_axes();
-  CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all";
-  const int end_axis =
-      (num_axes == -1) ? bottom[0]->num_axes() : (start_axis + num_axes);
-  CHECK_LE(end_axis, bottom[0]->num_axes())
-      << "end_axis = axis + num_axes is out of range";
-  const int num_axes_replaced = end_axis - start_axis;
-  const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced;
-  const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
-  const int num_new_axes = top_blob_shape.dim_size();
-  vector<int> top_shape(num_axes_retained + num_new_axes);
-  int top_shape_index = 0;
-  for (int i = 0; i < start_axis; ++i) {
-    top_shape[top_shape_index++] = bottom[0]->shape(i);
-  }
-  for (int i = 0; i < num_new_axes; ++i) {
-    top_shape[top_shape_index++] = top_blob_shape.dim(i);
-  }
-  for (int i = end_axis; i < bottom[0]->num_axes(); ++i) {
-    top_shape[top_shape_index++] = bottom[0]->shape(i);
-  }
-  CHECK_EQ(top_shape_index, top_shape.size());
-  for (int i = 0; i < copy_axes_.size(); ++i) {
-    const int copy_axis_index = copy_axes_[i];
-    CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index)
-        << "new shape contains a 0, but there was no corresponding bottom axis "
-        << "to copy";
-    top_shape[start_axis + copy_axis_index] =
-        bottom[0]->shape(start_axis + copy_axis_index);
-  }
-  if (inferred_axis_ >= 0) {
-    // A -1 dim was specified; infer the correct dimension by computing the
-    // product of the other dimensions.
-    int explicit_count = constant_count_;
-    explicit_count *= bottom[0]->count(0, start_axis);
-    explicit_count *= bottom[0]->count(end_axis);
-    for (int i = 0; i < copy_axes_.size(); ++i) {
-      const int copy_axis_index = copy_axes_[i];
-      explicit_count *= top_shape[start_axis + copy_axis_index];
-    }
-    CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count ("
-        << bottom[0]->count() << ") must be divisible by the product of "
-        << "the specified dimensions (" << explicit_count << ")";
-    const int inferred_dim = bottom[0]->count() / explicit_count;
-    top_shape[start_axis + inferred_axis_] = inferred_dim;
-  }
-  top[0]->Reshape(top_shape);
-  CHECK_EQ(top[0]->count(), bottom[0]->count())
-      << "output count must match input count";
-  top[0]->ShareData(*bottom[0]);
-  top[0]->ShareDiff(*bottom[0]);
+	const vector<Blob<Dtype>*>& top) {
+	const int input_start_axis = this->layer_param_.reshape_param().axis();
+	const int start_axis =
+		(input_start_axis >= 0) ? input_start_axis :
+															bottom[0]->num_axes() + input_start_axis + 1;
+	CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range";
+	CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis
+		<< " out of range for " << bottom[0]->num_axes() << "-D input blob";
+	const int num_axes = this->layer_param_.reshape_param().num_axes();
+	CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all";
+	const int end_axis =
+		(num_axes == -1) ? bottom[0]->num_axes() : (start_axis + num_axes);
+	CHECK_LE(end_axis, bottom[0]->num_axes())
+		<< "end_axis = axis + num_axes is out of range";
+	const int num_axes_replaced = end_axis - start_axis;
+	const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced;
+	const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
+	const int num_new_axes = top_blob_shape.dim_size();
+	vector<int> top_shape(num_axes_retained + num_new_axes);
+	int top_shape_index = 0;
+	for (int i = 0; i < start_axis; ++i) {
+		top_shape[top_shape_index++] = bottom[0]->shape(i);
+	}
+	for (int i = 0; i < num_new_axes; ++i) {
+		top_shape[top_shape_index++] = top_blob_shape.dim(i);
+	}
+	for (int i = end_axis; i < bottom[0]->num_axes(); ++i) {
+		top_shape[top_shape_index++] = bottom[0]->shape(i);
+	}
+	CHECK_EQ(top_shape_index, top_shape.size());
+	for (int i = 0; i < copy_axes_.size(); ++i) {
+		const int copy_axis_index = copy_axes_[i];
+		CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index)
+			<< "new shape contains a 0, but there was no corresponding bottom axis "
+			<< "to copy";
+		top_shape[start_axis + copy_axis_index] =
+			bottom[0]->shape(start_axis + copy_axis_index);
+	}
+	if (inferred_axis_ >= 0) {
+		// A -1 dim was specified; infer the correct dimension by computing the
+		// product of the other dimensions.
+		int explicit_count = constant_count_;
+		explicit_count *= bottom[0]->count(0, start_axis);
+		explicit_count *= bottom[0]->count(end_axis);
+		for (int i = 0; i < copy_axes_.size(); ++i) {
+			const int copy_axis_index = copy_axes_[i];
+			explicit_count *= top_shape[start_axis + copy_axis_index];
+		}
+		CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count ("
+			<< bottom[0]->count() << ") must be divisible by the product of "
+			<< "the specified dimensions (" << explicit_count << ")";
+		const int inferred_dim = bottom[0]->count() / explicit_count;
+		top_shape[start_axis + inferred_axis_] = inferred_dim;
+	}
+	top[0]->Reshape(top_shape);
+	CHECK_EQ(top[0]->count(), bottom[0]->count())
+		<< "output count must match input count";
+	top[0]->ShareData(*bottom[0]);
+	top[0]->ShareDiff(*bottom[0]);
 }
 
-INSTANTIATE_CLASS(ReshapeLayer);
-REGISTER_LAYER_CLASS(Reshape);
+INSTANTIATE_CLASS (ReshapeLayer);
+REGISTER_LAYER_CLASS (Reshape);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
index 1c22fe19..a5be48e7 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -8,95 +8,96 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::LayerSetUp(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::LayerSetUp(bottom, top);
-  sigmoid_bottom_vec_.clear();
-  sigmoid_bottom_vec_.push_back(bottom[0]);
-  sigmoid_top_vec_.clear();
-  sigmoid_top_vec_.push_back(sigmoid_output_.get());
-  sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::LayerSetUp(bottom, top);
+	sigmoid_bottom_vec_.clear();
+	sigmoid_bottom_vec_.push_back(bottom[0]);
+	sigmoid_top_vec_.clear();
+	sigmoid_top_vec_.push_back(sigmoid_output_.get());
+	sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Reshape(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::Reshape(bottom, top);
-  CHECK_EQ(bottom[0]->count(), bottom[1]->count()) <<
-      "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count.";
-  sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::Reshape(bottom, top);
+	CHECK_EQ(bottom[0]->count(), bottom[1]->count()) <<
+		"SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count.";
+	sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // The forward pass computes the sigmoid outputs.
-  sigmoid_bottom_vec_[0] = bottom[0];
-  sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
-  // Compute the loss (negative log likelihood)
-  const int count = bottom[0]->count();
-  const int num = bottom[0]->num();
-  // Stable version of loss computation from input data
-  const Dtype* input_data = bottom[0]->cpu_data();
-  const Dtype* target = bottom[1]->cpu_data();
-  Dtype loss = 0;
-  for (int i = 0; i < count; ++i) {
-    loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
-        log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
-  }
-  top[0]->mutable_cpu_data()[0] = loss / num;
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	// The forward pass computes the sigmoid outputs.
+	sigmoid_bottom_vec_[0] = bottom[0];
+	sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
+	// Compute the loss (negative log likelihood)
+	const int count = bottom[0]->count();
+	const int num = bottom[0]->num();
+	// Stable version of loss computation from input data
+	const Dtype* input_data = bottom[0]->cpu_data();
+	const Dtype* target = bottom[1]->cpu_data();
+	Dtype loss = 0;
+	for (int i = 0; i < count; ++i) {
+		loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
+			log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
+	}
+	top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    // First, compute the diff
-    const int count = bottom[0]->count();
-    const int num = bottom[0]->num();
-    const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();
-    const Dtype* target = bottom[1]->cpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    caffe_sub(count, sigmoid_output_data, target, bottom_diff);
-    // Scale down gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    caffe_scal(count, loss_weight / num, bottom_diff);
-  }
+	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[1]) {
+		LOG(FATAL) << this->type()
+			<< " Layer cannot backpropagate to label inputs.";
+	}
+	if (propagate_down[0]) {
+		// First, compute the diff
+		const int count = bottom[0]->count();
+		const int num = bottom[0]->num();
+		const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();
+		const Dtype* target = bottom[1]->cpu_data();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		caffe_sub(count, sigmoid_output_data, target, bottom_diff);
+		// Scale down gradient
+		const Dtype loss_weight = top[0]->cpu_diff()[0];
+		caffe_scal(count, loss_weight / num, bottom_diff);
+	}
 }
 
-template <typename Dtype>
-void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    // First, compute the diff
-    const int count = bottom[0]->count();
-    const int num = bottom[0]->num();
-    const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
-    const Dtype* target = bottom[1]->gpu_data();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_gpu_copy(count, sigmoid_output_data, bottom_diff);
-    caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
-    // Scale down gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    caffe_gpu_scal(count, loss_weight / num, bottom_diff);
-  }
+template<typename Dtype>
+void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
+	const vector<Blob<Dtype>*>& top,
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[1]) {
+		LOG(FATAL) << this->type()
+			<< " Layer cannot backpropagate to label inputs.";
+	}
+	if (propagate_down[0]) {
+		// First, compute the diff
+		const int count = bottom[0]->count();
+		const int num = bottom[0]->num();
+		const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
+		const Dtype* target = bottom[1]->gpu_data();
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		caffe_gpu_copy(count, sigmoid_output_data, bottom_diff);
+		caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
+		// Scale down gradient
+		const Dtype loss_weight = top[0]->cpu_diff()[0];
+		caffe_gpu_scal(count, loss_weight / num, bottom_diff);
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward);
 #endif
 
-INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer);
-REGISTER_LAYER_CLASS(SigmoidCrossEntropyLoss);
+INSTANTIATE_CLASS (SigmoidCrossEntropyLossLayer);
+REGISTER_LAYER_CLASS (SigmoidCrossEntropyLoss);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index fa13a4c1..4095ccdb 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -8,66 +8,65 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 inline Dtype sigmoid(Dtype x) {
-  return 1. / (1. + exp(-x));
+	return 1. / (1. + exp(-x));
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int count = bottom[0]->count();
-  for (int i = 0; i < count; ++i) {
-    top_data[i] = sigmoid(bottom_data[i]);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const int count = bottom[0]->count();
+	for (int i = 0; i < count; ++i) {
+		top_data[i] = sigmoid(bottom_data[i]);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SigmoidLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->cpu_data();
-    const Dtype* top_diff = top[0]->cpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const int count = bottom[0]->count();
-    for (int i = 0; i < count; ++i) {
-      const Dtype sigmoid_x = top_data[i];
-      bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x);
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* top_data = top[0]->cpu_data();
+		const Dtype* top_diff = top[0]->cpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		const int count = bottom[0]->count();
+		for (int i = 0; i < count; ++i) {
+			const Dtype sigmoid_x = top_data[i];
+			bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  SigmoidForward(count, bottom_data, top_data);
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const int count = bottom[0]->count();
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	SigmoidForward(count, bottom_data, top_data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    SigmoidBackward(count, top_diff, top_data, bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* top_data = top[0]->gpu_data();
+		const Dtype* top_diff = top[0]->gpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		const int count = bottom[0]->count();
+		// NOLINT_NEXT_LINE(whitespace/operators)
+		SigmoidBackward(count, top_diff, top_data, bottom_diff);
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(SigmoidLayer);
 #endif
 
-INSTANTIATE_CLASS(SigmoidLayer);
-
+INSTANTIATE_CLASS (SigmoidLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp
index e36a5cad..05929a70 100644
--- a/src/caffe/layers/silence_layer.cpp
+++ b/src/caffe/layers/silence_layer.cpp
@@ -6,39 +6,39 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void SilenceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (propagate_down[i]) {
-      caffe_set(bottom[i]->count(), Dtype(0),
-                bottom[i]->mutable_cpu_data());
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	for (int i = 0; i < bottom.size(); ++i) {
+		if (propagate_down[i]) {
+			caffe_set(bottom[i]->count(), Dtype(0),
+				bottom[i]->mutable_cpu_data());
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  // Do nothing.
+	const vector<Blob<Dtype>*>& top) {
+	// Do nothing.
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  for (int i = 0; i < bottom.size(); ++i) {
-    if (propagate_down[i]) {
-      caffe_gpu_set(bottom[i]->count(), Dtype(0),
-                    bottom[i]->mutable_gpu_data());
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	for (int i = 0; i < bottom.size(); ++i) {
+		if (propagate_down[i]) {
+			caffe_gpu_set(bottom[i]->count(), Dtype(0),
+				bottom[i]->mutable_gpu_data());
+		}
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(SilenceLayer);
 #endif
 
-INSTANTIATE_CLASS(SilenceLayer);
-REGISTER_LAYER_CLASS(Silence);
+INSTANTIATE_CLASS (SilenceLayer);
+REGISTER_LAYER_CLASS (Silence);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index 76021faa..7b327527 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -7,124 +7,126 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void SliceLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const SliceParameter& slice_param = this->layer_param_.slice_param();
-  CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim()))
-      << "Either axis or slice_dim should be specified; not both.";
-  slice_point_.clear();
-  std::copy(slice_param.slice_point().begin(),
-      slice_param.slice_point().end(),
-      std::back_inserter(slice_point_));
+	const vector<Blob<Dtype>*>& top) {
+	const SliceParameter& slice_param = this->layer_param_.slice_param();
+	CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim()))
+		<< "Either axis or slice_dim should be specified; not both.";
+	slice_point_.clear();
+	std::copy(slice_param.slice_point().begin(),
+		slice_param.slice_point().end(),
+		std::back_inserter(slice_point_));
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SliceLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  const int num_axes = bottom[0]->num_axes();
-  const SliceParameter& slice_param = this->layer_param_.slice_param();
-  if (slice_param.has_slice_dim()) {
-    slice_axis_ = static_cast<int>(slice_param.slice_dim());
-    // Don't allow negative indexing for slice_dim, a uint32 -- almost
-    // certainly unintended.
-    CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 "
-        << "produced negative result; slice_dim must satisfy "
-        << "0 <= slice_dim < " << kMaxBlobAxes;
-    CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range.";
-  } else {
-    slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis());
-  }
-  vector<int> top_shape = bottom[0]->shape();
-  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-  num_slices_ = bottom[0]->count(0, slice_axis_);
-  slice_size_ = bottom[0]->count(slice_axis_ + 1);
-  int count = 0;
-  if (slice_point_.size() != 0) {
-    CHECK_EQ(slice_point_.size(), top.size() - 1);
-    CHECK_LE(top.size(), bottom_slice_axis);
-    int prev = 0;
-    vector<int> slices;
-    for (int i = 0; i < slice_point_.size(); ++i) {
-      CHECK_GT(slice_point_[i], prev);
-      slices.push_back(slice_point_[i] - prev);
-      prev = slice_point_[i];
-    }
-    slices.push_back(bottom_slice_axis - prev);
-    for (int i = 0; i < top.size(); ++i) {
-      top_shape[slice_axis_] = slices[i];
-      top[i]->Reshape(top_shape);
-      count += top[i]->count();
-    }
-  } else {
-    CHECK_EQ(bottom_slice_axis % top.size(), 0)
-        << "Number of top blobs (" << top.size() << ") should evenly "
-        << "divide input slice axis (" << bottom_slice_axis << ")";
-    top_shape[slice_axis_] = bottom_slice_axis / top.size();
-    for (int i = 0; i < top.size(); ++i) {
-      top[i]->Reshape(top_shape);
-      count += top[i]->count();
-    }
-  }
-  CHECK_EQ(count, bottom[0]->count());
+	const vector<Blob<Dtype>*>& top) {
+	const int num_axes = bottom[0]->num_axes();
+	const SliceParameter& slice_param = this->layer_param_.slice_param();
+	if (slice_param.has_slice_dim()) {
+		slice_axis_ = static_cast<int>(slice_param.slice_dim());
+		// Don't allow negative indexing for slice_dim, a uint32 -- almost
+		// certainly unintended.
+		CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 "
+			<< "produced negative result; slice_dim must satisfy "
+			<< "0 <= slice_dim < " << kMaxBlobAxes;
+		CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range.";
+	} else {
+		slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis());
+	}
+	vector<int> top_shape = bottom[0]->shape();
+	const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+	num_slices_ = bottom[0]->count(0, slice_axis_);
+	slice_size_ = bottom[0]->count(slice_axis_ + 1);
+	int count = 0;
+	if (slice_point_.size() != 0) {
+		CHECK_EQ(slice_point_.size(), top.size() - 1);
+		CHECK_LE(top.size(), bottom_slice_axis);
+		int prev = 0;
+		vector<int> slices;
+		for (int i = 0; i < slice_point_.size(); ++i) {
+			CHECK_GT(slice_point_[i], prev);
+			slices.push_back(slice_point_[i] - prev);
+			prev = slice_point_[i];
+		}
+		slices.push_back(bottom_slice_axis - prev);
+		for (int i = 0; i < top.size(); ++i) {
+			top_shape[slice_axis_] = slices[i];
+			top[i]->Reshape(top_shape);
+			count += top[i]->count();
+		}
+	} else {
+		CHECK_EQ(bottom_slice_axis % top.size(), 0)
+			<< "Number of top blobs (" << top.size() << ") should evenly "
+			<< "divide input slice axis (" << bottom_slice_axis << ")";
+		top_shape[slice_axis_] = bottom_slice_axis / top.size();
+		for (int i = 0; i < top.size(); ++i) {
+			top[i]->Reshape(top_shape);
+			count += top[i]->count();
+		}
+	}
+	CHECK_EQ(count, bottom[0]->count());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SliceLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  int offset_slice_axis = 0;
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-  for (int i = 0; i < top.size(); ++i) {
-    Dtype* top_data = top[i]->mutable_cpu_data();
-    const int top_slice_axis = top[i]->shape(slice_axis_);
-    for (int n = 0; n < num_slices_; ++n) {
-      const int top_offset = n * top_slice_axis * slice_size_;
-      const int bottom_offset =
-          (n * bottom_slice_axis + offset_slice_axis) * slice_size_;
-      caffe_copy(top_slice_axis * slice_size_,
-          bottom_data + bottom_offset, top_data + top_offset);
-    }
-    offset_slice_axis += top_slice_axis;
-  }
+	const vector<Blob<Dtype>*>& top) {
+	int offset_slice_axis = 0;
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+	for (int i = 0; i < top.size(); ++i) {
+		Dtype* top_data = top[i]->mutable_cpu_data();
+		const int top_slice_axis = top[i]->shape(slice_axis_);
+		for (int n = 0; n < num_slices_; ++n) {
+			const int top_offset = n * top_slice_axis * slice_size_;
+			const int bottom_offset =
+				(n * bottom_slice_axis + offset_slice_axis) * slice_size_;
+			caffe_copy(top_slice_axis * slice_size_,
+				bottom_data + bottom_offset, top_data + top_offset);
+		}
+		offset_slice_axis += top_slice_axis;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  int offset_slice_axis = 0;
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-  for (int i = 0; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->cpu_diff();
-    const int top_slice_axis = top[i]->shape(slice_axis_);
-    for (int n = 0; n < num_slices_; ++n) {
-      const int top_offset = n * top_slice_axis * slice_size_;
-      const int bottom_offset =
-          (n * bottom_slice_axis + offset_slice_axis) * slice_size_;
-      caffe_copy(top_slice_axis * slice_size_,
-          top_diff + top_offset, bottom_diff + bottom_offset);
-    }
-    offset_slice_axis += top_slice_axis;
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	int offset_slice_axis = 0;
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+	for (int i = 0; i < top.size(); ++i) {
+		const Dtype* top_diff = top[i]->cpu_diff();
+		const int top_slice_axis = top[i]->shape(slice_axis_);
+		for (int n = 0; n < num_slices_; ++n) {
+			const int top_offset = n * top_slice_axis * slice_size_;
+			const int bottom_offset =
+				(n * bottom_slice_axis + offset_slice_axis) * slice_size_;
+			caffe_copy(top_slice_axis * slice_size_,
+				top_diff + top_offset, bottom_diff + bottom_offset);
+		}
+		offset_slice_axis += top_slice_axis;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
+	const vector<Blob<Dtype>*>& top) {
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(SliceLayer);
 #endif
 
-INSTANTIATE_CLASS(SliceLayer);
-REGISTER_LAYER_CLASS(Slice);
+INSTANTIATE_CLASS (SliceLayer);
+REGISTER_LAYER_CLASS (Slice);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 27c18b7b..117a966f 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -1,4 +1,4 @@
-#include <algorithm>
+s#include <algorithm>
 #include <vector>
 
 #include "caffe/layer.hpp"
@@ -7,152 +7,151 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  softmax_axis_ =
-      bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
-  top[0]->ReshapeLike(*bottom[0]);
-  vector<int> mult_dims(1, bottom[0]->shape(softmax_axis_));
-  sum_multiplier_.Reshape(mult_dims);
-  Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
-  caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
-  outer_num_ = bottom[0]->count(0, softmax_axis_);
-  inner_num_ = bottom[0]->count(softmax_axis_ + 1);
-  vector<int> scale_dims = bottom[0]->shape();
-  scale_dims[softmax_axis_] = 1;
-  scale_.Reshape(scale_dims);
+	const vector<Blob<Dtype>*>& top) {
+	softmax_axis_ =
+		bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
+	top[0]->ReshapeLike(*bottom[0]);
+	vector<int> mult_dims(1, bottom[0]->shape(softmax_axis_));
+	sum_multiplier_.Reshape(mult_dims);
+	Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
+	caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
+	outer_num_ = bottom[0]->count(0, softmax_axis_);
+	inner_num_ = bottom[0]->count(softmax_axis_ + 1);
+	vector<int> scale_dims = bottom[0]->shape();
+	scale_dims[softmax_axis_] = 1;
+	scale_.Reshape(scale_dims);
 }
 
-
-template <typename Dtype>
-SoftmaxLayer<Dtype>::~SoftmaxLayer(){
+template<typename Dtype>
+SoftmaxLayer<Dtype>::~SoftmaxLayer() {
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  Dtype* scale_data = scale_.mutable_cpu_data();
-  int channels = bottom[0]->shape(softmax_axis_);
-  int dim = bottom[0]->count() / outer_num_;
-  caffe_copy(bottom[0]->count(), bottom_data, top_data);
-  // We need to subtract the max to avoid numerical issues, compute the exp,
-  // and then normalize.
-  for (int i = 0; i < outer_num_; ++i) {
-    // initialize scale_data to the first plane
-    caffe_copy(inner_num_, bottom_data + i * dim, scale_data);
-    for (int j = 0; j < channels; j++) {
-      for (int k = 0; k < inner_num_; k++) {
-        scale_data[k] = std::max(scale_data[k],
-            bottom_data[i * dim + j * inner_num_ + k]);
-      }
-    }
-    // subtraction
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, inner_num_,
-        1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
-    // exponentiation
-    caffe_exp<Dtype>(dim, top_data, top_data);
-    // sum after exp
-    caffe_cpu_gemv<Dtype>(CblasTrans, channels, inner_num_, 1.,
-        top_data, sum_multiplier_.cpu_data(), 0., scale_data);
-    // division
-    for (int j = 0; j < channels; j++) {
-      caffe_div(inner_num_, top_data, scale_data, top_data);
-      top_data += inner_num_;
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	Dtype* scale_data = scale_.mutable_cpu_data();
+	int channels = bottom[0]->shape(softmax_axis_);
+	int dim = bottom[0]->count() / outer_num_;
+	caffe_copy(bottom[0]->count(), bottom_data, top_data);
+	// We need to subtract the max to avoid numerical issues, compute the exp,
+	// and then normalize.
+	for (int i = 0; i < outer_num_; ++i) {
+		// initialize scale_data to the first plane
+		caffe_copy(inner_num_, bottom_data + i * dim, scale_data);
+		for (int j = 0; j < channels; j++) {
+			for (int k = 0; k < inner_num_; k++) {
+				scale_data[k] = std::max(scale_data[k],
+					bottom_data[i * dim + j * inner_num_ + k]);
+			}
+		}
+		// subtraction
+		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, channels, inner_num_,
+			1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
+		// exponentiation
+		caffe_exp < Dtype > (dim, top_data, top_data);
+		// sum after exp
+		caffe_cpu_gemv < Dtype > (CblasTrans, channels, inner_num_, 1.,
+			top_data, sum_multiplier_.cpu_data(), 0., scale_data);
+		// division
+		for (int j = 0; j < channels; j++) {
+			caffe_div(inner_num_, top_data, scale_data, top_data);
+			top_data += inner_num_;
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->cpu_diff();
-  const Dtype* top_data = top[0]->cpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  Dtype* scale_data = scale_.mutable_cpu_data();
-  int channels = top[0]->shape(softmax_axis_);
-  int dim = top[0]->count() / outer_num_;
-  caffe_copy(top[0]->count(), top_diff, bottom_diff);
-  for (int i = 0; i < outer_num_; ++i) {
-    // compute dot(top_diff, top_data) and subtract them from the bottom diff
-    for (int k = 0; k < inner_num_; ++k) {
-      scale_data[k] = caffe_cpu_strided_dot<Dtype>(channels,
-          bottom_diff + i * dim + k, inner_num_,
-          top_data + i * dim + k, inner_num_);
-    }
-    // subtraction
-    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels, inner_num_, 1,
-        -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim);
-  }
-  // elementwise multiplication
-  caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* top_diff = top[0]->cpu_diff();
+	const Dtype* top_data = top[0]->cpu_data();
+	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+	Dtype* scale_data = scale_.mutable_cpu_data();
+	int channels = top[0]->shape(softmax_axis_);
+	int dim = top[0]->count() / outer_num_;
+	caffe_copy(top[0]->count(), top_diff, bottom_diff);
+	for (int i = 0; i < outer_num_; ++i) {
+		// compute dot(top_diff, top_data) and subtract them from the bottom diff
+		for (int k = 0; k < inner_num_; ++k) {
+			scale_data[k] = caffe_cpu_strided_dot < Dtype > (channels,
+				bottom_diff + i * dim + k, inner_num_,
+				top_data + i * dim + k, inner_num_);
+		}
+		// subtraction
+		caffe_cpu_gemm < Dtype
+			> (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1,
+				-1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim);
+	}
+	// elementwise multiplication
+	caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  int count = bottom[0]->count();
-  int channels = top[0]->shape(softmax_axis_);
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	Dtype* scale_data = scale_.mutable_gpu_data();
+	int count = bottom[0]->count();
+	int channels = top[0]->shape(softmax_axis_);
+
+	caffe_gpu_copy(count, bottom_data, top_data);
+	// We need to subtract the max to avoid numerical issues, compute the exp,
+	// and then normalize.
+	// compute max
+	// NOLINT_NEXT_LINE(whitespace/operators)
 
-  caffe_gpu_copy(count, bottom_data, top_data);
-  // We need to subtract the max to avoid numerical issues, compute the exp,
-  // and then normalize.
-  // compute max
-  // NOLINT_NEXT_LINE(whitespace/operators)
- 
-  kernel_channel_max<Dtype>(outer_num_, channels, inner_num_, top_data,
-      scale_data);
-  // subtract
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype>(count, outer_num_, channels, inner_num_,
-      scale_data, top_data);
-  // exponentiate
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_exp<Dtype>(count, top_data, top_data);
-  // sum after exp
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_sum<Dtype>(outer_num_, channels, inner_num_, top_data,
-      scale_data);
-  // divide
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_div<Dtype>(count, outer_num_, channels, inner_num_,
-      scale_data, top_data);
+	kernel_channel_max < Dtype > (outer_num_, channels, inner_num_, top_data,
+		scale_data);
+	// subtract
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_,
+		scale_data, top_data);
+	// exponentiate
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_exp < Dtype > (count, top_data, top_data);
+	// sum after exp
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_channel_sum < Dtype > (outer_num_, channels, inner_num_, top_data,
+		scale_data);
+	// divide
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_channel_div < Dtype > (count, outer_num_, channels, inner_num_,
+		scale_data, top_data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* top_data = top[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  int count = top[0]->count();
-  int channels = top[0]->shape(softmax_axis_);
-  caffe_gpu_copy(count, top_diff, bottom_diff);
-  // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
-  // NOLINT_NEXT_LINE(whitespace/operators)
- 
-  kernel_channel_dot<Dtype>(outer_num_, channels, inner_num_,
-      top_diff, top_data, scale_data);
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_channel_subtract<Dtype>(count, outer_num_, channels, inner_num_,
-      scale_data, bottom_diff);
-  // elementwise multiplication
-  caffe_gpu_mul<Dtype>(top[0]->count(), bottom_diff, top_data, bottom_diff);
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	const Dtype* top_diff = top[0]->gpu_diff();
+	const Dtype* top_data = top[0]->gpu_data();
+	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+	Dtype* scale_data = scale_.mutable_gpu_data();
+	int count = top[0]->count();
+	int channels = top[0]->shape(softmax_axis_);
+	caffe_gpu_copy(count, top_diff, bottom_diff);
+	// Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
+	// NOLINT_NEXT_LINE(whitespace/operators)
 
-}
+	kernel_channel_dot < Dtype > (outer_num_, channels, inner_num_,
+		top_diff, top_data, scale_data);
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_,
+		scale_data, bottom_diff);
+	// elementwise multiplication
+	caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff);
 
+}
 
 #ifdef CPU_ONLY
 STUB_GPU(SoftmaxLayer);
 #endif
 
-INSTANTIATE_CLASS(SoftmaxLayer);
+INSTANTIATE_CLASS (SoftmaxLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 66ac9ea5..b998c2f6 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -9,201 +9,202 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::LayerSetUp(bottom, top);
-  LayerParameter softmax_param(this->layer_param_);
-  softmax_param.set_type("Softmax");
-  softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_param);
-  softmax_bottom_vec_.clear();
-  softmax_bottom_vec_.push_back(bottom[0]);
-  softmax_top_vec_.clear();
-  softmax_top_vec_.push_back(&prob_);
-  softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);
-
-  has_ignore_label_ =
-    this->layer_param_.loss_param().has_ignore_label();
-  if (has_ignore_label_) {
-    ignore_label_ = this->layer_param_.loss_param().ignore_label();
-  }
-  normalize_ = this->layer_param_.loss_param().normalize();
-
-  ocl_setup();
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::LayerSetUp(bottom, top);
+	LayerParameter softmax_param(this->layer_param_);
+	softmax_param.set_type("Softmax");
+	softmax_layer_ = LayerRegistry < Dtype > ::CreateLayer(softmax_param);
+	softmax_bottom_vec_.clear();
+	softmax_bottom_vec_.push_back(bottom[0]);
+	softmax_top_vec_.clear();
+	softmax_top_vec_.push_back(&prob_);
+	softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);
+
+	has_ignore_label_ =
+		this->layer_param_.loss_param().has_ignore_label();
+	if (has_ignore_label_) {
+		ignore_label_ = this->layer_param_.loss_param().ignore_label();
+	}
+	normalize_ = this->layer_param_.loss_param().normalize();
+
+	ocl_setup();
 }
 
-template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::ocl_setup(){
-   d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, sizeof(Dtype), NULL, NULL);
+template<typename Dtype>
+void SoftmaxWithLossLayer<Dtype>::ocl_setup() {
+	d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+		sizeof(Dtype), NULL, NULL);
 
 }
 
-template <typename Dtype>
-SoftmaxWithLossLayer<Dtype>::~SoftmaxWithLossLayer(){
+template<typename Dtype>
+SoftmaxWithLossLayer<Dtype>::~SoftmaxWithLossLayer() {
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Reshape(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::Reshape(bottom, top);
-  softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);
-  softmax_axis_ =
-      bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
-  outer_num_ = bottom[0]->count(0, softmax_axis_);
-  inner_num_ = bottom[0]->count(softmax_axis_ + 1);
-  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
-      << "Number of labels must match number of predictions; "
-      << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), "
-      << "label count (number of labels) must be N*H*W, "
-      << "with integer values in {0, 1, ..., C-1}.";
-  if (top.size() >= 2) {
-    // softmax output
-    top[1]->ReshapeLike(*bottom[0]);
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	LossLayer < Dtype > ::Reshape(bottom, top);
+	softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);
+	softmax_axis_ =
+		bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
+	outer_num_ = bottom[0]->count(0, softmax_axis_);
+	inner_num_ = bottom[0]->count(softmax_axis_ + 1);
+	CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
+		<< "Number of labels must match number of predictions; "
+		<< "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), "
+		<< "label count (number of labels) must be N*H*W, "
+		<< "with integer values in {0, 1, ..., C-1}.";
+	if (top.size() >= 2) {
+		// softmax output
+		top[1]->ReshapeLike(*bottom[0]);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // The forward pass computes the softmax prob values.
-  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
-  const Dtype* prob_data = prob_.cpu_data();
-  const Dtype* label = bottom[1]->cpu_data();
-  int dim = prob_.count() / outer_num_;
-  int count = 0;
-  Dtype loss = 0;
-  for (int i = 0; i < outer_num_; ++i) {
-    for (int j = 0; j < inner_num_; j++) {
-      const int label_value = static_cast<int>(label[i * inner_num_ + j]);
-      if (has_ignore_label_ && label_value == ignore_label_) {
-        continue;
-      }
-      DCHECK_GE(label_value, 0);
-      DCHECK_LT(label_value, prob_.shape(softmax_axis_));
-      loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j],
-                           Dtype(FLT_MIN)));
-      ++count;
-    }
-  }
-  if (normalize_) {
-    top[0]->mutable_cpu_data()[0] = loss / count;
-  } else {
-    top[0]->mutable_cpu_data()[0] = loss / outer_num_;
-  }
-  if (top.size() == 2) {
-    top[1]->ShareData(prob_);
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	// The forward pass computes the softmax prob values.
+	softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+	const Dtype* prob_data = prob_.cpu_data();
+	const Dtype* label = bottom[1]->cpu_data();
+	int dim = prob_.count() / outer_num_;
+	int count = 0;
+	Dtype loss = 0;
+	for (int i = 0; i < outer_num_; ++i) {
+		for (int j = 0; j < inner_num_; j++) {
+			const int label_value = static_cast<int>(label[i * inner_num_ + j]);
+			if (has_ignore_label_ && label_value == ignore_label_) {
+				continue;
+			}
+			DCHECK_GE(label_value, 0);
+			DCHECK_LT(label_value, prob_.shape(softmax_axis_));
+			loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j],
+				Dtype(FLT_MIN)));
+			++count;
+		}
+	}
+	if (normalize_) {
+		top[0]->mutable_cpu_data()[0] = loss / count;
+	} else {
+		top[0]->mutable_cpu_data()[0] = loss / outer_num_;
+	}
+	if (top.size() == 2) {
+		top[1]->ShareData(prob_);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const Dtype* prob_data = prob_.cpu_data();
-    caffe_copy(prob_.count(), prob_data, bottom_diff);
-    const Dtype* label = bottom[1]->cpu_data();
-    int dim = prob_.count() / outer_num_;
-    int count = 0;
-    for (int i = 0; i < outer_num_; ++i) {
-      for (int j = 0; j < inner_num_; ++j) {
-        const int label_value = static_cast<int>(label[i * inner_num_ + j]);
-        if (has_ignore_label_ && label_value == ignore_label_) {
-          for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
-            bottom_diff[i * dim + c * inner_num_ + j] = 0;
-          }
-        } else {
-          bottom_diff[i * dim + label_value * inner_num_ + j] -= 1;
-          ++count;
-        }
-      }
-    }
-    // Scale gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    if (normalize_) {
-      caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
-    } else {
-      caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[1]) {
+		LOG(FATAL) << this->type()
+			<< " Layer cannot backpropagate to label inputs.";
+	}
+	if (propagate_down[0]) {
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		const Dtype* prob_data = prob_.cpu_data();
+		caffe_copy(prob_.count(), prob_data, bottom_diff);
+		const Dtype* label = bottom[1]->cpu_data();
+		int dim = prob_.count() / outer_num_;
+		int count = 0;
+		for (int i = 0; i < outer_num_; ++i) {
+			for (int j = 0; j < inner_num_; ++j) {
+				const int label_value = static_cast<int>(label[i * inner_num_ + j]);
+				if (has_ignore_label_ && label_value == ignore_label_) {
+					for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
+						bottom_diff[i * dim + c * inner_num_ + j] = 0;
+					}
+				} else {
+					bottom_diff[i * dim + label_value * inner_num_ + j] -= 1;
+					++count;
+				}
+			}
+		}
+		// Scale gradient
+		const Dtype loss_weight = top[0]->cpu_diff()[0];
+		if (normalize_) {
+			caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
+		} else {
+			caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
-  const Dtype* prob_data = prob_.gpu_data();
-  const Dtype* label = bottom[1]->gpu_data();
-  const int dim = prob_.count() / outer_num_;
-  const int nthreads = outer_num_ * inner_num_;
-  // Since this memory is not used for anything until it is overwritten
-  // on the backward pass, we use it here to avoid having to allocate new GPU
-  // memory to accumulate intermediate results in the kernel.
-  Dtype* loss_data = bottom[0]->mutable_gpu_diff();
-  // Similarly, this memory is never used elsewhere, and thus we can use it
-  // to avoid having to allocate additional GPU memory.
-  Dtype* counts = prob_.mutable_gpu_diff();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  SoftmaxLossForwardGPU<Dtype>( nthreads, prob_data, label, loss_data,
-       outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-  Dtype loss;
-  caffe_gpu_asum(nthreads, loss_data, &loss);
-  if (normalize_) {
-    Dtype count;
-    caffe_gpu_asum(nthreads, counts, &count);
-    loss /= count;
-  } else {
-    loss /= outer_num_;
-  }
-  printf("loss = %f\n", loss);
-  top[0]->mutable_cpu_data()[0] = loss;
-  if (top.size() == 2) {
-    top[1]->ShareData(prob_);
-  }
+	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+	const Dtype* prob_data = prob_.gpu_data();
+	const Dtype* label = bottom[1]->gpu_data();
+	const int dim = prob_.count() / outer_num_;
+	const int nthreads = outer_num_ * inner_num_;
+	// Since this memory is not used for anything until it is overwritten
+	// on the backward pass, we use it here to avoid having to allocate new GPU
+	// memory to accumulate intermediate results in the kernel.
+	Dtype* loss_data = bottom[0]->mutable_gpu_diff();
+	// Similarly, this memory is never used elsewhere, and thus we can use it
+	// to avoid having to allocate additional GPU memory.
+	Dtype* counts = prob_.mutable_gpu_diff();
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	SoftmaxLossForwardGPU < Dtype > (nthreads, prob_data, label, loss_data,
+		outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
+	Dtype loss;
+	caffe_gpu_asum(nthreads, loss_data, &loss);
+	if (normalize_) {
+		Dtype count;
+		caffe_gpu_asum(nthreads, counts, &count);
+		loss /= count;
+	} else {
+		loss /= outer_num_;
+	}
+	printf("loss = %f\n", loss);
+	top[0]->mutable_cpu_data()[0] = loss;
+	if (top.size() == 2) {
+		top[1]->ShareData(prob_);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[1]) {
-    LOG(FATAL) << this->type()
-               << " Layer cannot backpropagate to label inputs.";
-  }
-  if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const Dtype* prob_data = prob_.gpu_data();
-    const Dtype* top_data = top[0]->gpu_data();
-    caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
-    //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff);
-    const Dtype* label = bottom[1]->gpu_data();
-    const int dim = prob_.count() / outer_num_;
-    const int nthreads = outer_num_ * inner_num_;
-    // Since this memory is never used for anything else,
-    // we use to to avoid allocating new GPU memory.
-    Dtype* counts = prob_.mutable_gpu_diff();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    SoftmaxLossBackwardGPU<Dtype>(nthreads, top_data, label, bottom_diff,
-           outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    if (normalize_) {
-      Dtype count;
-      caffe_gpu_asum(nthreads, counts, &count);
-      caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
-    } else {
-      caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
-    }
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[1]) {
+		LOG(FATAL) << this->type()
+			<< " Layer cannot backpropagate to label inputs.";
+	}
+	if (propagate_down[0]) {
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		const Dtype* prob_data = prob_.gpu_data();
+		const Dtype* top_data = top[0]->gpu_data();
+		caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
+		//caffe_gpu_copy(prob_.count(), prob_data, bottom_diff);
+		const Dtype* label = bottom[1]->gpu_data();
+		const int dim = prob_.count() / outer_num_;
+		const int nthreads = outer_num_ * inner_num_;
+		// Since this memory is never used for anything else,
+		// we use to to avoid allocating new GPU memory.
+		Dtype* counts = prob_.mutable_gpu_diff();
+		// NOLINT_NEXT_LINE(whitespace/operators)
+		SoftmaxLossBackwardGPU < Dtype > (nthreads, top_data, label, bottom_diff,
+			outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
+		const Dtype loss_weight = top[0]->cpu_diff()[0];
+		if (normalize_) {
+			Dtype count;
+			caffe_gpu_asum(nthreads, counts, &count);
+			caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
+		} else {
+			caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
+		}
+	}
 }
 
 #ifdef CPU_ONLY
 STUB_GPU(SoftmaxWithLossLayer);
 #endif
 
-INSTANTIATE_CLASS(SoftmaxWithLossLayer);
-REGISTER_LAYER_CLASS(SoftmaxWithLoss);
+INSTANTIATE_CLASS (SoftmaxWithLossLayer);
+REGISTER_LAYER_CLASS (SoftmaxWithLoss);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 4b60db10..0ad8179a 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -6,74 +6,79 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void SplitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  count_ = bottom[0]->count();
-  for (int i = 0; i < top.size(); ++i) {
-    // Do not allow in-place computation in the SplitLayer.  Instead, share data
-    // by reference in the forward pass, and keep separate diff allocations in
-    // the backward pass.  (Technically, it should be possible to share the diff
-    // blob of the first split output with the input, but this seems to cause
-    // some strange effects in practice...)
-    CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not "
-        "allow in-place computation.";
-    top[i]->ReshapeLike(*bottom[0]);
-    CHECK_EQ(count_, top[i]->count());
-  }
-  gpu_add_kernel = clCreateKernel(amdDevice.Program,"caffe_gpu_add_float",NULL);
+	const vector<Blob<Dtype>*>& top) {
+	count_ = bottom[0]->count();
+	for (int i = 0; i < top.size(); ++i) {
+		// Do not allow in-place computation in the SplitLayer.  Instead, share data
+		// by reference in the forward pass, and keep separate diff allocations in
+		// the backward pass.  (Technically, it should be possible to share the diff
+		// blob of the first split output with the input, but this seems to cause
+		// some strange effects in practice...)
+		CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not "
+			"allow in-place computation.";
+		top[i]->ReshapeLike(*bottom[0]);
+		CHECK_EQ(count_, top[i]->count());
+	}
+	gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float",
+		NULL);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SplitLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  for (int i = 0; i < top.size(); ++i) {
-    top[i]->ShareData(*bottom[0]);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	for (int i = 0; i < top.size(); ++i) {
+		top[i]->ShareData(*bottom[0]);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
-  if (top.size() == 1) {
-    caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff());
-    return;
-  }
-  caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(),
-            bottom[0]->mutable_cpu_diff());
-  // Add remaining top blob diffs.
-  for (int i = 2; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->cpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	if (top.size() == 1) {
+		caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff());
+		return;
+	}
+	caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(),
+		bottom[0]->mutable_cpu_diff());
+	// Add remaining top blob diffs.
+	for (int i = 2; i < top.size(); ++i) {
+		const Dtype* top_diff = top[i]->cpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  for (int i = 0; i < top.size(); ++i) {
-    top[i]->ShareData(*bottom[0]);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	for (int i = 0; i < top.size(); ++i) {
+		top[i]->ShareData(*bottom[0]);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  if (!propagate_down[0]) { return; }
-  if (top.size() == 1) {
-    caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
-    return;
-  }
-  caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
-                bottom[0]->mutable_gpu_diff());
-  // Add remaining top blob diffs.
-  for (int i = 2; i < top.size(); ++i) {
-    const Dtype* top_diff = top[i]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
-  }
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	if (top.size() == 1) {
+		caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
+		return;
+	}
+	caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
+		bottom[0]->mutable_gpu_diff());
+	// Add remaining top blob diffs.
+	for (int i = 2; i < top.size(); ++i) {
+		const Dtype* top_diff = top[i]->gpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
+	}
 
 }
 
@@ -81,7 +86,7 @@ void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 STUB_GPU(SplitLayer);
 #endif
 
-INSTANTIATE_CLASS(SplitLayer);
-REGISTER_LAYER_CLASS(Split);
+INSTANTIATE_CLASS (SplitLayer);
+REGISTER_LAYER_CLASS (Split);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp
index 795dd716..bfc7778c 100644
--- a/src/caffe/layers/spp_layer.cpp
+++ b/src/caffe/layers/spp_layer.cpp
@@ -13,181 +13,180 @@ namespace caffe {
 using std::min;
 using std::max;
 
-template <typename Dtype>
+template<typename Dtype>
 LayerParameter SPPLayer<Dtype>::GetPoolingParam(const int pyramid_level,
-      const int bottom_h, const int bottom_w, const SPPParameter spp_param) {
-  LayerParameter pooling_param;
-  int num_bins = pow(2, pyramid_level);
-
-  // find padding and kernel size so that the pooling is
-  // performed across the entire image
-  int kernel_h = ceil(bottom_h / static_cast<double>(num_bins));
-  // remainder_h is the min number of pixels that need to be padded before
-  // entire image height is pooled over with the chosen kernel dimension
-  int remainder_h = kernel_h * num_bins - bottom_h;
-  // pooling layer pads (2 * pad_h) pixels on the top and bottom of the
-  // image.
-  int pad_h = (remainder_h + 1) / 2;
-
-  // similar logic for width
-  int kernel_w = ceil(bottom_w / static_cast<double>(num_bins));
-  int remainder_w = kernel_w * num_bins - bottom_w;
-  int pad_w = (remainder_w + 1) / 2;
-
-  pooling_param.mutable_pooling_param()->set_pad_h(pad_h);
-  pooling_param.mutable_pooling_param()->set_pad_w(pad_w);
-  pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h);
-  pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w);
-  pooling_param.mutable_pooling_param()->set_stride_h(kernel_h);
-  pooling_param.mutable_pooling_param()->set_stride_w(kernel_w);
-
-  switch (spp_param.pool()) {
-  case SPPParameter_PoolMethod_MAX:
-    pooling_param.mutable_pooling_param()->set_pool(
-        PoolingParameter_PoolMethod_MAX);
-    break;
-  case SPPParameter_PoolMethod_AVE:
-    pooling_param.mutable_pooling_param()->set_pool(
-        PoolingParameter_PoolMethod_AVE);
-    break;
-  case SPPParameter_PoolMethod_STOCHASTIC:
-    pooling_param.mutable_pooling_param()->set_pool(
-        PoolingParameter_PoolMethod_STOCHASTIC);
-    break;
-  default:
-    LOG(FATAL) << "Unknown pooling method.";
-  }
-
-  return pooling_param;
+	const int bottom_h, const int bottom_w, const SPPParameter spp_param) {
+	LayerParameter pooling_param;
+	int num_bins = pow(2, pyramid_level);
+
+	// find padding and kernel size so that the pooling is
+	// performed across the entire image
+	int kernel_h = ceil(bottom_h / static_cast<double>(num_bins));
+	// remainder_h is the min number of pixels that need to be padded before
+	// entire image height is pooled over with the chosen kernel dimension
+	int remainder_h = kernel_h * num_bins - bottom_h;
+	// pooling layer pads (2 * pad_h) pixels on the top and bottom of the
+	// image.
+	int pad_h = (remainder_h + 1) / 2;
+
+	// similar logic for width
+	int kernel_w = ceil(bottom_w / static_cast<double>(num_bins));
+	int remainder_w = kernel_w * num_bins - bottom_w;
+	int pad_w = (remainder_w + 1) / 2;
+
+	pooling_param.mutable_pooling_param()->set_pad_h(pad_h);
+	pooling_param.mutable_pooling_param()->set_pad_w(pad_w);
+	pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h);
+	pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w);
+	pooling_param.mutable_pooling_param()->set_stride_h(kernel_h);
+	pooling_param.mutable_pooling_param()->set_stride_w(kernel_w);
+
+	switch (spp_param.pool()) {
+		case SPPParameter_PoolMethod_MAX:
+			pooling_param.mutable_pooling_param()->set_pool(
+				PoolingParameter_PoolMethod_MAX);
+			break;
+		case SPPParameter_PoolMethod_AVE:
+			pooling_param.mutable_pooling_param()->set_pool(
+				PoolingParameter_PoolMethod_AVE);
+			break;
+		case SPPParameter_PoolMethod_STOCHASTIC:
+			pooling_param.mutable_pooling_param()->set_pool(
+				PoolingParameter_PoolMethod_STOCHASTIC);
+			break;
+		default:
+			LOG(FATAL) << "Unknown pooling method.";
+	}
+
+	return pooling_param;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SPPLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  SPPParameter spp_param = this->layer_param_.spp_param();
-
-  bottom_h_ = bottom[0]->height();
-  bottom_w_ = bottom[0]->width();
-  CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero.";
-  CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero.";
-
-  pyramid_height_ = spp_param.pyramid_height();
-  split_top_vec_.clear();
-  pooling_bottom_vecs_.clear();
-  pooling_layers_.clear();
-  pooling_top_vecs_.clear();
-  pooling_outputs_.clear();
-  flatten_layers_.clear();
-  flatten_top_vecs_.clear();
-  flatten_outputs_.clear();
-  concat_bottom_vec_.clear();
-
-  // split layer output holders setup
-  for (int i = 0; i < pyramid_height_; i++) {
-    split_top_vec_.push_back(new Blob<Dtype>());
-  }
-
-  // split layer setup
-  LayerParameter split_param;
-  split_layer_.reset(new SplitLayer<Dtype>(split_param));
-  split_layer_->SetUp(bottom, split_top_vec_);
-
-  for (int i = 0; i < pyramid_height_; i++) {
-    // pooling layer input holders setup
-    pooling_bottom_vecs_.push_back(new vector<Blob<Dtype>*>);
-    pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]);
-
-    // pooling layer output holders setup
-    pooling_outputs_.push_back(new Blob<Dtype>());
-    pooling_top_vecs_.push_back(new vector<Blob<Dtype>*>);
-    pooling_top_vecs_[i]->push_back(pooling_outputs_[i]);
-
-    // pooling layer setup
-    LayerParameter pooling_param = GetPoolingParam(
-        i, bottom_h_, bottom_w_, spp_param);
-
-    pooling_layers_.push_back(shared_ptr<PoolingLayer<Dtype> > (
-        new PoolingLayer<Dtype>(pooling_param)));
-    pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
-
-    // flatten layer output holders setup
-    flatten_outputs_.push_back(new Blob<Dtype>());
-    flatten_top_vecs_.push_back(new vector<Blob<Dtype>*>);
-    flatten_top_vecs_[i]->push_back(flatten_outputs_[i]);
-
-    // flatten layer setup
-    LayerParameter flatten_param;
-    flatten_layers_.push_back(new FlattenLayer<Dtype>(flatten_param));
-    flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
-
-    // concat layer input holders setup
-    concat_bottom_vec_.push_back(flatten_outputs_[i]);
-  }
-
-  // concat layer setup
-  LayerParameter concat_param;
-  concat_layer_.reset(new ConcatLayer<Dtype>(concat_param));
-  concat_layer_->SetUp(concat_bottom_vec_, top);
+	const vector<Blob<Dtype>*>& top) {
+	SPPParameter spp_param = this->layer_param_.spp_param();
+
+	bottom_h_ = bottom[0]->height();
+	bottom_w_ = bottom[0]->width();
+	CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero.";
+	CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero.";
+
+	pyramid_height_ = spp_param.pyramid_height();
+	split_top_vec_.clear();
+	pooling_bottom_vecs_.clear();
+	pooling_layers_.clear();
+	pooling_top_vecs_.clear();
+	pooling_outputs_.clear();
+	flatten_layers_.clear();
+	flatten_top_vecs_.clear();
+	flatten_outputs_.clear();
+	concat_bottom_vec_.clear();
+
+	// split layer output holders setup
+	for (int i = 0; i < pyramid_height_; i++) {
+		split_top_vec_.push_back(new Blob<Dtype>());
+	}
+
+	// split layer setup
+	LayerParameter split_param;
+	split_layer_.reset(new SplitLayer<Dtype>(split_param));
+	split_layer_->SetUp(bottom, split_top_vec_);
+
+	for (int i = 0; i < pyramid_height_; i++) {
+		// pooling layer input holders setup
+		pooling_bottom_vecs_.push_back(new vector<Blob<Dtype>*>);
+		pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]);
+
+		// pooling layer output holders setup
+		pooling_outputs_.push_back(new Blob<Dtype>());
+		pooling_top_vecs_.push_back(new vector<Blob<Dtype>*>);
+		pooling_top_vecs_[i]->push_back(pooling_outputs_[i]);
+
+		// pooling layer setup
+		LayerParameter pooling_param = GetPoolingParam(
+			i, bottom_h_, bottom_w_, spp_param);
+
+		pooling_layers_.push_back(shared_ptr < PoolingLayer<Dtype> > (
+			new PoolingLayer<Dtype>(pooling_param)));
+		pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+
+		// flatten layer output holders setup
+		flatten_outputs_.push_back(new Blob<Dtype>());
+		flatten_top_vecs_.push_back(new vector<Blob<Dtype>*>);
+		flatten_top_vecs_[i]->push_back(flatten_outputs_[i]);
+
+		// flatten layer setup
+		LayerParameter flatten_param;
+		flatten_layers_.push_back(new FlattenLayer<Dtype>(flatten_param));
+		flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
+
+		// concat layer input holders setup
+		concat_bottom_vec_.push_back(flatten_outputs_[i]);
+	}
+
+	// concat layer setup
+	LayerParameter concat_param;
+	concat_layer_.reset(new ConcatLayer<Dtype>(concat_param));
+	concat_layer_->SetUp(concat_bottom_vec_, top);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SPPLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  channels_ = bottom[0]->channels();
-  bottom_h_ = bottom[0]->height();
-  bottom_w_ = bottom[0]->width();
-  SPPParameter spp_param = this->layer_param_.spp_param();
-  split_layer_->Reshape(bottom, split_top_vec_);
-  for (int i = 0; i < pyramid_height_; i++) {
-    LayerParameter pooling_param = GetPoolingParam(
-        i, bottom_h_, bottom_w_, spp_param);
-
-    pooling_layers_[i].reset(
-        new PoolingLayer<Dtype>(pooling_param));
-    pooling_layers_[i]->SetUp(
-        *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
-    pooling_layers_[i]->Reshape(
-        *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
-    flatten_layers_[i]->Reshape(
-        *pooling_top_vecs_[i], *flatten_top_vecs_[i]);
-  }
-  concat_layer_->Reshape(concat_bottom_vec_, top);
+	const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+		<< "corresponding to (num, channels, height, width)";
+	channels_ = bottom[0]->channels();
+	bottom_h_ = bottom[0]->height();
+	bottom_w_ = bottom[0]->width();
+	SPPParameter spp_param = this->layer_param_.spp_param();
+	split_layer_->Reshape(bottom, split_top_vec_);
+	for (int i = 0; i < pyramid_height_; i++) {
+		LayerParameter pooling_param = GetPoolingParam(
+			i, bottom_h_, bottom_w_, spp_param);
+
+		pooling_layers_[i].reset(
+			new PoolingLayer<Dtype>(pooling_param));
+		pooling_layers_[i]->SetUp(
+			*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+		pooling_layers_[i]->Reshape(
+			*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+		flatten_layers_[i]->Reshape(
+			*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
+	}
+	concat_layer_->Reshape(concat_bottom_vec_, top);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SPPLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  split_layer_->Forward(bottom, split_top_vec_);
-  for (int i = 0; i < pyramid_height_; i++) {
-    pooling_layers_[i]->Forward(
-        *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
-    flatten_layers_[i]->Forward(
-        *pooling_top_vecs_[i], *flatten_top_vecs_[i]);
-  }
-  concat_layer_->Forward(concat_bottom_vec_, top);
+	const vector<Blob<Dtype>*>& top) {
+	split_layer_->Forward(bottom, split_top_vec_);
+	for (int i = 0; i < pyramid_height_; i++) {
+		pooling_layers_[i]->Forward(
+			*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+		flatten_layers_[i]->Forward(
+			*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
+	}
+	concat_layer_->Forward(concat_bottom_vec_, top);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SPPLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-  vector<bool> concat_propagate_down(pyramid_height_, true);
-  concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_);
-  for (int i = 0; i < pyramid_height_; i++) {
-    flatten_layers_[i]->Backward(
-        *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]);
-    pooling_layers_[i]->Backward(
-        *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]);
-  }
-  split_layer_->Backward(split_top_vec_, propagate_down, bottom);
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (!propagate_down[0]) {
+		return;
+	}
+	vector<bool> concat_propagate_down(pyramid_height_, true);
+	concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_);
+	for (int i = 0; i < pyramid_height_; i++) {
+		flatten_layers_[i]->Backward(
+			*flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]);
+		pooling_layers_[i]->Backward(
+			*pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]);
+	}
+	split_layer_->Backward(split_top_vec_, propagate_down, bottom);
 }
 
-
-INSTANTIATE_CLASS(SPPLayer);
-REGISTER_LAYER_CLASS(SPP);
+INSTANTIATE_CLASS (SPPLayer);
+REGISTER_LAYER_CLASS (SPP);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp
index a922adbd..16405761 100644
--- a/src/caffe/layers/tanh_layer.cpp
+++ b/src/caffe/layers/tanh_layer.cpp
@@ -10,63 +10,61 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void TanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int count = bottom[0]->count();
-  for (int i = 0; i < count; ++i) {
-    top_data[i] = tanh(bottom_data[i]);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const int count = bottom[0]->count();
+	for (int i = 0; i < count; ++i) {
+		top_data[i] = tanh(bottom_data[i]);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->cpu_data();
-    const Dtype* top_diff = top[0]->cpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const int count = bottom[0]->count();
-    Dtype tanhx;
-    for (int i = 0; i < count; ++i) {
-      tanhx = top_data[i];
-      bottom_diff[i] = top_diff[i] * (1 - tanhx * tanhx);
-    }
-  }
+	const vector<bool>& propagate_down,
+	const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* top_data = top[0]->cpu_data();
+		const Dtype* top_diff = top[0]->cpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+		const int count = bottom[0]->count();
+		Dtype tanhx;
+		for (int i = 0; i < count; ++i) {
+			tanhx = top_data[i];
+			bottom_diff[i] = top_diff[i] * (1 - tanhx * tanhx);
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top){
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  TanHForward(count, bottom_data, top_data);
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const int count = bottom[0]->count();
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	TanHForward(count, bottom_data, top_data);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
-  if (propagate_down[0]) {
-    const Dtype* top_data = top[0]->gpu_data();
-    const Dtype* top_diff = top[0]->gpu_diff();
-    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-    const int count = bottom[0]->count();
-    // NOLINT_NEXT_LINE(whitespace/operators)
-    TanHBackward(count, top_diff, top_data, bottom_diff);
+	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+	if (propagate_down[0]) {
+		const Dtype* top_data = top[0]->gpu_data();
+		const Dtype* top_diff = top[0]->gpu_diff();
+		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+		const int count = bottom[0]->count();
+		// NOLINT_NEXT_LINE(whitespace/operators)
+		TanHBackward(count, top_diff, top_data, bottom_diff);
+	}
 }
-}
-
-
 
 #ifdef CPU_ONLY
 STUB_GPU(TanHLayer);
 #endif
 
-INSTANTIATE_CLASS(TanHLayer);
+INSTANTIATE_CLASS (TanHLayer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp
index b3e1bea7..ca14de00 100644
--- a/src/caffe/layers/threshold_layer.cpp
+++ b/src/caffe/layers/threshold_layer.cpp
@@ -4,42 +4,41 @@
 #include "caffe/vision_layers.hpp"
 #include "caffe/util/ocl_wrapper.hpp"
 
-
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 void ThresholdLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
-  threshold_ = this->layer_param_.threshold_param().threshold();
+	const vector<Blob<Dtype>*>& top) {
+	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+	threshold_ = this->layer_param_.threshold_param().threshold();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
-  Dtype* top_data = top[0]->mutable_cpu_data();
-  const int count = bottom[0]->count();
-  for (int i = 0; i < count; ++i) {
-    top_data[i] = (bottom_data[i] > threshold_) ? Dtype(1) : Dtype(0);
-  }
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->cpu_data();
+	Dtype* top_data = top[0]->mutable_cpu_data();
+	const int count = bottom[0]->count();
+	for (int i = 0; i < count; ++i) {
+		top_data[i] = (bottom_data[i] > threshold_) ? Dtype(1) : Dtype(0);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top){
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  const int count = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  ThresholdForward(count, threshold_, bottom_data, top_data);
+	const vector<Blob<Dtype>*>& top) {
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	const int count = bottom[0]->count();
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	ThresholdForward(count, threshold_, bottom_data, top_data);
 }
 
 #ifdef CPU_ONLY
 STUB_GPU_FORWARD(ThresholdLayer, Forward);
 #endif
 
-INSTANTIATE_CLASS(ThresholdLayer);
-REGISTER_LAYER_CLASS(Threshold);
+INSTANTIATE_CLASS (ThresholdLayer);
+REGISTER_LAYER_CLASS (Threshold);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index cc7dc79d..0525b640 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -25,409 +25,410 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 WindowDataLayer<Dtype>::~WindowDataLayer<Dtype>() {
-  this->JoinPrefetchThread();
+	this->JoinPrefetchThread();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  // LayerSetUp runs through the window_file and creates two structures
-  // that hold windows: one for foreground (object) windows and one
-  // for background (non-object) windows. We use an overlap threshold
-  // to decide which is which.
-
-  // window_file format
-  // repeated:
-  //    # image_index
-  //    img_path (abs path)
-  //    channels
-  //    height
-  //    width
-  //    num_windows
-  //    class_index overlap x1 y1 x2 y2
-
-  LOG(INFO) << "Window data layer:" << std::endl
-      << "  foreground (object) overlap threshold: "
-      << this->layer_param_.window_data_param().fg_threshold() << std::endl
-      << "  background (non-object) overlap threshold: "
-      << this->layer_param_.window_data_param().bg_threshold() << std::endl
-      << "  foreground sampling fraction: "
-      << this->layer_param_.window_data_param().fg_fraction() << std::endl
-      << "  cache_images: "
-      << this->layer_param_.window_data_param().cache_images() << std::endl
-      << "  root_folder: "
-      << this->layer_param_.window_data_param().root_folder();
-
-  cache_images_ = this->layer_param_.window_data_param().cache_images();
-  string root_folder = this->layer_param_.window_data_param().root_folder();
-
-  const bool prefetch_needs_rand =
-      this->transform_param_.mirror() ||
-      this->transform_param_.crop_size();
-  if (prefetch_needs_rand) {
-    const unsigned int prefetch_rng_seed = caffe_rng_rand();
-    prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
-  } else {
-    prefetch_rng_.reset();
-  }
-
-  std::ifstream infile(this->layer_param_.window_data_param().source().c_str());
-  CHECK(infile.good()) << "Failed to open window file "
-      << this->layer_param_.window_data_param().source() << std::endl;
-
-  map<int, int> label_hist;
-  label_hist.insert(std::make_pair(0, 0));
-
-  string hashtag;
-  int image_index, channels;
-  if (!(infile >> hashtag >> image_index)) {
-    LOG(FATAL) << "Window file is empty";
-  }
-  do {
-    CHECK_EQ(hashtag, "#");
-    // read image path
-    string image_path;
-    infile >> image_path;
-    image_path = root_folder + image_path;
-    // read image dimensions
-    vector<int> image_size(3);
-    infile >> image_size[0] >> image_size[1] >> image_size[2];
-    channels = image_size[0];
-    image_database_.push_back(std::make_pair(image_path, image_size));
-
-    if (cache_images_) {
-      Datum datum;
-      if (!ReadFileToDatum(image_path, &datum)) {
-        LOG(ERROR) << "Could not open or find file " << image_path;
-        return;
-      }
-      image_database_cache_.push_back(std::make_pair(image_path, datum));
-    }
-    // read each box
-    int num_windows;
-    infile >> num_windows;
-    const float fg_threshold =
-        this->layer_param_.window_data_param().fg_threshold();
-    const float bg_threshold =
-        this->layer_param_.window_data_param().bg_threshold();
-    for (int i = 0; i < num_windows; ++i) {
-      int label, x1, y1, x2, y2;
-      float overlap;
-      infile >> label >> overlap >> x1 >> y1 >> x2 >> y2;
-
-      vector<float> window(WindowDataLayer::NUM);
-      window[WindowDataLayer::IMAGE_INDEX] = image_index;
-      window[WindowDataLayer::LABEL] = label;
-      window[WindowDataLayer::OVERLAP] = overlap;
-      window[WindowDataLayer::X1] = x1;
-      window[WindowDataLayer::Y1] = y1;
-      window[WindowDataLayer::X2] = x2;
-      window[WindowDataLayer::Y2] = y2;
-
-      // add window to foreground list or background list
-      if (overlap >= fg_threshold) {
-        int label = window[WindowDataLayer::LABEL];
-        CHECK_GT(label, 0);
-        fg_windows_.push_back(window);
-        label_hist.insert(std::make_pair(label, 0));
-        label_hist[label]++;
-      } else if (overlap < bg_threshold) {
-        // background window, force label and overlap to 0
-        window[WindowDataLayer::LABEL] = 0;
-        window[WindowDataLayer::OVERLAP] = 0;
-        bg_windows_.push_back(window);
-        label_hist[0]++;
-      }
-    }
-
-    if (image_index % 100 == 0) {
-      LOG(INFO) << "num: " << image_index << " "
-          << image_path << " "
-          << image_size[0] << " "
-          << image_size[1] << " "
-          << image_size[2] << " "
-          << "windows to process: " << num_windows;
-    }
-  } while (infile >> hashtag >> image_index);
-
-  LOG(INFO) << "Number of images: " << image_index+1;
-
-  for (map<int, int>::iterator it = label_hist.begin();
-      it != label_hist.end(); ++it) {
-    LOG(INFO) << "class " << it->first << " has " << label_hist[it->first]
-              << " samples";
-  }
-
-  LOG(INFO) << "Amount of context padding: "
-      << this->layer_param_.window_data_param().context_pad();
-
-  LOG(INFO) << "Crop mode: "
-      << this->layer_param_.window_data_param().crop_mode();
-
-  // image
-  const int crop_size = this->transform_param_.crop_size();
-  CHECK_GT(crop_size, 0);
-  const int batch_size = this->layer_param_.window_data_param().batch_size();
-  top[0]->Reshape(batch_size, channels, crop_size, crop_size);
-  this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size);
-
-  LOG(INFO) << "output data size: " << top[0]->num() << ","
-      << top[0]->channels() << "," << top[0]->height() << ","
-      << top[0]->width();
-  // label
-  vector<int> label_shape(1, batch_size);
-  top[1]->Reshape(label_shape);
-  this->prefetch_label_.Reshape(label_shape);
-
-  // data mean
-  has_mean_file_ = this->transform_param_.has_mean_file();
-  has_mean_values_ = this->transform_param_.mean_value_size() > 0;
-  if (has_mean_file_) {
-    const string& mean_file =
-          this->transform_param_.mean_file();
-    LOG(INFO) << "Loading mean file from: " << mean_file;
-    BlobProto blob_proto;
-    ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
-    data_mean_.FromProto(blob_proto);
-  }
-  if (has_mean_values_) {
-    CHECK(has_mean_file_ == false) <<
-      "Cannot specify mean_file and mean_value at the same time";
-    for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) {
-      mean_values_.push_back(this->transform_param_.mean_value(c));
-    }
-    CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) <<
-     "Specify either 1 mean_value or as many as channels: " << channels;
-    if (channels > 1 && mean_values_.size() == 1) {
-      // Replicate the mean_value for simplicity
-      for (int c = 1; c < channels; ++c) {
-        mean_values_.push_back(mean_values_[0]);
-      }
-    }
-  }
+	const vector<Blob<Dtype>*>& top) {
+	// LayerSetUp runs through the window_file and creates two structures
+	// that hold windows: one for foreground (object) windows and one
+	// for background (non-object) windows. We use an overlap threshold
+	// to decide which is which.
+
+	// window_file format
+	// repeated:
+	//    # image_index
+	//    img_path (abs path)
+	//    channels
+	//    height
+	//    width
+	//    num_windows
+	//    class_index overlap x1 y1 x2 y2
+
+	LOG(INFO) << "Window data layer:" << std::endl
+		<< "  foreground (object) overlap threshold: "
+		<< this->layer_param_.window_data_param().fg_threshold() << std::endl
+		<< "  background (non-object) overlap threshold: "
+		<< this->layer_param_.window_data_param().bg_threshold() << std::endl
+		<< "  foreground sampling fraction: "
+		<< this->layer_param_.window_data_param().fg_fraction() << std::endl
+		<< "  cache_images: "
+		<< this->layer_param_.window_data_param().cache_images() << std::endl
+		<< "  root_folder: "
+		<< this->layer_param_.window_data_param().root_folder();
+
+	cache_images_ = this->layer_param_.window_data_param().cache_images();
+	string root_folder = this->layer_param_.window_data_param().root_folder();
+
+	const bool prefetch_needs_rand =
+		this->transform_param_.mirror() ||
+			this->transform_param_.crop_size();
+	if (prefetch_needs_rand) {
+		const unsigned int prefetch_rng_seed = caffe_rng_rand();
+		prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
+	} else {
+		prefetch_rng_.reset();
+	}
+
+	std::ifstream infile(this->layer_param_.window_data_param().source().c_str());
+	CHECK(infile.good()) << "Failed to open window file "
+		<< this->layer_param_.window_data_param().source() << std::endl;
+
+	map<int, int> label_hist;
+	label_hist.insert(std::make_pair(0, 0));
+
+	string hashtag;
+	int image_index, channels;
+	if (!(infile >> hashtag >> image_index)) {
+		LOG(FATAL) << "Window file is empty";
+	}
+	do {
+		CHECK_EQ(hashtag, "#");
+		// read image path
+		string image_path;
+		infile >> image_path;
+		image_path = root_folder + image_path;
+		// read image dimensions
+		vector<int> image_size(3);
+		infile >> image_size[0] >> image_size[1] >> image_size[2];
+		channels = image_size[0];
+		image_database_.push_back(std::make_pair(image_path, image_size));
+
+		if (cache_images_) {
+			Datum datum;
+			if (!ReadFileToDatum(image_path, &datum)) {
+				LOG(ERROR) << "Could not open or find file " << image_path;
+				return;
+			}
+			image_database_cache_.push_back(std::make_pair(image_path, datum));
+		}
+		// read each box
+		int num_windows;
+		infile >> num_windows;
+		const float fg_threshold =
+			this->layer_param_.window_data_param().fg_threshold();
+		const float bg_threshold =
+			this->layer_param_.window_data_param().bg_threshold();
+		for (int i = 0; i < num_windows; ++i) {
+			int label, x1, y1, x2, y2;
+			float overlap;
+			infile >> label >> overlap >> x1 >> y1 >> x2 >> y2;
+
+			vector<float> window(WindowDataLayer::NUM);
+			window[WindowDataLayer::IMAGE_INDEX] = image_index;
+			window[WindowDataLayer::LABEL] = label;
+			window[WindowDataLayer::OVERLAP] = overlap;
+			window[WindowDataLayer::X1] = x1;
+			window[WindowDataLayer::Y1] = y1;
+			window[WindowDataLayer::X2] = x2;
+			window[WindowDataLayer::Y2] = y2;
+
+			// add window to foreground list or background list
+			if (overlap >= fg_threshold) {
+				int label = window[WindowDataLayer::LABEL];
+				CHECK_GT(label, 0);
+				fg_windows_.push_back(window);
+				label_hist.insert(std::make_pair(label, 0));
+				label_hist[label]++;
+			} else if (overlap < bg_threshold) {
+				// background window, force label and overlap to 0
+				window[WindowDataLayer::LABEL] = 0;
+				window[WindowDataLayer::OVERLAP] = 0;
+				bg_windows_.push_back(window);
+				label_hist[0]++;
+			}
+		}
+
+		if (image_index % 100 == 0) {
+			LOG(INFO) << "num: " << image_index << " "
+				<< image_path << " "
+				<< image_size[0] << " "
+				<< image_size[1] << " "
+				<< image_size[2] << " "
+				<< "windows to process: " << num_windows;
+		}
+	} while (infile >> hashtag >> image_index);
+
+	LOG(INFO) << "Number of images: " << image_index + 1;
+
+	for (map<int, int>::iterator it = label_hist.begin();
+		it != label_hist.end(); ++it) {
+		LOG(INFO) << "class " << it->first << " has " << label_hist[it->first]
+			<< " samples";
+	}
+
+	LOG(INFO) << "Amount of context padding: "
+		<< this->layer_param_.window_data_param().context_pad();
+
+	LOG(INFO) << "Crop mode: "
+		<< this->layer_param_.window_data_param().crop_mode();
+
+	// image
+	const int crop_size = this->transform_param_.crop_size();
+	CHECK_GT(crop_size, 0);
+	const int batch_size = this->layer_param_.window_data_param().batch_size();
+	top[0]->Reshape(batch_size, channels, crop_size, crop_size);
+	this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size);
+
+	LOG(INFO) << "output data size: " << top[0]->num() << ","
+		<< top[0]->channels() << "," << top[0]->height() << ","
+		<< top[0]->width();
+	// label
+	vector<int> label_shape(1, batch_size);
+	top[1]->Reshape(label_shape);
+	this->prefetch_label_.Reshape(label_shape);
+
+	// data mean
+	has_mean_file_ = this->transform_param_.has_mean_file();
+	has_mean_values_ = this->transform_param_.mean_value_size() > 0;
+	if (has_mean_file_) {
+		const string& mean_file =
+			this->transform_param_.mean_file();
+		LOG(INFO) << "Loading mean file from: " << mean_file;
+		BlobProto blob_proto;
+		ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
+		data_mean_.FromProto(blob_proto);
+	}
+	if (has_mean_values_) {
+		CHECK(has_mean_file_ == false) <<
+			"Cannot specify mean_file and mean_value at the same time";
+		for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) {
+			mean_values_.push_back(this->transform_param_.mean_value(c));
+		}
+		CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) <<
+			"Specify either 1 mean_value or as many as channels: " << channels;
+		if (channels > 1 && mean_values_.size() == 1) {
+			// Replicate the mean_value for simplicity
+			for (int c = 1; c < channels; ++c) {
+				mean_values_.push_back(mean_values_[0]);
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 unsigned int WindowDataLayer<Dtype>::PrefetchRand() {
-  CHECK(prefetch_rng_);
-  caffe::rng_t* prefetch_rng =
-      static_cast<caffe::rng_t*>(prefetch_rng_->generator());
-  return (*prefetch_rng)();
+	CHECK (prefetch_rng_);
+	caffe::rng_t* prefetch_rng =
+		static_cast<caffe::rng_t*>(prefetch_rng_->generator());
+	return (*prefetch_rng)();
 }
 
 // Thread fetching the data
-template <typename Dtype>
+template<typename Dtype>
 void WindowDataLayer<Dtype>::InternalThreadEntry() {
-  // At each iteration, sample N windows where N*p are foreground (object)
-  // windows and N*(1-p) are background (non-object) windows
-  CPUTimer batch_timer;
-  batch_timer.Start();
-  double read_time = 0;
-  double trans_time = 0;
-  CPUTimer timer;
-  Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
-  Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
-  const Dtype scale = this->layer_param_.window_data_param().scale();
-  const int batch_size = this->layer_param_.window_data_param().batch_size();
-  const int context_pad = this->layer_param_.window_data_param().context_pad();
-  const int crop_size = this->transform_param_.crop_size();
-  const bool mirror = this->transform_param_.mirror();
-  const float fg_fraction =
-      this->layer_param_.window_data_param().fg_fraction();
-  Dtype* mean = NULL;
-  int mean_off = 0;
-  int mean_width = 0;
-  int mean_height = 0;
-  if (this->has_mean_file_) {
-    mean = this->data_mean_.mutable_cpu_data();
-    mean_off = (this->data_mean_.width() - crop_size) / 2;
-    mean_width = this->data_mean_.width();
-    mean_height = this->data_mean_.height();
-  }
-  cv::Size cv_crop_size(crop_size, crop_size);
-  const string& crop_mode = this->layer_param_.window_data_param().crop_mode();
-
-  bool use_square = (crop_mode == "square") ? true : false;
-
-  // zero out batch
-  caffe_set(this->prefetch_data_.count(), Dtype(0), top_data);
-
-  const int num_fg = static_cast<int>(static_cast<float>(batch_size)
-      * fg_fraction);
-  const int num_samples[2] = { batch_size - num_fg, num_fg };
-
-  int item_id = 0;
-  // sample from bg set then fg set
-  for (int is_fg = 0; is_fg < 2; ++is_fg) {
-    for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) {
-      // sample a window
-      timer.Start();
-      const unsigned int rand_index = PrefetchRand();
-      vector<float> window = (is_fg) ?
-          fg_windows_[rand_index % fg_windows_.size()] :
-          bg_windows_[rand_index % bg_windows_.size()];
-
-      bool do_mirror = mirror && PrefetchRand() % 2;
-
-      // load the image containing the window
-      pair<std::string, vector<int> > image =
-          image_database_[window[WindowDataLayer<Dtype>::IMAGE_INDEX]];
-
-      cv::Mat cv_img;
-      if (this->cache_images_) {
-        pair<std::string, Datum> image_cached =
-          image_database_cache_[window[WindowDataLayer<Dtype>::IMAGE_INDEX]];
-        cv_img = DecodeDatumToCVMat(image_cached.second, true);
-      } else {
-        cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR);
-        if (!cv_img.data) {
-          LOG(ERROR) << "Could not open or find file " << image.first;
-          return;
-        }
-      }
-      read_time += timer.MicroSeconds();
-      timer.Start();
-      const int channels = cv_img.channels();
-
-      // crop window out of image and warp it
-      int x1 = window[WindowDataLayer<Dtype>::X1];
-      int y1 = window[WindowDataLayer<Dtype>::Y1];
-      int x2 = window[WindowDataLayer<Dtype>::X2];
-      int y2 = window[WindowDataLayer<Dtype>::Y2];
-
-      int pad_w = 0;
-      int pad_h = 0;
-      if (context_pad > 0 || use_square) {
-        // scale factor by which to expand the original region
-        // such that after warping the expanded region to crop_size x crop_size
-        // there's exactly context_pad amount of padding on each side
-        Dtype context_scale = static_cast<Dtype>(crop_size) /
-            static_cast<Dtype>(crop_size - 2*context_pad);
-
-        // compute the expanded region
-        Dtype half_height = static_cast<Dtype>(y2-y1+1)/2.0;
-        Dtype half_width = static_cast<Dtype>(x2-x1+1)/2.0;
-        Dtype center_x = static_cast<Dtype>(x1) + half_width;
-        Dtype center_y = static_cast<Dtype>(y1) + half_height;
-        if (use_square) {
-          if (half_height > half_width) {
-            half_width = half_height;
-          } else {
-            half_height = half_width;
-          }
-        }
-        x1 = static_cast<int>(round(center_x - half_width*context_scale));
-        x2 = static_cast<int>(round(center_x + half_width*context_scale));
-        y1 = static_cast<int>(round(center_y - half_height*context_scale));
-        y2 = static_cast<int>(round(center_y + half_height*context_scale));
-
-        // the expanded region may go outside of the image
-        // so we compute the clipped (expanded) region and keep track of
-        // the extent beyond the image
-        int unclipped_height = y2-y1+1;
-        int unclipped_width = x2-x1+1;
-        int pad_x1 = std::max(0, -x1);
-        int pad_y1 = std::max(0, -y1);
-        int pad_x2 = std::max(0, x2 - cv_img.cols + 1);
-        int pad_y2 = std::max(0, y2 - cv_img.rows + 1);
-        // clip bounds
-        x1 = x1 + pad_x1;
-        x2 = x2 - pad_x2;
-        y1 = y1 + pad_y1;
-        y2 = y2 - pad_y2;
-        CHECK_GT(x1, -1);
-        CHECK_GT(y1, -1);
-        CHECK_LT(x2, cv_img.cols);
-        CHECK_LT(y2, cv_img.rows);
-
-        int clipped_height = y2-y1+1;
-        int clipped_width = x2-x1+1;
-
-        // scale factors that would be used to warp the unclipped
-        // expanded region
-        Dtype scale_x =
-            static_cast<Dtype>(crop_size)/static_cast<Dtype>(unclipped_width);
-        Dtype scale_y =
-            static_cast<Dtype>(crop_size)/static_cast<Dtype>(unclipped_height);
-
-        // size to warp the clipped expanded region to
-        cv_crop_size.width =
-            static_cast<int>(round(static_cast<Dtype>(clipped_width)*scale_x));
-        cv_crop_size.height =
-            static_cast<int>(round(static_cast<Dtype>(clipped_height)*scale_y));
-        pad_x1 = static_cast<int>(round(static_cast<Dtype>(pad_x1)*scale_x));
-        pad_x2 = static_cast<int>(round(static_cast<Dtype>(pad_x2)*scale_x));
-        pad_y1 = static_cast<int>(round(static_cast<Dtype>(pad_y1)*scale_y));
-        pad_y2 = static_cast<int>(round(static_cast<Dtype>(pad_y2)*scale_y));
-
-        pad_h = pad_y1;
-        // if we're mirroring, we mirror the padding too (to be pedantic)
-        if (do_mirror) {
-          pad_w = pad_x2;
-        } else {
-          pad_w = pad_x1;
-        }
-
-        // ensure that the warped, clipped region plus the padding fits in the
-        // crop_size x crop_size image (it might not due to rounding)
-        if (pad_h + cv_crop_size.height > crop_size) {
-          cv_crop_size.height = crop_size - pad_h;
-        }
-        if (pad_w + cv_crop_size.width > crop_size) {
-          cv_crop_size.width = crop_size - pad_w;
-        }
-      }
-
-      cv::Rect roi(x1, y1, x2-x1+1, y2-y1+1);
-      cv::Mat cv_cropped_img = cv_img(roi);
-      cv::resize(cv_cropped_img, cv_cropped_img,
-          cv_crop_size, 0, 0, cv::INTER_LINEAR);
-
-      // horizontal flip at random
-      if (do_mirror) {
-        cv::flip(cv_cropped_img, cv_cropped_img, 1);
-      }
-
-      // copy the warped window into top_data
-      for (int h = 0; h < cv_cropped_img.rows; ++h) {
-        const uchar* ptr = cv_cropped_img.ptr<uchar>(h);
-        int img_index = 0;
-        for (int w = 0; w < cv_cropped_img.cols; ++w) {
-          for (int c = 0; c < channels; ++c) {
-            int top_index = ((item_id * channels + c) * crop_size + h + pad_h)
-                     * crop_size + w + pad_w;
-            // int top_index = (c * height + h) * width + w;
-            Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
-            if (this->has_mean_file_) {
-              int mean_index = (c * mean_height + h + mean_off + pad_h)
-                           * mean_width + w + mean_off + pad_w;
-              top_data[top_index] = (pixel - mean[mean_index]) * scale;
-            } else {
-              if (this->has_mean_values_) {
-                top_data[top_index] = (pixel - this->mean_values_[c]) * scale;
-              } else {
-                top_data[top_index] = pixel * scale;
-              }
-            }
-          }
-        }
-      }
-      trans_time += timer.MicroSeconds();
-      // get window label
-      top_label[item_id] = window[WindowDataLayer<Dtype>::LABEL];
-
-      item_id++;
-    }
-  }
-  batch_timer.Stop();
-  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
-  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
-  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
+	// At each iteration, sample N windows where N*p are foreground (object)
+	// windows and N*(1-p) are background (non-object) windows
+	CPUTimer batch_timer;
+	batch_timer.Start();
+	double read_time = 0;
+	double trans_time = 0;
+	CPUTimer timer;
+	Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
+	Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
+	const Dtype scale = this->layer_param_.window_data_param().scale();
+	const int batch_size = this->layer_param_.window_data_param().batch_size();
+	const int context_pad = this->layer_param_.window_data_param().context_pad();
+	const int crop_size = this->transform_param_.crop_size();
+	const bool mirror = this->transform_param_.mirror();
+	const float fg_fraction =
+		this->layer_param_.window_data_param().fg_fraction();
+	Dtype* mean = NULL;
+	int mean_off = 0;
+	int mean_width = 0;
+	int mean_height = 0;
+	if (this->has_mean_file_) {
+		mean = this->data_mean_.mutable_cpu_data();
+		mean_off = (this->data_mean_.width() - crop_size) / 2;
+		mean_width = this->data_mean_.width();
+		mean_height = this->data_mean_.height();
+	}
+	cv::Size cv_crop_size(crop_size, crop_size);
+	const string& crop_mode = this->layer_param_.window_data_param().crop_mode();
+
+	bool use_square = (crop_mode == "square") ? true : false;
+
+	// zero out batch
+	caffe_set(this->prefetch_data_.count(), Dtype(0), top_data);
+
+	const int num_fg = static_cast<int>(static_cast<float>(batch_size)
+		* fg_fraction);
+	const int num_samples[2] = { batch_size - num_fg, num_fg };
+
+	int item_id = 0;
+	// sample from bg set then fg set
+	for (int is_fg = 0; is_fg < 2; ++is_fg) {
+		for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) {
+			// sample a window
+			timer.Start();
+			const unsigned int rand_index = PrefetchRand();
+			vector<float> window =
+				(is_fg) ?
+									fg_windows_[rand_index % fg_windows_.size()] :
+									bg_windows_[rand_index % bg_windows_.size()];
+
+			bool do_mirror = mirror && PrefetchRand() % 2;
+
+			// load the image containing the window
+			pair<std::string, vector<int> > image =
+				image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
+
+			cv::Mat cv_img;
+			if (this->cache_images_) {
+				pair < std::string, Datum > image_cached =
+					image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
+				cv_img = DecodeDatumToCVMat(image_cached.second, true);
+			} else {
+				cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR);
+				if (!cv_img.data) {
+					LOG(ERROR) << "Could not open or find file " << image.first;
+					return;
+				}
+			}
+			read_time += timer.MicroSeconds();
+			timer.Start();
+			const int channels = cv_img.channels();
+
+			// crop window out of image and warp it
+			int x1 = window[WindowDataLayer < Dtype > ::X1];
+			int y1 = window[WindowDataLayer < Dtype > ::Y1];
+			int x2 = window[WindowDataLayer < Dtype > ::X2];
+			int y2 = window[WindowDataLayer < Dtype > ::Y2];
+
+			int pad_w = 0;
+			int pad_h = 0;
+			if (context_pad > 0 || use_square) {
+				// scale factor by which to expand the original region
+				// such that after warping the expanded region to crop_size x crop_size
+				// there's exactly context_pad amount of padding on each side
+				Dtype context_scale = static_cast<Dtype>(crop_size) /
+					static_cast<Dtype>(crop_size - 2 * context_pad);
+
+				// compute the expanded region
+				Dtype half_height = static_cast<Dtype>(y2 - y1 + 1) / 2.0;
+				Dtype half_width = static_cast<Dtype>(x2 - x1 + 1) / 2.0;
+				Dtype center_x = static_cast<Dtype>(x1) + half_width;
+				Dtype center_y = static_cast<Dtype>(y1) + half_height;
+				if (use_square) {
+					if (half_height > half_width) {
+						half_width = half_height;
+					} else {
+						half_height = half_width;
+					}
+				}
+				x1 = static_cast<int>(round(center_x - half_width * context_scale));
+				x2 = static_cast<int>(round(center_x + half_width * context_scale));
+				y1 = static_cast<int>(round(center_y - half_height * context_scale));
+				y2 = static_cast<int>(round(center_y + half_height * context_scale));
+
+				// the expanded region may go outside of the image
+				// so we compute the clipped (expanded) region and keep track of
+				// the extent beyond the image
+				int unclipped_height = y2 - y1 + 1;
+				int unclipped_width = x2 - x1 + 1;
+				int pad_x1 = std::max(0, -x1);
+				int pad_y1 = std::max(0, -y1);
+				int pad_x2 = std::max(0, x2 - cv_img.cols + 1);
+				int pad_y2 = std::max(0, y2 - cv_img.rows + 1);
+				// clip bounds
+				x1 = x1 + pad_x1;
+				x2 = x2 - pad_x2;
+				y1 = y1 + pad_y1;
+				y2 = y2 - pad_y2;
+				CHECK_GT(x1, -1);
+				CHECK_GT(y1, -1);
+				CHECK_LT(x2, cv_img.cols);
+				CHECK_LT(y2, cv_img.rows);
+
+				int clipped_height = y2 - y1 + 1;
+				int clipped_width = x2 - x1 + 1;
+
+				// scale factors that would be used to warp the unclipped
+				// expanded region
+				Dtype scale_x =
+					static_cast<Dtype>(crop_size) / static_cast<Dtype>(unclipped_width);
+				Dtype scale_y =
+					static_cast<Dtype>(crop_size) / static_cast<Dtype>(unclipped_height);
+
+				// size to warp the clipped expanded region to
+				cv_crop_size.width =
+					static_cast<int>(round(static_cast<Dtype>(clipped_width) * scale_x));
+				cv_crop_size.height =
+					static_cast<int>(round(static_cast<Dtype>(clipped_height) * scale_y));
+				pad_x1 = static_cast<int>(round(static_cast<Dtype>(pad_x1) * scale_x));
+				pad_x2 = static_cast<int>(round(static_cast<Dtype>(pad_x2) * scale_x));
+				pad_y1 = static_cast<int>(round(static_cast<Dtype>(pad_y1) * scale_y));
+				pad_y2 = static_cast<int>(round(static_cast<Dtype>(pad_y2) * scale_y));
+
+				pad_h = pad_y1;
+				// if we're mirroring, we mirror the padding too (to be pedantic)
+				if (do_mirror) {
+					pad_w = pad_x2;
+				} else {
+					pad_w = pad_x1;
+				}
+
+				// ensure that the warped, clipped region plus the padding fits in the
+				// crop_size x crop_size image (it might not due to rounding)
+				if (pad_h + cv_crop_size.height > crop_size) {
+					cv_crop_size.height = crop_size - pad_h;
+				}
+				if (pad_w + cv_crop_size.width > crop_size) {
+					cv_crop_size.width = crop_size - pad_w;
+				}
+			}
+
+			cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1);
+			cv::Mat cv_cropped_img = cv_img(roi);
+			cv::resize(cv_cropped_img, cv_cropped_img,
+				cv_crop_size, 0, 0, cv::INTER_LINEAR);
+
+			// horizontal flip at random
+			if (do_mirror) {
+				cv::flip(cv_cropped_img, cv_cropped_img, 1);
+			}
+
+			// copy the warped window into top_data
+			for (int h = 0; h < cv_cropped_img.rows; ++h) {
+				const uchar* ptr = cv_cropped_img.ptr < uchar > (h);
+				int img_index = 0;
+				for (int w = 0; w < cv_cropped_img.cols; ++w) {
+					for (int c = 0; c < channels; ++c) {
+						int top_index = ((item_id * channels + c) * crop_size + h + pad_h)
+							* crop_size + w + pad_w;
+						// int top_index = (c * height + h) * width + w;
+						Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
+						if (this->has_mean_file_) {
+							int mean_index = (c * mean_height + h + mean_off + pad_h)
+								* mean_width + w + mean_off + pad_w;
+							top_data[top_index] = (pixel - mean[mean_index]) * scale;
+						} else {
+							if (this->has_mean_values_) {
+								top_data[top_index] = (pixel - this->mean_values_[c]) * scale;
+							} else {
+								top_data[top_index] = pixel * scale;
+							}
+						}
+					}
+				}
+			}
+			trans_time += timer.MicroSeconds();
+			// get window label
+			top_label[item_id] = window[WindowDataLayer < Dtype > ::LABEL];
+
+			item_id++;
+		}
+	}
+	batch_timer.Stop();
+	DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
+	DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
+	DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
 }
 
-INSTANTIATE_CLASS(WindowDataLayer);
-REGISTER_LAYER_CLASS(WindowData);
+INSTANTIATE_CLASS (WindowDataLayer);
+REGISTER_LAYER_CLASS (WindowData);
 
 }  // namespace caffe
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index e070d774..53ec5461 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -19,863 +19,901 @@
 
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 Net<Dtype>::Net(const NetParameter& param) {
-  Init(param);
+	Init(param);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Net<Dtype>::Net(const string& param_file, Phase phase) {
-  NetParameter param;
-  ReadNetParamsFromTextFileOrDie(param_file, &param);
-  param.mutable_state()->set_phase(phase);
-  Init(param);
+	NetParameter param;
+	ReadNetParamsFromTextFileOrDie(param_file, &param);
+	param.mutable_state()->set_phase(phase);
+	Init(param);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
-  // Set phase from the state.
-  phase_ = in_param.state().phase();
-  // Filter layers based on their include/exclude rules and
-  // the current NetState.
-  NetParameter filtered_param;
-  FilterNet(in_param, &filtered_param);
-  LOG(INFO) << "Initializing net from parameters: " << std::endl
-            << filtered_param.DebugString();
-  // Create a copy of filtered_param with splits added where necessary.
-  NetParameter param;
-  InsertSplits(filtered_param, &param);
-  // Basically, build all the layers and set up their connections.
-  name_ = param.name();
-  map<string, int> blob_name_to_idx;
-  set<string> available_blobs;
-  CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0)
-      << "Must specify either input_shape OR deprecated input_dim, not both.";
-  if (param.input_dim_size() > 0) {
-    // Deprecated 4D dimensions.
-    CHECK_EQ(param.input_size() * 4, param.input_dim_size())
-        << "Incorrect input blob dimension specifications.";
-  } else {
-    CHECK_EQ(param.input_size(), param.input_shape_size())
-        << "Exactly one input_shape must be specified per input.";
-  }
-  memory_used_ = 0;
-  // set the input blobs
-  for (int input_id = 0; input_id < param.input_size(); ++input_id) {
-    const int layer_id = -1;  // inputs have fake layer ID -1
-    AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
-  }
-  DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-  // For each layer, set up its input and output
-  bottom_vecs_.resize(param.layer_size());
-  top_vecs_.resize(param.layer_size());
-  bottom_id_vecs_.resize(param.layer_size());
-  param_id_vecs_.resize(param.layer_size());
-  top_id_vecs_.resize(param.layer_size());
-  bottom_need_backward_.resize(param.layer_size());
-  for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
-    // Inherit phase from net if unset.
-    if (!param.layer(layer_id).has_phase()) {
-      param.mutable_layer(layer_id)->set_phase(phase_);
-    }
-    // Setup layer.
-    const LayerParameter& layer_param = param.layer(layer_id);
-    if (layer_param.propagate_down_size() > 0) {
-      CHECK_EQ(layer_param.propagate_down_size(),
-          layer_param.bottom_size())
-          << "propagate_down param must be specified "
-          << "either 0 or bottom_size times ";
-    }
-    layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
-    layer_names_.push_back(layer_param.name());
-    LOG(INFO) << "Creating Layer " << layer_param.name();
-    bool need_backward = false;
-
-    // Figure out this layer's input and output
-    for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
-         ++bottom_id) {
-      const int blob_id = AppendBottom(param, layer_id, bottom_id,
-                                       &available_blobs, &blob_name_to_idx);
-      // If a blob needs backward, this layer should provide it.
-      need_backward |= blob_need_backward_[blob_id];
-    }
-    int num_top = layer_param.top_size();
-    for (int top_id = 0; top_id < num_top; ++top_id) {
-      AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx);
-    }
-    // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter
-    // specified fewer than the required number (as specified by
-    // ExactNumTopBlobs() or MinTopBlobs()), allocate them here.
-    Layer<Dtype>* layer = layers_[layer_id].get();
-    if (layer->AutoTopBlobs()) {
-      const int needed_num_top =
-          std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
-      for (; num_top < needed_num_top; ++num_top) {
-        // Add "anonymous" top blobs -- do not modify available_blobs or
-        // blob_name_to_idx as we don't want these blobs to be usable as input
-        // to other layers.
-        AppendTop(param, layer_id, num_top, NULL, NULL);
-      }
-    }
-    // After this layer is connected, set it up.
-    LOG(INFO) << "Setting up " << layer_names_[layer_id];
-    layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
-    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-      if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
-        blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
-      }
-      blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
-      LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
-      if (layer->loss(top_id)) {
-        LOG(INFO) << "    with loss weight " << layer->loss(top_id);
-      }
-      memory_used_ += top_vecs_[layer_id][top_id]->count();
-    }
-    DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-    const int param_size = layer_param.param_size();
-    const int num_param_blobs = layers_[layer_id]->blobs().size();
-    CHECK_LE(param_size, num_param_blobs)
-        << "Too many params specified for layer " << layer_param.name();
-    ParamSpec default_param_spec;
-    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
-      const ParamSpec* param_spec = (param_id < param_size) ?
-          &layer_param.param(param_id) : &default_param_spec;
-      const bool param_need_backward = param_spec->lr_mult() > 0;
-      need_backward |= param_need_backward;
-      layers_[layer_id]->set_param_propagate_down(param_id,
-                                                  param_need_backward);
-    }
-    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
-      AppendParam(param, layer_id, param_id);
-    }
-    // Finally, set the backward flag
-    layer_need_backward_.push_back(need_backward);
-    if (need_backward) {
-      for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) {
-        blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true;
-      }
-    }
-  }
-  // Go through the net backwards to determine which blobs contribute to the
-  // loss.  We can skip backward computation for blobs that don't contribute
-  // to the loss.
-  // Also checks if all bottom blobs don't need backward computation (possible
-  // because the skip_propagate_down param) and so we can skip bacward
-  // computation for the entire layer
-  set<string> blobs_under_loss;
-  set<string> blobs_skip_backp;
-  for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) {
-    bool layer_contributes_loss = false;
-    bool layer_skip_propagate_down = true;
-    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-      const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
-      if (layers_[layer_id]->loss(top_id) ||
-          (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
-        layer_contributes_loss = true;
-      }
-      if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) {
-        layer_skip_propagate_down = false;
-      }
-      if (layer_contributes_loss && !layer_skip_propagate_down)
-        break;
-    }
-    // If this layer can skip backward computation, also all his bottom blobs
-    // don't need backpropagation
-    if (layer_need_backward_[layer_id] && layer_skip_propagate_down) {
-      layer_need_backward_[layer_id] = false;
-      for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
-               ++bottom_id) {
-        bottom_need_backward_[layer_id][bottom_id] = false;
-      }
-    }
-    if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; }
-    if (layer_need_backward_[layer_id]) {
-      LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
-    } else {
-      LOG(INFO) << layer_names_[layer_id]
-                << " does not need backward computation.";
-    }
-    for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
-         ++bottom_id) {
-      if (layer_contributes_loss) {
-        const string& blob_name =
-            blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-        blobs_under_loss.insert(blob_name);
-      } else {
-        bottom_need_backward_[layer_id][bottom_id] = false;
-      }
-      if (!bottom_need_backward_[layer_id][bottom_id]) {
-        const string& blob_name =
-                   blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-        blobs_skip_backp.insert(blob_name);
-      }
-    }
-  }
-  // Handle force_backward if needed.
-  if (param.force_backward()) {
-    for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
-      layer_need_backward_[layer_id] = true;
-      for (int bottom_id = 0;
-           bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
-        bottom_need_backward_[layer_id][bottom_id] =
-            bottom_need_backward_[layer_id][bottom_id] ||
-            layers_[layer_id]->AllowForceBackward(bottom_id);
-        blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] =
-            blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] ||
-            bottom_need_backward_[layer_id][bottom_id];
-      }
-      for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-           ++param_id) {
-        layers_[layer_id]->set_param_propagate_down(param_id, true);
-      }
-    }
-  }
-  // In the end, all remaining blobs are considered output blobs.
-  for (set<string>::iterator it = available_blobs.begin();
-      it != available_blobs.end(); ++it) {
-    LOG(INFO) << "This network produces output " << *it;
-    net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
-    net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
-  }
-  for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) {
-    blob_names_index_[blob_names_[blob_id]] = blob_id;
-  }
-  for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) {
-    layer_names_index_[layer_names_[layer_id]] = layer_id;
-  }
-  GetLearningRateAndWeightDecay();
-  debug_info_ = param.debug_info();
-  LOG(INFO) << "Network initialization done.";
-  LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-}
-
-template <typename Dtype>
+	// Set phase from the state.
+	phase_ = in_param.state().phase();
+	// Filter layers based on their include/exclude rules and
+	// the current NetState.
+	NetParameter filtered_param;
+	FilterNet(in_param, &filtered_param);
+	LOG(INFO) << "Initializing net from parameters: " << std::endl
+		<< filtered_param.DebugString();
+	// Create a copy of filtered_param with splits added where necessary.
+	NetParameter param;
+	InsertSplits(filtered_param, &param);
+	// Basically, build all the layers and set up their connections.
+	name_ = param.name();
+	map<string, int> blob_name_to_idx;
+	set < string > available_blobs;
+	CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0)
+		<< "Must specify either input_shape OR deprecated input_dim, not both.";
+	if (param.input_dim_size() > 0) {
+		// Deprecated 4D dimensions.
+		CHECK_EQ(param.input_size() * 4, param.input_dim_size())
+			<< "Incorrect input blob dimension specifications.";
+	} else {
+		CHECK_EQ(param.input_size(), param.input_shape_size())
+			<< "Exactly one input_shape must be specified per input.";
+	}
+	memory_used_ = 0;
+	// set the input blobs
+	for (int input_id = 0; input_id < param.input_size(); ++input_id) {
+		const int layer_id = -1;  // inputs have fake layer ID -1
+		AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
+	}
+	DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+	// For each layer, set up its input and output
+	bottom_vecs_.resize(param.layer_size());
+	top_vecs_.resize(param.layer_size());
+	bottom_id_vecs_.resize(param.layer_size());
+	param_id_vecs_.resize(param.layer_size());
+	top_id_vecs_.resize(param.layer_size());
+	bottom_need_backward_.resize(param.layer_size());
+	for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
+		// Inherit phase from net if unset.
+		if (!param.layer(layer_id).has_phase()) {
+			param.mutable_layer(layer_id)->set_phase(phase_);
+		}
+		// Setup layer.
+		const LayerParameter& layer_param = param.layer(layer_id);
+		if (layer_param.propagate_down_size() > 0) {
+			CHECK_EQ(layer_param.propagate_down_size(),
+				layer_param.bottom_size())
+				<< "propagate_down param must be specified "
+				<< "either 0 or bottom_size times ";
+		}
+		layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param));
+		layer_names_.push_back(layer_param.name());
+		LOG(INFO) << "Creating Layer " << layer_param.name();
+		bool need_backward = false;
+
+		// Figure out this layer's input and output
+		for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
+			++bottom_id) {
+			const int blob_id = AppendBottom(param, layer_id, bottom_id,
+				&available_blobs, &blob_name_to_idx);
+			// If a blob needs backward, this layer should provide it.
+			need_backward |= blob_need_backward_[blob_id];
+		}
+		int num_top = layer_param.top_size();
+		for (int top_id = 0; top_id < num_top; ++top_id) {
+			AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx);
+		}
+		// If the layer specifies that AutoTopBlobs() -> true and the LayerParameter
+		// specified fewer than the required number (as specified by
+		// ExactNumTopBlobs() or MinTopBlobs()), allocate them here.
+		Layer < Dtype > *layer = layers_[layer_id].get();
+		if (layer->AutoTopBlobs()) {
+			const int needed_num_top =
+				std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
+			for (; num_top < needed_num_top; ++num_top) {
+				// Add "anonymous" top blobs -- do not modify available_blobs or
+				// blob_name_to_idx as we don't want these blobs to be usable as input
+				// to other layers.
+				AppendTop(param, layer_id, num_top, NULL, NULL);
+			}
+		}
+		// After this layer is connected, set it up.
+		LOG(INFO) << "Setting up " << layer_names_[layer_id];
+		layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
+		for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+			if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
+				blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
+			}
+			blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
+			LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
+			if (layer->loss(top_id)) {
+				LOG(INFO) << "    with loss weight " << layer->loss(top_id);
+			}
+			memory_used_ += top_vecs_[layer_id][top_id]->count();
+		}
+		DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+		const int param_size = layer_param.param_size();
+		const int num_param_blobs = layers_[layer_id]->blobs().size();
+		CHECK_LE(param_size, num_param_blobs)
+			<< "Too many params specified for layer " << layer_param.name();
+		ParamSpec default_param_spec;
+		for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
+			const ParamSpec* param_spec =
+				(param_id < param_size) ?
+																	&layer_param.param(param_id) :
+																	&default_param_spec;
+			const bool param_need_backward = param_spec->lr_mult() > 0;
+			need_backward |= param_need_backward;
+			layers_[layer_id]->set_param_propagate_down(param_id,
+				param_need_backward);
+		}
+		for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
+			AppendParam(param, layer_id, param_id);
+		}
+		// Finally, set the backward flag
+		layer_need_backward_.push_back(need_backward);
+		if (need_backward) {
+			for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) {
+				blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true;
+			}
+		}
+	}
+	// Go through the net backwards to determine which blobs contribute to the
+	// loss.  We can skip backward computation for blobs that don't contribute
+	// to the loss.
+	// Also checks if all bottom blobs don't need backward computation (possible
+	// because the skip_propagate_down param) and so we can skip bacward
+	// computation for the entire layer
+	set < string > blobs_under_loss;
+	set < string > blobs_skip_backp;
+	for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) {
+		bool layer_contributes_loss = false;
+		bool layer_skip_propagate_down = true;
+		for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+			const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
+			if (layers_[layer_id]->loss(top_id) ||
+				(blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
+				layer_contributes_loss = true;
+			}
+			if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) {
+				layer_skip_propagate_down = false;
+			}
+			if (layer_contributes_loss && !layer_skip_propagate_down)
+				break;
+		}
+		// If this layer can skip backward computation, also all his bottom blobs
+		// don't need backpropagation
+		if (layer_need_backward_[layer_id] && layer_skip_propagate_down) {
+			layer_need_backward_[layer_id] = false;
+			for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
+				++bottom_id) {
+				bottom_need_backward_[layer_id][bottom_id] = false;
+			}
+		}
+		if (!layer_contributes_loss) {
+			layer_need_backward_[layer_id] = false;
+		}
+		if (layer_need_backward_[layer_id]) {
+			LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
+		} else {
+			LOG(INFO) << layer_names_[layer_id]
+				<< " does not need backward computation.";
+		}
+		for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
+			++bottom_id) {
+			if (layer_contributes_loss) {
+				const string& blob_name =
+					blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+				blobs_under_loss.insert(blob_name);
+			} else {
+				bottom_need_backward_[layer_id][bottom_id] = false;
+			}
+			if (!bottom_need_backward_[layer_id][bottom_id]) {
+				const string& blob_name =
+					blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+				blobs_skip_backp.insert(blob_name);
+			}
+		}
+	}
+	// Handle force_backward if needed.
+	if (param.force_backward()) {
+		for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
+			layer_need_backward_[layer_id] = true;
+			for (int bottom_id = 0;
+				bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
+				bottom_need_backward_[layer_id][bottom_id] =
+					bottom_need_backward_[layer_id][bottom_id] ||
+						layers_[layer_id]->AllowForceBackward(bottom_id);
+				blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] =
+					blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] ||
+						bottom_need_backward_[layer_id][bottom_id];
+			}
+			for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+				++param_id) {
+				layers_[layer_id]->set_param_propagate_down(param_id, true);
+			}
+		}
+	}
+	// In the end, all remaining blobs are considered output blobs.
+	for (set<string>::iterator it = available_blobs.begin();
+		it != available_blobs.end(); ++it) {
+		LOG(INFO) << "This network produces output " << *it;
+		net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
+		net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
+	}
+	for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) {
+		blob_names_index_[blob_names_[blob_id]] = blob_id;
+	}
+	for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) {
+		layer_names_index_[layer_names_[layer_id]] = layer_id;
+	}
+	GetLearningRateAndWeightDecay();
+	debug_info_ = param.debug_info();
+	LOG(INFO) << "Network initialization done.";
+	LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+}
+
+template<typename Dtype>
 void Net<Dtype>::FilterNet(const NetParameter& param,
-    NetParameter* param_filtered) {
-  NetState net_state(param.state());
-  param_filtered->CopyFrom(param);
-  param_filtered->clear_layer();
-  for (int i = 0; i < param.layer_size(); ++i) {
-    const LayerParameter& layer_param = param.layer(i);
-    const string& layer_name = layer_param.name();
-    CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0)
-          << "Specify either include rules or exclude rules; not both.";
-    // If no include rules are specified, the layer is included by default and
-    // only excluded if it meets one of the exclude rules.
-    bool layer_included = (layer_param.include_size() == 0);
-    for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) {
-      if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) {
-        layer_included = false;
-      }
-    }
-    for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) {
-      if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) {
-        layer_included = true;
-      }
-    }
-    if (layer_included) {
-      param_filtered->add_layer()->CopyFrom(layer_param);
-    }
-  }
-}
-
-template <typename Dtype>
+	NetParameter* param_filtered) {
+	NetState net_state(param.state());
+	param_filtered->CopyFrom(param);
+	param_filtered->clear_layer();
+	for (int i = 0; i < param.layer_size(); ++i) {
+		const LayerParameter& layer_param = param.layer(i);
+		const string& layer_name = layer_param.name();
+		CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0)
+			<< "Specify either include rules or exclude rules; not both.";
+		// If no include rules are specified, the layer is included by default and
+		// only excluded if it meets one of the exclude rules.
+		bool layer_included = (layer_param.include_size() == 0);
+		for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) {
+			if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) {
+				layer_included = false;
+			}
+		}
+		for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) {
+			if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) {
+				layer_included = true;
+			}
+		}
+		if (layer_included) {
+			param_filtered->add_layer()->CopyFrom(layer_param);
+		}
+	}
+}
+
+template<typename Dtype>
 bool Net<Dtype>::StateMeetsRule(const NetState& state,
-    const NetStateRule& rule, const string& layer_name) {
-  // Check whether the rule is broken due to phase.
-  if (rule.has_phase()) {
-      if (rule.phase() != state.phase()) {
-        LOG(INFO) << "The NetState phase (" << state.phase()
-          << ") differed from the phase (" << rule.phase()
-          << ") specified by a rule in layer " << layer_name;
-        return false;
-      }
-  }
-  // Check whether the rule is broken due to min level.
-  if (rule.has_min_level()) {
-    if (state.level() < rule.min_level()) {
-      LOG(INFO) << "The NetState level (" << state.level()
-          << ") is above the min_level (" << rule.min_level()
-          << ") specified by a rule in layer " << layer_name;
-      return false;
-    }
-  }
-  // Check whether the rule is broken due to max level.
-  if (rule.has_max_level()) {
-    if (state.level() > rule.max_level()) {
-      LOG(INFO) << "The NetState level (" << state.level()
-          << ") is above the max_level (" << rule.max_level()
-          << ") specified by a rule in layer " << layer_name;
-      return false;
-    }
-  }
-  // Check whether the rule is broken due to stage. The NetState must
-  // contain ALL of the rule's stages to meet it.
-  for (int i = 0; i < rule.stage_size(); ++i) {
-    // Check that the NetState contains the rule's ith stage.
-    bool has_stage = false;
-    for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
-      if (rule.stage(i) == state.stage(j)) { has_stage = true; }
-    }
-    if (!has_stage) {
-      LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
-                << "' specified by a rule in layer " << layer_name;
-      return false;
-    }
-  }
-  // Check whether the rule is broken due to not_stage. The NetState must
-  // contain NONE of the rule's not_stages to meet it.
-  for (int i = 0; i < rule.not_stage_size(); ++i) {
-    // Check that the NetState contains the rule's ith not_stage.
-    bool has_stage = false;
-    for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
-      if (rule.not_stage(i) == state.stage(j)) { has_stage = true; }
-    }
-    if (has_stage) {
-      LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
-                << "' specified by a rule in layer " << layer_name;
-      return false;
-    }
-  }
-  return true;
+	const NetStateRule& rule, const string& layer_name) {
+	// Check whether the rule is broken due to phase.
+	if (rule.has_phase()) {
+		if (rule.phase() != state.phase()) {
+			LOG(INFO) << "The NetState phase (" << state.phase()
+				<< ") differed from the phase (" << rule.phase()
+				<< ") specified by a rule in layer " << layer_name;
+			return false;
+		}
+	}
+	// Check whether the rule is broken due to min level.
+	if (rule.has_min_level()) {
+		if (state.level() < rule.min_level()) {
+			LOG(INFO) << "The NetState level (" << state.level()
+				<< ") is above the min_level (" << rule.min_level()
+				<< ") specified by a rule in layer " << layer_name;
+			return false;
+		}
+	}
+	// Check whether the rule is broken due to max level.
+	if (rule.has_max_level()) {
+		if (state.level() > rule.max_level()) {
+			LOG(INFO) << "The NetState level (" << state.level()
+				<< ") is above the max_level (" << rule.max_level()
+				<< ") specified by a rule in layer " << layer_name;
+			return false;
+		}
+	}
+	// Check whether the rule is broken due to stage. The NetState must
+	// contain ALL of the rule's stages to meet it.
+	for (int i = 0; i < rule.stage_size(); ++i) {
+		// Check that the NetState contains the rule's ith stage.
+		bool has_stage = false;
+		for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
+			if (rule.stage(i) == state.stage(j)) {
+				has_stage = true;
+			}
+		}
+		if (!has_stage) {
+			LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
+				<< "' specified by a rule in layer " << layer_name;
+			return false;
+		}
+	}
+	// Check whether the rule is broken due to not_stage. The NetState must
+	// contain NONE of the rule's not_stages to meet it.
+	for (int i = 0; i < rule.not_stage_size(); ++i) {
+		// Check that the NetState contains the rule's ith not_stage.
+		bool has_stage = false;
+		for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
+			if (rule.not_stage(i) == state.stage(j)) {
+				has_stage = true;
+			}
+		}
+		if (has_stage) {
+			LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
+				<< "' specified by a rule in layer " << layer_name;
+			return false;
+		}
+	}
+	return true;
 }
 
 // Helper for Net::Init: add a new input or top blob to the net.  (Inputs have
 // layer_id == -1, tops have layer_id >= 0.)
-template <typename Dtype>
+template<typename Dtype>
 void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
-                           const int top_id, set<string>* available_blobs,
-                           map<string, int>* blob_name_to_idx) {
-  shared_ptr<LayerParameter> layer_param((layer_id >= 0) ?
-    (new LayerParameter(param.layer(layer_id))) : NULL);
-  const string& blob_name = layer_param ?
-      (layer_param->top_size() > top_id ?
-          layer_param->top(top_id) : "(automatic)") : param.input(top_id);
-  // Check if we are doing in-place computation
-  if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
-      blob_name == layer_param->bottom(top_id)) {
-    // In-place computation
-    LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
-    top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
-    top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
-  } else if (blob_name_to_idx &&
-             blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
-    // If we are not doing in-place computation but have duplicated blobs,
-    // raise an error.
-    LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
-  } else {
-    // Normal output.
-    if (layer_param) {
-      LOG(INFO) << layer_param->name() << " -> " << blob_name;
-    } else {
-      LOG(INFO) << "Input " << top_id << " -> " << blob_name;
-    }
-    shared_ptr<Blob<Dtype> > blob_pointer(new Blob<Dtype>());
-    const int blob_id = blobs_.size();
-    blobs_.push_back(blob_pointer);
-    blob_names_.push_back(blob_name);
-    blob_need_backward_.push_back(false);
-    if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; }
-    if (layer_id == -1) {
-      // Set the (explicitly specified) dimensions of the input blob.
-      if (param.input_dim_size() > 0) {
-        blob_pointer->Reshape(param.input_dim(top_id * 4),
-                              param.input_dim(top_id * 4 + 1),
-                              param.input_dim(top_id * 4 + 2),
-                              param.input_dim(top_id * 4 + 3));
-      } else {
-        blob_pointer->Reshape(param.input_shape(top_id));
-      }
-      net_input_blob_indices_.push_back(blob_id);
-      net_input_blobs_.push_back(blob_pointer.get());
-    } else {
-      top_id_vecs_[layer_id].push_back(blob_id);
-      top_vecs_[layer_id].push_back(blob_pointer.get());
-    }
-  }
-  if (available_blobs) { available_blobs->insert(blob_name); }
+	const int top_id, set<string>* available_blobs,
+	map<string, int>* blob_name_to_idx) {
+	shared_ptr < LayerParameter
+		> layer_param(
+			(layer_id >= 0) ?
+												(new LayerParameter(param.layer(layer_id))) :
+												NULL);
+	const string& blob_name =
+		layer_param ?
+									(layer_param->top_size() > top_id ?
+																											layer_param->top(top_id) :
+																											"(automatic)") :
+									param.input(top_id);
+	// Check if we are doing in-place computation
+	if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
+		blob_name == layer_param->bottom(top_id)) {
+		// In-place computation
+		LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
+		top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
+		top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
+	} else if (blob_name_to_idx &&
+		blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
+		// If we are not doing in-place computation but have duplicated blobs,
+		// raise an error.
+		LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
+	} else {
+		// Normal output.
+		if (layer_param) {
+			LOG(INFO) << layer_param->name() << " -> " << blob_name;
+		} else {
+			LOG(INFO) << "Input " << top_id << " -> " << blob_name;
+		}
+		shared_ptr < Blob<Dtype> > blob_pointer(new Blob<Dtype>());
+		const int blob_id = blobs_.size();
+		blobs_.push_back(blob_pointer);
+		blob_names_.push_back(blob_name);
+		blob_need_backward_.push_back(false);
+		if (blob_name_to_idx) {
+			(*blob_name_to_idx)[blob_name] = blob_id;
+		}
+		if (layer_id == -1) {
+			// Set the (explicitly specified) dimensions of the input blob.
+			if (param.input_dim_size() > 0) {
+				blob_pointer->Reshape(param.input_dim(top_id * 4),
+					param.input_dim(top_id * 4 + 1),
+					param.input_dim(top_id * 4 + 2),
+					param.input_dim(top_id * 4 + 3));
+			} else {
+				blob_pointer->Reshape(param.input_shape(top_id));
+			}
+			net_input_blob_indices_.push_back(blob_id);
+			net_input_blobs_.push_back(blob_pointer.get());
+		} else {
+			top_id_vecs_[layer_id].push_back(blob_id);
+			top_vecs_[layer_id].push_back(blob_pointer.get());
+		}
+	}
+	if (available_blobs) {
+		available_blobs->insert(blob_name);
+	}
 }
 
 // Helper for Net::Init: add a new bottom blob to the net.
-template <typename Dtype>
+template<typename Dtype>
 int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
-    const int bottom_id, set<string>* available_blobs,
-    map<string, int>* blob_name_to_idx) {
-  const LayerParameter& layer_param = param.layer(layer_id);
-  const string& blob_name = layer_param.bottom(bottom_id);
-  if (available_blobs->find(blob_name) == available_blobs->end()) {
-    LOG(FATAL) << "Unknown blob input " << blob_name
-               << " (at index " << bottom_id << ") to layer " << layer_id;
-  }
-  const int blob_id = (*blob_name_to_idx)[blob_name];
-  LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
-  bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
-  bottom_id_vecs_[layer_id].push_back(blob_id);
-  available_blobs->erase(blob_name);
-  bool propagate_down = true;
-  // Check if the backpropagation on bottom_id should be skipped
-  if (layer_param.propagate_down_size() > 0)
-    propagate_down = layer_param.propagate_down(bottom_id);
-  const bool need_backward = blob_need_backward_[blob_id] &&
-                          propagate_down;
-  bottom_need_backward_[layer_id].push_back(need_backward);
-  return blob_id;
-}
-
-template <typename Dtype>
+	const int bottom_id, set<string>* available_blobs,
+	map<string, int>* blob_name_to_idx) {
+	const LayerParameter& layer_param = param.layer(layer_id);
+	const string& blob_name = layer_param.bottom(bottom_id);
+	if (available_blobs->find(blob_name) == available_blobs->end()) {
+		LOG(FATAL) << "Unknown blob input " << blob_name
+			<< " (at index " << bottom_id << ") to layer " << layer_id;
+	}
+	const int blob_id = (*blob_name_to_idx)[blob_name];
+	LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
+	bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
+	bottom_id_vecs_[layer_id].push_back(blob_id);
+	available_blobs->erase(blob_name);
+	bool propagate_down = true;
+	// Check if the backpropagation on bottom_id should be skipped
+	if (layer_param.propagate_down_size() > 0)
+		propagate_down = layer_param.propagate_down(bottom_id);
+	const bool need_backward = blob_need_backward_[blob_id] &&
+		propagate_down;
+	bottom_need_backward_[layer_id].push_back(need_backward);
+	return blob_id;
+}
+
+template<typename Dtype>
 void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
-                             const int param_id) {
-  const LayerParameter& layer_param = layers_[layer_id]->layer_param();
-  const int param_size = layer_param.param_size();
-  string param_name =
-      (param_size > param_id) ? layer_param.param(param_id).name() : "";
-  if (param_name.size()) {
-    param_display_names_.push_back(param_name);
-  } else {
-    ostringstream param_display_name;
-    param_display_name << param_id;
-    param_display_names_.push_back(param_display_name.str());
-  }
-  const int net_param_id = params_.size();
-  params_.push_back(layers_[layer_id]->blobs()[param_id]);
-  param_id_vecs_[layer_id].push_back(net_param_id);
-  param_layer_indices_.push_back(make_pair(layer_id, param_id));
-  if (!param_size || !param_name.size() || (param_name.size() &&
-      param_names_index_.find(param_name) == param_names_index_.end())) {
-    // This layer "owns" this parameter blob -- it is either anonymous
-    // (i.e., not given a param_name) or explicitly given a name that we
-    // haven't already seen.
-    param_owners_.push_back(-1);
-    if (param_name.size()) {
-      param_names_index_[param_name] = net_param_id;
-    }
-  } else {
-    // Named param blob with name we've seen before: share params
-    const int owner_net_param_id = param_names_index_[param_name];
-    param_owners_.push_back(owner_net_param_id);
-    const pair<int, int>& owner_index =
-        param_layer_indices_[owner_net_param_id];
-    const int owner_layer_id = owner_index.first;
-    const int owner_param_id = owner_index.second;
-    LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
-              << "layer '" << layer_names_[owner_layer_id] << "', param "
-              << "index " << owner_param_id;
-    Blob<Dtype>* this_blob = layers_[layer_id]->blobs()[param_id].get();
-    Blob<Dtype>* owner_blob =
-        layers_[owner_layer_id]->blobs()[owner_param_id].get();
-    const int param_size = layer_param.param_size();
-    if (param_size > param_id && (layer_param.param(param_id).share_mode() ==
-                                  ParamSpec_DimCheckMode_PERMISSIVE)) {
-      // Permissive dimension checking -- only check counts are the same.
-      CHECK_EQ(this_blob->count(), owner_blob->count())
-          << "Shared parameter blobs must have the same count.";
-    } else {
-      // Strict dimension checking -- all dims must be the same.
-      CHECK(this_blob->shape() == owner_blob->shape());
-    }
-    layers_[layer_id]->blobs()[param_id]->ShareData(
-        *layers_[owner_layer_id]->blobs()[owner_param_id]);
-  }
-}
-
-template <typename Dtype>
+	const int param_id) {
+	const LayerParameter& layer_param = layers_[layer_id]->layer_param();
+	const int param_size = layer_param.param_size();
+	string param_name =
+		(param_size > param_id) ? layer_param.param(param_id).name() : "";
+	if (param_name.size()) {
+		param_display_names_.push_back(param_name);
+	} else {
+		ostringstream param_display_name;
+		param_display_name << param_id;
+		param_display_names_.push_back(param_display_name.str());
+	}
+	const int net_param_id = params_.size();
+	params_.push_back(layers_[layer_id]->blobs()[param_id]);
+	param_id_vecs_[layer_id].push_back(net_param_id);
+	param_layer_indices_.push_back(make_pair(layer_id, param_id));
+	if (!param_size || !param_name.size() || (param_name.size() &&
+		param_names_index_.find(param_name) == param_names_index_.end())) {
+		// This layer "owns" this parameter blob -- it is either anonymous
+		// (i.e., not given a param_name) or explicitly given a name that we
+		// haven't already seen.
+		param_owners_.push_back(-1);
+		if (param_name.size()) {
+			param_names_index_[param_name] = net_param_id;
+		}
+	} else {
+		// Named param blob with name we've seen before: share params
+		const int owner_net_param_id = param_names_index_[param_name];
+		param_owners_.push_back(owner_net_param_id);
+		const pair<int, int>& owner_index =
+			param_layer_indices_[owner_net_param_id];
+		const int owner_layer_id = owner_index.first;
+		const int owner_param_id = owner_index.second;
+		LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
+			<< "layer '" << layer_names_[owner_layer_id] << "', param "
+			<< "index " << owner_param_id;
+		Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get();
+		Blob < Dtype > *owner_blob =
+			layers_[owner_layer_id]->blobs()[owner_param_id].get();
+		const int param_size = layer_param.param_size();
+		if (param_size > param_id && (layer_param.param(param_id).share_mode() ==
+			ParamSpec_DimCheckMode_PERMISSIVE)) {
+			// Permissive dimension checking -- only check counts are the same.
+			CHECK_EQ(this_blob->count(), owner_blob->count())
+				<< "Shared parameter blobs must have the same count.";
+		} else {
+			// Strict dimension checking -- all dims must be the same.
+			CHECK(this_blob->shape() == owner_blob->shape());
+		}
+		layers_[layer_id]->blobs()[param_id]->ShareData(
+			*layers_[owner_layer_id]->blobs()[owner_param_id]);
+	}
+}
+
+template<typename Dtype>
 void Net<Dtype>::GetLearningRateAndWeightDecay() {
-  LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
-  ParamSpec default_param_spec;
-  for (int i = 0; i < layers_.size(); ++i) {
-    vector<shared_ptr<Blob<Dtype> > >& layer_blobs = layers_[i]->blobs();
-    for (int j = 0; j < layer_blobs.size(); ++j) {
-      const ParamSpec* param_spec =
-          (layers_[i]->layer_param().param_size() > j) ?
-          &layers_[i]->layer_param().param(j) : &default_param_spec;
-      params_lr_.push_back(param_spec->lr_mult());
-      params_weight_decay_.push_back(param_spec->decay_mult());
-    }
-  }
-}
-
-template <typename Dtype>
+	LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
+	ParamSpec default_param_spec;
+	for (int i = 0; i < layers_.size(); ++i) {
+		vector < shared_ptr<Blob<Dtype> > > &layer_blobs = layers_[i]->blobs();
+		for (int j = 0; j < layer_blobs.size(); ++j) {
+			const ParamSpec* param_spec =
+				(layers_[i]->layer_param().param_size() > j) ?
+					&layers_[i]->layer_param().param(j) : &default_param_spec;
+			params_lr_.push_back(param_spec->lr_mult());
+			params_weight_decay_.push_back(param_spec->decay_mult());
+		}
+	}
+}
+
+template<typename Dtype>
 Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
-  CHECK_GE(start, 0);
-  CHECK_LT(end, layers_.size());
-  Dtype loss = 0;
-  if (debug_info_) {
-    for (int i = 0; i < net_input_blobs_.size(); ++i) {
-      InputDebugInfo(i);
-    }
-  }
-
-  CPUTimer forward_timer;
-  CPUTimer layer_timer;
-  forward_timer.Start();
-
-  for (int i = start; i <= end; ++i) {
-    layer_timer.Start();
-    Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
-    loss += layer_loss;
-    if (debug_info_) { ForwardDebugInfo(i); }
-    clFinish(amdDevice.CommandQueue);
-    layer_timer.Stop();
-    printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds());
-  }
-
-  forward_timer.Stop();
-  printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds());
-
-  return loss;
-}
-
-template <typename Dtype>
+	CHECK_GE(start, 0);
+	CHECK_LT(end, layers_.size());
+	Dtype loss = 0;
+	if (debug_info_) {
+		for (int i = 0; i < net_input_blobs_.size(); ++i) {
+			InputDebugInfo(i);
+		}
+	}
+
+	CPUTimer forward_timer;
+	CPUTimer layer_timer;
+	forward_timer.Start();
+
+	for (int i = start; i <= end; ++i) {
+		layer_timer.Start();
+		Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
+		loss += layer_loss;
+		if (debug_info_) {
+			ForwardDebugInfo(i);
+		}
+		clFinish(amdDevice.CommandQueue);
+		layer_timer.Stop();
+		printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
+			layer_timer.MilliSeconds());
+	}
+
+	forward_timer.Stop();
+	printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds());
+
+	return loss;
+}
+
+template<typename Dtype>
 Dtype Net<Dtype>::ForwardFrom(int start) {
-  return ForwardFromTo(start, layers_.size() - 1);
+	return ForwardFromTo(start, layers_.size() - 1);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype Net<Dtype>::ForwardTo(int end) {
-  return ForwardFromTo(0, end);
+	return ForwardFromTo(0, end);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 const vector<Blob<Dtype>*>& Net<Dtype>::ForwardPrefilled(Dtype* loss) {
-  if (loss != NULL) {
-    *loss = ForwardFromTo(0, layers_.size() - 1);
-  } else {
-    ForwardFromTo(0, layers_.size() - 1);
-  }
-  return net_output_blobs_;
+	if (loss != NULL) {
+		*loss = ForwardFromTo(0, layers_.size() - 1);
+	} else {
+		ForwardFromTo(0, layers_.size() - 1);
+	}
+	return net_output_blobs_;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 const vector<Blob<Dtype>*>& Net<Dtype>::Forward(
-    const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
-  // Copy bottom to internal bottom
-  for (int i = 0; i < bottom.size(); ++i) {
-    net_input_blobs_[i]->CopyFrom(*bottom[i]);
-  }
-  return ForwardPrefilled(loss);
+	const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
+	// Copy bottom to internal bottom
+	for (int i = 0; i < bottom.size(); ++i) {
+		net_input_blobs_[i]->CopyFrom(*bottom[i]);
+	}
+	return ForwardPrefilled(loss);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 string Net<Dtype>::Forward(const string& input_blob_protos, Dtype* loss) {
-  BlobProtoVector blob_proto_vec;
-  if (net_input_blobs_.size()) {
-    blob_proto_vec.ParseFromString(input_blob_protos);
-    CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size())
-        << "Incorrect input size.";
-    for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) {
-      net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i));
-    }
-  }
-  ForwardPrefilled(loss);
-  blob_proto_vec.Clear();
-  for (int i = 0; i < net_output_blobs_.size(); ++i) {
-    net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs());
-  }
-  string output;
-  blob_proto_vec.SerializeToString(&output);
-  return output;
-}
-
-template <typename Dtype>
+	BlobProtoVector blob_proto_vec;
+	if (net_input_blobs_.size()) {
+		blob_proto_vec.ParseFromString(input_blob_protos);
+		CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size())
+			<< "Incorrect input size.";
+		for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) {
+			net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i));
+		}
+	}
+	ForwardPrefilled(loss);
+	blob_proto_vec.Clear();
+	for (int i = 0; i < net_output_blobs_.size(); ++i) {
+		net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs());
+	}
+	string output;
+	blob_proto_vec.SerializeToString(&output);
+	return output;
+}
+
+template<typename Dtype>
 void Net<Dtype>::BackwardFromTo(int start, int end) {
-  CHECK_GE(end, 0);
-  CHECK_LT(start, layers_.size());
-  
-  CPUTimer backward_timer;
-  CPUTimer layer_timer;
-  backward_timer.Start();
-
-  for (int i = start; i >= end; --i) {
-    layer_timer.Start();
-    if (layer_need_backward_[i]) {
-      layers_[i]->Backward(
-          top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
-      if (debug_info_) { BackwardDebugInfo(i); }
-    clFinish(amdDevice.CommandQueue);
-    layer_timer.Start();
-    printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), layer_timer.MilliSeconds());
-    }
-  }
-
-  backward_timer.Stop();
-  printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds());
-}
-
-template <typename Dtype>
+	CHECK_GE(end, 0);
+	CHECK_LT(start, layers_.size());
+
+	CPUTimer backward_timer;
+	CPUTimer layer_timer;
+	backward_timer.Start();
+
+	for (int i = start; i >= end; --i) {
+		layer_timer.Start();
+		if (layer_need_backward_[i]) {
+			layers_[i]->Backward(
+				top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
+			if (debug_info_) {
+				BackwardDebugInfo(i);
+			}
+			clFinish(amdDevice.CommandQueue);
+			layer_timer.Start();
+			printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
+				layer_timer.MilliSeconds());
+		}
+	}
+
+	backward_timer.Stop();
+	printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds());
+}
+
+template<typename Dtype>
 void Net<Dtype>::InputDebugInfo(const int input_id) {
-  const Blob<Dtype>& blob = *net_input_blobs_[input_id];
-  const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
-  const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-  LOG(INFO) << "    [Forward] "
-     << "Input " << blob_name << " data: " << data_abs_val_mean;
+	const Blob<Dtype>& blob = *net_input_blobs_[input_id];
+	const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
+	const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+	LOG(INFO) << "    [Forward] "
+		<< "Input " << blob_name << " data: " << data_abs_val_mean;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
-  for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-    const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
-    const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
-    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Forward] "
-       << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
-       << " data: " << data_abs_val_mean;
-  }
-  for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-       ++param_id) {
-    const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
-    const int net_param_id = param_id_vecs_[layer_id][param_id];
-    const string& blob_name = param_display_names_[net_param_id];
-    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Forward] "
-       << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
-       << " data: " << data_abs_val_mean;
-  }
-}
-
-template <typename Dtype>
+	for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+		const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
+		const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
+		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+		LOG(INFO) << "    [Forward] "
+			<< "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
+			<< " data: " << data_abs_val_mean;
+	}
+	for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+		++param_id) {
+		const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
+		const int net_param_id = param_id_vecs_[layer_id][param_id];
+		const string& blob_name = param_display_names_[net_param_id];
+		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+		LOG(INFO) << "    [Forward] "
+			<< "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
+			<< " data: " << data_abs_val_mean;
+	}
+}
+
+template<typename Dtype>
 void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
-  const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
-  for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
-    if (!bottom_need_backward_[layer_id][bottom_id]) { continue; }
-    const Blob<Dtype>& blob = *bottom_vec[bottom_id];
-    const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-    const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    LOG(INFO) << "    [Backward] "
-        << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
-        << " diff: " << diff_abs_val_mean;
-  }
-  for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-       ++param_id) {
-    if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
-    const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
-    const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    LOG(INFO) << "    [Backward] "
-        << "Layer " << layer_names_[layer_id] << ", param blob " << param_id
-        << " diff: " << diff_abs_val_mean;
-  }
-}
-
-template <typename Dtype>
+	const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
+	for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
+		if (!bottom_need_backward_[layer_id][bottom_id]) {
+			continue;
+		}
+		const Blob<Dtype>& blob = *bottom_vec[bottom_id];
+		const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+		const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+		LOG(INFO) << "    [Backward] "
+			<< "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
+			<< " diff: " << diff_abs_val_mean;
+	}
+	for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+		++param_id) {
+		if (!layers_[layer_id]->param_propagate_down(param_id)) {
+			continue;
+		}
+		const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
+		const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+		LOG(INFO) << "    [Backward] "
+			<< "Layer " << layer_names_[layer_id] << ", param blob " << param_id
+			<< " diff: " << diff_abs_val_mean;
+	}
+}
+
+template<typename Dtype>
 void Net<Dtype>::UpdateDebugInfo(const int param_id) {
-  const Blob<Dtype>& blob = *params_[param_id];
-  const int param_owner = param_owners_[param_id];
-  const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
-  const string& param_display_name = param_display_names_[param_id];
-  const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-  if (param_owner < 0) {
-    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Update] Layer " << layer_name
-        << ", param " << param_display_name
-        << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
-  } else {
-    const string& owner_layer_name =
-        layer_names_[param_layer_indices_[param_owner].first];
-    LOG(INFO) << "    [Update] Layer " << layer_name
-        << ", param blob " << param_display_name
-        << " (owned by layer " << owner_layer_name << ", "
-        << "param " << param_display_names_[param_owners_[param_id]] << ")"
-        << " diff: " << diff_abs_val_mean;
-  }
-}
-
-template <typename Dtype>
+	const Blob<Dtype>& blob = *params_[param_id];
+	const int param_owner = param_owners_[param_id];
+	const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
+	const string& param_display_name = param_display_names_[param_id];
+	const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+	if (param_owner < 0) {
+		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+		LOG(INFO) << "    [Update] Layer " << layer_name
+			<< ", param " << param_display_name
+			<< " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
+	} else {
+		const string& owner_layer_name =
+			layer_names_[param_layer_indices_[param_owner].first];
+		LOG(INFO) << "    [Update] Layer " << layer_name
+			<< ", param blob " << param_display_name
+			<< " (owned by layer " << owner_layer_name << ", "
+			<< "param " << param_display_names_[param_owners_[param_id]] << ")"
+			<< " diff: " << diff_abs_val_mean;
+	}
+}
+
+template<typename Dtype>
 void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
-  int num_source_layers = other->layers().size();
-  for (int i = 0; i < num_source_layers; ++i) {
-    Layer<Dtype>* source_layer = other->layers()[i].get();
-    const string& source_layer_name = other->layer_names()[i];
-    int target_layer_id = 0;
-    while (target_layer_id != layer_names_.size() &&
-        layer_names_[target_layer_id] != source_layer_name) {
-      ++target_layer_id;
-    }
-    if (target_layer_id == layer_names_.size()) {
-      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
-      continue;
-    }
-    DLOG(INFO) << "Copying source layer " << source_layer_name;
-    vector<shared_ptr<Blob<Dtype> > >& target_blobs =
-        layers_[target_layer_id]->blobs();
-    CHECK_EQ(target_blobs.size(), source_layer->blobs().size())
-        << "Incompatible number of blobs for layer " << source_layer_name;
-    for (int j = 0; j < target_blobs.size(); ++j) {
-      Blob<Dtype>* source_blob = source_layer->blobs()[j].get();
-      CHECK(target_blobs[j]->shape() == source_blob->shape());
-      target_blobs[j]->ShareData(*source_blob);
-    }
-  }
-}
-
-template <typename Dtype>
+	int num_source_layers = other->layers().size();
+	for (int i = 0; i < num_source_layers; ++i) {
+		Layer < Dtype > *source_layer = other->layers()[i].get();
+		const string& source_layer_name = other->layer_names()[i];
+		int target_layer_id = 0;
+		while (target_layer_id != layer_names_.size() &&
+			layer_names_[target_layer_id] != source_layer_name) {
+			++target_layer_id;
+		}
+		if (target_layer_id == layer_names_.size()) {
+			DLOG(INFO) << "Ignoring source layer " << source_layer_name;
+			continue;
+		}
+		DLOG(INFO) << "Copying source layer " << source_layer_name;
+		vector < shared_ptr<Blob<Dtype> > > &target_blobs =
+			layers_[target_layer_id]->blobs();
+		CHECK_EQ(target_blobs.size(), source_layer->blobs().size())
+			<< "Incompatible number of blobs for layer " << source_layer_name;
+		for (int j = 0; j < target_blobs.size(); ++j) {
+			Blob < Dtype > *source_blob = source_layer->blobs()[j].get();
+			CHECK(target_blobs[j]->shape() == source_blob->shape());
+			target_blobs[j]->ShareData(*source_blob);
+		}
+	}
+}
+
+template<typename Dtype>
 void Net<Dtype>::BackwardFrom(int start) {
-  BackwardFromTo(start, 0);
+	BackwardFromTo(start, 0);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Net<Dtype>::BackwardTo(int end) {
-  BackwardFromTo(layers_.size() - 1, end);
+	BackwardFromTo(layers_.size() - 1, end);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Net<Dtype>::Backward() {
-  BackwardFromTo(layers_.size() - 1, 0);
-  if (debug_info_) {
-    Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0;
-    for (int i = 0; i < params_.size(); ++i) {
-      if (param_owners_[i] >= 0) { continue; }
-      asum_data += params_[i]->asum_data();
-      asum_diff += params_[i]->asum_diff();
-      sumsq_data += params_[i]->sumsq_data();
-      sumsq_diff += params_[i]->sumsq_diff();
-    }
-    const Dtype l2norm_data = std::sqrt(sumsq_data);
-    const Dtype l2norm_diff = std::sqrt(sumsq_diff);
-    LOG(ERROR) << "    [Backward] All net params (data, diff): "
-        << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
-        << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
-  }
-}
-
-template <typename Dtype>
+	BackwardFromTo(layers_.size() - 1, 0);
+	if (debug_info_) {
+		Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0;
+		for (int i = 0; i < params_.size(); ++i) {
+			if (param_owners_[i] >= 0) {
+				continue;
+			}
+			asum_data += params_[i]->asum_data();
+			asum_diff += params_[i]->asum_diff();
+			sumsq_data += params_[i]->sumsq_data();
+			sumsq_diff += params_[i]->sumsq_diff();
+		}
+		const Dtype l2norm_data = std::sqrt(sumsq_data);
+		const Dtype l2norm_diff = std::sqrt(sumsq_diff);
+		LOG(ERROR) << "    [Backward] All net params (data, diff): "
+			<< "L1 norm = (" << asum_data << ", " << asum_diff << "); "
+			<< "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
+	}
+}
+
+template<typename Dtype>
 void Net<Dtype>::Reshape() {
-  for (int i = 0; i < layers_.size(); ++i) {
-    layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
-  }
+	for (int i = 0; i < layers_.size(); ++i) {
+		layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
-  int num_source_layers = param.layer_size();
-  for (int i = 0; i < num_source_layers; ++i) {
-    const LayerParameter& source_layer = param.layer(i);
-    const string& source_layer_name = source_layer.name();
-    int target_layer_id = 0;
-    while (target_layer_id != layer_names_.size() &&
-        layer_names_[target_layer_id] != source_layer_name) {
-      ++target_layer_id;
-    }
-    if (target_layer_id == layer_names_.size()) {
-      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
-      continue;
-    }
-    DLOG(INFO) << "Copying source layer " << source_layer_name;
-    vector<shared_ptr<Blob<Dtype> > >& target_blobs =
-        layers_[target_layer_id]->blobs();
-    CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
-        << "Incompatible number of blobs for layer " << source_layer_name;
-    for (int j = 0; j < target_blobs.size(); ++j) {
-      const bool kReshape = false;
-      target_blobs[j]->FromProto(source_layer.blobs(j), kReshape);
-    }
-  }
-}
-
-template <typename Dtype>
+	int num_source_layers = param.layer_size();
+	for (int i = 0; i < num_source_layers; ++i) {
+		const LayerParameter& source_layer = param.layer(i);
+		const string& source_layer_name = source_layer.name();
+		int target_layer_id = 0;
+		while (target_layer_id != layer_names_.size() &&
+			layer_names_[target_layer_id] != source_layer_name) {
+			++target_layer_id;
+		}
+		if (target_layer_id == layer_names_.size()) {
+			DLOG(INFO) << "Ignoring source layer " << source_layer_name;
+			continue;
+		}
+		DLOG(INFO) << "Copying source layer " << source_layer_name;
+		vector < shared_ptr<Blob<Dtype> > > &target_blobs =
+			layers_[target_layer_id]->blobs();
+		CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
+			<< "Incompatible number of blobs for layer " << source_layer_name;
+		for (int j = 0; j < target_blobs.size(); ++j) {
+			const bool kReshape = false;
+			target_blobs[j]->FromProto(source_layer.blobs(j), kReshape);
+		}
+	}
+}
+
+template<typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
-  NetParameter param;
-  ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
-  CopyTrainedLayersFrom(param);
+	NetParameter param;
+	ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
+	CopyTrainedLayersFrom(param);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
-  param->Clear();
-  param->set_name(name_);
-  // Add bottom and top
-  for (int i = 0; i < net_input_blob_indices_.size(); ++i) {
-    param->add_input(blob_names_[net_input_blob_indices_[i]]);
-  }
-  DLOG(INFO) << "Serializing " << layers_.size() << " layers";
-  for (int i = 0; i < layers_.size(); ++i) {
-    LayerParameter* layer_param = param->add_layer();
-    for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) {
-      layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]);
-    }
-    for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
-      layer_param->add_top(blob_names_[top_id_vecs_[i][j]]);
-    }
-    layers_[i]->ToProto(layer_param, write_diff);
-  }
-}
-
-template <typename Dtype>
+	param->Clear();
+	param->set_name(name_);
+	// Add bottom and top
+	for (int i = 0; i < net_input_blob_indices_.size(); ++i) {
+		param->add_input(blob_names_[net_input_blob_indices_[i]]);
+	}
+	DLOG(INFO) << "Serializing " << layers_.size() << " layers";
+	for (int i = 0; i < layers_.size(); ++i) {
+		LayerParameter* layer_param = param->add_layer();
+		for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) {
+			layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]);
+		}
+		for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
+			layer_param->add_top(blob_names_[top_id_vecs_[i][j]]);
+		}
+		layers_[i]->ToProto(layer_param, write_diff);
+	}
+}
+
+template<typename Dtype>
 void Net<Dtype>::Update() {
-  // First, accumulate the diffs of any shared parameters into their owner's
-  // diff. (Assumes that the learning rate, weight decay, etc. have already been
-  // accounted for in the current diff.)
-  for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] < 0) { continue; }
-    if (debug_info_) { UpdateDebugInfo(i); }
-    const int count = params_[i]->count();
-    const Dtype* this_diff;
-    Dtype* owner_diff;
-      this_diff = params_[i]->cpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();    
-
-    switch (Caffe::mode()) {
-    case Caffe::CPU:
-      this_diff = params_[i]->cpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
-      caffe_add(count, this_diff, owner_diff, owner_diff);
-      break;
-    case Caffe::GPU:
-#ifndef CPU_ONLY
-      this_diff = params_[i]->gpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
-     // caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
-      caffe_gpu_axpy<Dtype>(count, 1.0, this_diff, owner_diff);
+	// First, accumulate the diffs of any shared parameters into their owner's
+	// diff. (Assumes that the learning rate, weight decay, etc. have already been
+	// accounted for in the current diff.)
+	for (int i = 0; i < params_.size(); ++i) {
+		if (param_owners_[i] < 0) {
+			continue;
+		}
+		if (debug_info_) {
+			UpdateDebugInfo(i);
+		}
+		const int count = params_[i]->count();
+		const Dtype* this_diff;
+		Dtype* owner_diff;
+		this_diff = params_[i]->cpu_diff();
+		owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
+
+		switch (Caffe::mode()) {
+			case Caffe::CPU:
+				this_diff = params_[i]->cpu_diff();
+				owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
+				caffe_add(count, this_diff, owner_diff, owner_diff);
+				break;
+			case Caffe::GPU:
+				#ifndef CPU_ONLY
+				this_diff = params_[i]->gpu_diff();
+				owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
+				// caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
+				caffe_gpu_axpy < Dtype > (count, 1.0, this_diff, owner_diff);
 #else
-      NO_GPU;
+				NO_GPU;
 #endif
-      break;
-    default:
-      LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-    }
-  }
-  // Now, update the owned parameters.
-  for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] >= 0) { continue; }
-    if (debug_info_) { UpdateDebugInfo(i); }
-    params_[i]->Update();
-  }
-}
-
-template <typename Dtype>
- bool Net<Dtype>::has_blob(const string& blob_name) const {
-  return blob_names_index_.find(blob_name) != blob_names_index_.end();
-}
-
-template <typename Dtype>
+				break;
+			default:
+				LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+		}
+	}
+	// Now, update the owned parameters.
+	for (int i = 0; i < params_.size(); ++i) {
+		if (param_owners_[i] >= 0) {
+			continue;
+		}
+		if (debug_info_) {
+			UpdateDebugInfo(i);
+		}
+		params_[i]->Update();
+	}
+}
+
+template<typename Dtype>
+bool Net<Dtype>::has_blob(const string& blob_name) const {
+	return blob_names_index_.find(blob_name) != blob_names_index_.end();
+}
+
+template<typename Dtype>
 const shared_ptr<Blob<Dtype> > Net<Dtype>::blob_by_name(
-    const string& blob_name) const {
-  shared_ptr<Blob<Dtype> > blob_ptr;
-  if (has_blob(blob_name)) {
-    blob_ptr = blobs_[blob_names_index_.find(blob_name)->second];
-  } else {
-    blob_ptr.reset((Blob<Dtype>*)(NULL));
-    LOG(WARNING) << "Unknown blob name " << blob_name;
-  }
-  return blob_ptr;
-}
-
-template <typename Dtype>
+	const string& blob_name) const {
+	shared_ptr < Blob<Dtype> > blob_ptr;
+	if (has_blob(blob_name)) {
+		blob_ptr = blobs_[blob_names_index_.find(blob_name)->second];
+	} else {
+		blob_ptr.reset((Blob<Dtype>*) (NULL));
+		LOG(WARNING) << "Unknown blob name " << blob_name;
+	}
+	return blob_ptr;
+}
+
+template<typename Dtype>
 bool Net<Dtype>::has_layer(const string& layer_name) const {
-  return layer_names_index_.find(layer_name) != layer_names_index_.end();
+	return layer_names_index_.find(layer_name) != layer_names_index_.end();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 const shared_ptr<Layer<Dtype> > Net<Dtype>::layer_by_name(
-    const string& layer_name) const {
-  shared_ptr<Layer<Dtype> > layer_ptr;
-  if (has_layer(layer_name)) {
-    layer_ptr = layers_[layer_names_index_.find(layer_name)->second];
-  } else {
-    layer_ptr.reset((Layer<Dtype>*)(NULL));
-    LOG(WARNING) << "Unknown layer name " << layer_name;
-  }
-  return layer_ptr;
-}
-
-INSTANTIATE_CLASS(Net);
+	const string& layer_name) const {
+	shared_ptr < Layer<Dtype> > layer_ptr;
+	if (has_layer(layer_name)) {
+		layer_ptr = layers_[layer_names_index_.find(layer_name)->second];
+	} else {
+		layer_ptr.reset((Layer<Dtype>*) (NULL));
+		LOG(WARNING) << "Unknown layer name " << layer_name;
+	}
+	return layer_ptr;
+}
+
+INSTANTIATE_CLASS (Net);
 
 }  // namespace caffe
diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl
index 03ddba8a..5da76b7e 100644
--- a/src/caffe/ocl/bnll_layer.cl
+++ b/src/caffe/ocl/bnll_layer.cl
@@ -28,25 +28,25 @@
 
 template <class T>
 __kernel void BNLLForward(const int n, __global const T* in, __global T* out) {
-  int index = get_global_id(0);
-  if (index < n) {
-    out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index]));
-  }
+	int index = get_global_id(0);
+	if (index < n) {
+		out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index]));
+	}
 }
 template __attribute__((mangled_name(BNLLForward_float))) __kernel void BNLLForward(const int n, __global const float* in, __global float* out);
 template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLForward(const int n, __global const double* in, __global double* out);
 
 template <class T>
 __kernel void BNLLBackward(const int n, __global const T* in_diff,
-    __global const T* in_data, __global T* out_diff) {
-    int index = get_global_id(0);
-    if (index < n) {
-      T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD)));
-      out_diff[index] = in_diff[index] * expval / (expval + 1.);
-  }
+	__global const T* in_data, __global T* out_diff) {
+	int index = get_global_id(0);
+	if (index < n) {
+		T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD)));
+		out_diff[index] = in_diff[index] * expval / (expval + 1.);
+	}
 }
 
 template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff,
-    __global const float* in_data, __global float* out_diff);
+	__global const float* in_data, __global float* out_diff);
 template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff,
-    __global const double* in_data, __global double* out_diff);
+	__global const double* in_data, __global double* out_diff);
diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl
index 71eb8c77..de504dec 100644
--- a/src/caffe/ocl/concat_layer.cl
+++ b/src/caffe/ocl/concat_layer.cl
@@ -26,29 +26,29 @@
 
 template <class T>
 __kernel void Concat(const int nthreads, __global const T* in_data,
-    const bool forward, const int num_concats, const int concat_size,
-    const int top_concat_axis, const int bottom_concat_axis,
-    const int offset_concat_axis, __global T* out_data) {
-    int index = get_global_id(0);
-    if(index < nthreads) {
-        const int total_concat_size = concat_size * bottom_concat_axis;
-        const int concat_num = index / total_concat_size;
-        const int concat_index = index % total_concat_size;
-        const int top_index = concat_index +
-            (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
-        if (forward) {
-            out_data[top_index] = in_data[index];
-        } else {
-            out_data[index] = in_data[top_index];
-        }
-    }
+	const bool forward, const int num_concats, const int concat_size,
+	const int top_concat_axis, const int bottom_concat_axis,
+	const int offset_concat_axis, __global T* out_data) {
+	int index = get_global_id(0);
+	if(index < nthreads) {
+		const int total_concat_size = concat_size * bottom_concat_axis;
+		const int concat_num = index / total_concat_size;
+		const int concat_index = index % total_concat_size;
+		const int top_index = concat_index +
+		(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
+		if (forward) {
+			out_data[top_index] = in_data[index];
+		} else {
+			out_data[index] = in_data[top_index];
+		}
+	}
 }
 
-template __attribute__((mangled_name(Concat_float))) __kernel void  Concat(const int nthreads, __global const float* in_data,
-    const bool forward, const int num_concats, const int concat_size,
-    const int top_concat_axis, const int bottom_concat_axis,
-    const int offset_concat_axis, __global float* out_data);
-template __attribute__((mangled_name(Concat_double))) __kernel void  Concat(const int nthreads, __global const double* in_data,
-    const bool forward, const int num_concats, const int concat_size,
-    const int top_concat_axis, const int bottom_concat_axis,
-    const int offset_concat_axis, __global double* out_data);
+template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data,
+	const bool forward, const int num_concats, const int concat_size,
+	const int top_concat_axis, const int bottom_concat_axis,
+	const int offset_concat_axis, __global float* out_data);
+template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data,
+	const bool forward, const int num_concats, const int concat_size,
+	const int top_concat_axis, const int bottom_concat_axis,
+	const int offset_concat_axis, __global double* out_data);
diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl
index 8ed18ce4..0aeea80c 100644
--- a/src/caffe/ocl/contrastive_loss_layer.cl
+++ b/src/caffe/ocl/contrastive_loss_layer.cl
@@ -26,39 +26,39 @@
 
 template <class Dtype>
 __kernel void CLLBackward(const int count, const int channels,
-    const Dtype margin, const bool legacy_version, const Dtype alpha,
-    __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq,
-    __global Dtype *bottom_diff) {
-    int i = get_global_id(0);
-    if(i < count) {
-        int n = i / channels;  // the num index, to access y and dist_sq
-        if (static_cast<int>(y[n])) {  // similar pairs
-            bottom_diff[i] = alpha * diff[i];
-        } else {  // dissimilar pairs
-            Dtype mdist(0.0);
-            Dtype beta(0.0);
-            if (legacy_version) {
-                mdist = (margin - dist_sq[n]);
-                beta = -alpha;
-            } else {
-                Dtype dist = sqrt(dist_sq[n]);
-                mdist = (margin - dist);
-                beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
-            }
-            if (mdist > 0.0) {
-                bottom_diff[i] = beta;
-            } else {
-                bottom_diff[i] = 0;
-            }
-       }
-   }
+	const Dtype margin, const bool legacy_version, const Dtype alpha,
+	__global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq,
+	__global Dtype *bottom_diff) {
+	int i = get_global_id(0);
+	if(i < count) {
+		int n = i / channels;  // the num index, to access y and dist_sq
+		if (static_cast<int>(y[n])) {  // similar pairs
+			bottom_diff[i] = alpha * diff[i];
+		} else {  // dissimilar pairs
+			Dtype mdist(0.0);
+			Dtype beta(0.0);
+			if (legacy_version) {
+				mdist = (margin - dist_sq[n]);
+				beta = -alpha;
+			} else {
+				Dtype dist = sqrt(dist_sq[n]);
+				mdist = (margin - dist);
+				beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
+			}
+			if (mdist > 0.0) {
+				bottom_diff[i] = beta;
+			} else {
+				bottom_diff[i] = 0;
+			}
+		}
+	}
 }
 
 template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels,
-    const float margin, const bool legacy_version, const float alpha,
-    __global const float* y, __global const float* diff, __global const float* dist_sq,
-    __global float *bottom_diff);
+	const float margin, const bool legacy_version, const float alpha,
+	__global const float* y, __global const float* diff, __global const float* dist_sq,
+	__global float *bottom_diff);
 template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels,
-    const double margin, const bool legacy_version, const double alpha,
-    __global const double* y, __global const double* diff, __global const double* dist_sq,
-    __global double *bottom_diff);
+	const double margin, const bool legacy_version, const double alpha,
+	__global const double* y, __global const double* diff, __global const double* dist_sq,
+	__global double *bottom_diff);
diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl
index 4bfa39bc..bb2fc696 100644
--- a/src/caffe/ocl/dropout_layer.cl
+++ b/src/caffe/ocl/dropout_layer.cl
@@ -25,20 +25,19 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out){
-    int index = get_global_id(0);
-    if (index < n)
-        out[index] = in[index] * scale * mask[index];
+__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out) {
+	int index = get_global_id(0);
+	if (index < n)
+	out[index] = in[index] * scale * mask[index];
 }
-template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in,  __global const int* mask, const float scale, __global float* out); 
+template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out);
 template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out);
 
-
 template <class T>
-__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff){
-    int index = get_global_id(0);
-    if (index < n)
-        out_diff[index] = in_diff[index] * scale * mask[index];
+__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff) {
+	int index = get_global_id(0);
+	if (index < n)
+	out_diff[index] = in_diff[index] * scale * mask[index];
 }
-template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff,  __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff); 
+template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff);
 template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl
index d843884a..0e1812d8 100644
--- a/src/caffe/ocl/eltwise_layer.cl
+++ b/src/caffe/ocl/eltwise_layer.cl
@@ -26,48 +26,48 @@
 
 template <class Dtype>
 __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a,
-    __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data,
-    __global int* mask) {
-    int index = get_global_id(0);
-    if(index < nthreads) {
-    Dtype maxval = -FLT_MAX;
-    int maxidx = -1;
-    if (bottom_data_a[index] > bottom_data_b[index]) {
-      // only update for very first bottom_data blob (blob_idx == 0)
-      if (blob_idx == 0) {
-        maxval = bottom_data_a[index];
-        top_data[index] = maxval;
-        maxidx = blob_idx;
-        mask[index] = maxidx;
-      }
-    } else {
-      maxval = bottom_data_b[index];
-      top_data[index] = maxval;
-      maxidx = blob_idx + 1;
-      mask[index] = maxidx;
-    }
-  }
+	__global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data,
+	__global int* mask) {
+	int index = get_global_id(0);
+	if(index < nthreads) {
+		Dtype maxval = -FLT_MAX;
+		int maxidx = -1;
+		if (bottom_data_a[index] > bottom_data_b[index]) {
+			// only update for very first bottom_data blob (blob_idx == 0)
+			if (blob_idx == 0) {
+				maxval = bottom_data_a[index];
+				top_data[index] = maxval;
+				maxidx = blob_idx;
+				mask[index] = maxidx;
+			}
+		} else {
+			maxval = bottom_data_b[index];
+			top_data[index] = maxval;
+			maxidx = blob_idx + 1;
+			mask[index] = maxidx;
+		}
+	}
 }
 template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a,
-    __global const float* bottom_data_b, const int blob_idx, __global float* top_data,
-    __global int* mask);
+	__global const float* bottom_data_b, const int blob_idx, __global float* top_data,
+	__global int* mask);
 template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a,
-    __global const double* bottom_data_b, const int blob_idx, __global double* top_data,
-    __global int* mask);
+	__global const double* bottom_data_b, const int blob_idx, __global double* top_data,
+	__global int* mask);
 
 template <class Dtype>
 __kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff,
-    const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) {
-    int index = get_global_id(0);
-    if(index < nthreads) {
-        Dtype gradient = 0;
-        if (mask[index] == blob_idx) {
-            gradient += top_diff[index];
-        }
-        bottom_diff[index] = gradient;
-    }
+	const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) {
+	int index = get_global_id(0);
+	if(index < nthreads) {
+		Dtype gradient = 0;
+		if (mask[index] == blob_idx) {
+			gradient += top_diff[index];
+		}
+		bottom_diff[index] = gradient;
+	}
 }
 template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff,
-    const int blob_idx, __global const int* mask, __global float* bottom_diff);
+	const int blob_idx, __global const int* mask, __global float* bottom_diff);
 template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff,
-    const int blob_idx, __global const int* mask, __global double* bottom_diff);
+	const int blob_idx, __global const int* mask, __global double* bottom_diff);
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index 3e535d5f..c08d1310 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -25,267 +25,266 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset){
-    int index=get_global_id(0);
-    data_im = data_im + img_offset;
-    data_col =  data_col + col_offset;
-    if(index < n){
-        int w_out=index %width_col;
-        index /= width_col;
-        int h_out=index%height_col;
-        int channel_in = index/height_col;
-        int channel_out=channel_in *ksize *ksize;
-        int h_in = h_out *stride-pad;
-        int w_in = w_out *stride-pad;
-        data_col +=(channel_out *height_col + h_out) *width_col + w_out;
-        data_im +=(channel_in * height + h_in) *width + w_in;
-        int i=0,j=0;
-        for(i=0;i<ksize;++i){
-            for(j=0;j<ksize;++j){
-                int h = h_in+i;
-                int w = w_in+j;
-                if(h >= 0 && w >= 0 && h < height && w < width)
-                    *data_col=data_im[i * width + j];
-                else *data_col=0;
-                data_col +=height_col *width_col;
-            }
-        }
-    }
+__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset) {
+	int index=get_global_id(0);
+	data_im = data_im + img_offset;
+	data_col = data_col + col_offset;
+	if(index < n) {
+		int w_out=index %width_col;
+		index /= width_col;
+		int h_out=index%height_col;
+		int channel_in = index/height_col;
+		int channel_out=channel_in *ksize *ksize;
+		int h_in = h_out *stride-pad;
+		int w_in = w_out *stride-pad;
+		data_col +=(channel_out *height_col + h_out) *width_col + w_out;
+		data_im +=(channel_in * height + h_in) *width + w_in;
+		int i=0,j=0;
+		for(i=0;i<ksize;++i) {
+			for(j=0;j<ksize;++j) {
+				int h = h_in+i;
+				int w = w_in+j;
+				if(h >= 0 && w >= 0 && h < height && w < width)
+				*data_col=data_im[i * width + j];
+				else *data_col=0;
+				data_col +=height_col *width_col;
+			}
+		}
+	}
 }
 
-template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset); 
-template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset); 
+template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset);
+template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset);
 
 template <class T>
-__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum){
+__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) {
 
-    int index = get_global_id(0);
+	int index = get_global_id(0);
 
-    data_im = data_im + img_offset;
-    data_col = data_col + col_offset;
+	data_im = data_im + img_offset;
+	data_col = data_col + col_offset;
 
-    int x_out = index % width_col;
-    int y_out = (index / width_col) % height_col;
-    int channel_in = (index / width_col / height_col) % channels;
-    int channel_out = channel_in * ksize * ksize;
-    int im_id = index / width_col / height_col / channels;
+	int x_out = index % width_col;
+	int y_out = (index / width_col) % height_col;
+	int channel_in = (index / width_col / height_col) % channels;
+	int channel_out = channel_in * ksize * ksize;
+	int im_id = index / width_col / height_col / channels;
 
-    int y_in = y_out * stride - pad;
-    int x_in = x_out * stride - pad;
-    int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col;
-    int offset_im = im_id * channels * height * width + channel_in * height * width;
+	int y_in = y_out * stride - pad;
+	int x_in = x_out * stride - pad;
+	int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col;
+	int offset_im = im_id * channels * height * width + channel_in * height * width;
 
-    for(int k_h = 0; k_h < ksize; k_h++){
-        for(int k_w = 0; k_w < ksize; k_w++){
-            int x_im = x_in + k_w;
-            int y_im = y_in + k_h;
-            int index_im = y_im * width + x_im;
-            int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
-            if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
-                data_col[offset_col + index_col] = data_im[offset_im + index_im];
-            else
-                data_col[offset_col + index_col] = 0;
-        }
-    }
+	for(int k_h = 0; k_h < ksize; k_h++) {
+		for(int k_w = 0; k_w < ksize; k_w++) {
+			int x_im = x_in + k_w;
+			int y_im = y_in + k_h;
+			int index_im = y_im * width + x_im;
+			int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
+			if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
+			data_col[offset_col + index_col] = data_im[offset_im + index_im];
+			else
+			data_col[offset_col + index_col] = 0;
+		}
+	}
 }
 
-template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); 
-template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); 
-
+template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum);
+template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum);
 
 template <class T>
 __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    const int height_col, const int width_col,
-    __global T* data_col, const int col_offset) {
-    data_im = data_im + img_offset;
-    data_col = data_col + col_offset;     
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	const int height_col, const int width_col,
+	__global T* data_col, const int col_offset) {
+	data_im = data_im + img_offset;
+	data_col = data_col + col_offset;
 
-    int index = get_global_id(0);
-    if(index < n) {
-        int w_out = index % width_col;
-        int h_index = index / width_col;
-        int h_out = h_index % height_col;
-        int channel_in = h_index / height_col;
-        int channel_out = channel_in * kernel_h * kernel_w;
-        int h_in = h_out * stride_h - pad_h;
-        int w_in = w_out * stride_w - pad_w;
-        __global T* data_col_ptr = data_col;
-        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
-        __global const T* data_im_ptr = data_im;
-        data_im_ptr += (channel_in * height + h_in) * width + w_in;
-        for (int i = 0; i < kernel_h; ++i) {
-            for (int j = 0; j < kernel_w; ++j) {
-                int h = h_in + i;
-                int w = w_in + j;
-                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
-                            data_im_ptr[i * width + j] : 0;
-                data_col_ptr += height_col * width_col;
-        }
-    }
-  }
+	int index = get_global_id(0);
+	if(index < n) {
+		int w_out = index % width_col;
+		int h_index = index / width_col;
+		int h_out = h_index % height_col;
+		int channel_in = h_index / height_col;
+		int channel_out = channel_in * kernel_h * kernel_w;
+		int h_in = h_out * stride_h - pad_h;
+		int w_in = w_out * stride_w - pad_w;
+		__global T* data_col_ptr = data_col;
+		data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+		__global const T* data_im_ptr = data_im;
+		data_im_ptr += (channel_in * height + h_in) * width + w_in;
+		for (int i = 0; i < kernel_h; ++i) {
+			for (int j = 0; j < kernel_w; ++j) {
+				int h = h_in + i;
+				int w = w_in + j;
+				*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+				data_im_ptr[i * width + j] : 0;
+				data_col_ptr += height_col * width_col;
+			}
+		}
+	}
 }
 
 template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
-           const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
-           const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-           const int height_col, const int width_col, __global float* data_col, const int col_offset);
-template __attribute__((mangled_name(im2col_gpu_kernel_double)))  void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
-           const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
-           const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-           const int height_col, const int width_col, __global double* data_col, const int col_offset);
+	const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+	const int height_col, const int width_col, __global float* data_col, const int col_offset);
+template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
+	const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+	const int height_col, const int width_col, __global double* data_col, const int col_offset);
 
 template <class T>
 __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset,
-    const int height, const int width, const int channels,
-    const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    const int height_col, const int width_col,
-    __global T* data_im, const int img_offset) {
-    data_col = data_col + col_offset;
-    data_im = data_im + img_offset;
-   int index = get_global_id(0);
-    if(index < n) {
-        T val = 0;
-        int w = index % width + pad_w;
-        int h = (index / width) % height + pad_h;
-        int c = index / (width * height);
-        // compute the start and end of the output
-        int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-        int w_col_end = min(w / stride_w + 1, width_col);
-        int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-        int h_col_end = min(h / stride_h + 1, height_col);
-        // equivalent implementation
-        int offset =
-            (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
-        int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
-        int coeff_w_col = (1 - stride_w * height_col * width_col);
-        for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-            for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-            }
-        }
-        data_im[index] = val;
-  }
+	const int height, const int width, const int channels,
+	const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	const int height_col, const int width_col,
+	__global T* data_im, const int img_offset) {
+	data_col = data_col + col_offset;
+	data_im = data_im + img_offset;
+	int index = get_global_id(0);
+	if(index < n) {
+		T val = 0;
+		int w = index % width + pad_w;
+		int h = (index / width) % height + pad_h;
+		int c = index / (width * height);
+		// compute the start and end of the output
+		int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+		int w_col_end = min(w / stride_w + 1, width_col);
+		int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+		int h_col_end = min(h / stride_h + 1, height_col);
+		// equivalent implementation
+		int offset =
+		(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+		int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+		int coeff_w_col = (1 - stride_w * height_col * width_col);
+		for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+			for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+				val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+			}
+		}
+		data_im[index] = val;
+	}
 }
 
 template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
-    									const int height, const int width, const int channels,
-    									const int patch_h, const int patch_w,const int pad_h, const int pad_w,
-    									const int stride_h, const int stride_w,const int height_col, const int width_col,
-    									__global float* data_im, const int img_offset);
+	const int height, const int width, const int channels,
+	const int patch_h, const int patch_w,const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,const int height_col, const int width_col,
+	__global float* data_im, const int img_offset);
 template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
-                                         const int col_offset, const int height, const int width, const int channels,
-                                         const int patch_h, const int patch_w, const int pad_h, const int pad_w,
-                                         const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
+	const int col_offset, const int height, const int width, const int channels,
+	const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
 
 template <class T>
-__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset){
-    int index = get_global_id(0);
-    data_col = data_col + col_offset;
-    data_im = data_im + img_offset;
-    if(index < n){
-      T val = 0;
-      int w = index % width + pad;
-      int h = (index / width) % height + pad;
-      int c = index / (width * height);
-      // compute the start and end of the output
-      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-      int w_col_end = min(w / stride + 1, width_col);
-      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-      int h_col_end = min(h / stride + 1, height_col);
-      // equivalent implementation
-      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
-      int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
-      int coeff_w_col = (1 - stride * height_col * width_col);
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-        }
-      }
-      data_im[index] = val;
-  }
+__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset) {
+	int index = get_global_id(0);
+	data_col = data_col + col_offset;
+	data_im = data_im + img_offset;
+	if(index < n) {
+		T val = 0;
+		int w = index % width + pad;
+		int h = (index / width) % height + pad;
+		int c = index / (width * height);
+		// compute the start and end of the output
+		int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+		int w_col_end = min(w / stride + 1, width_col);
+		int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+		int h_col_end = min(h / stride + 1, height_col);
+		// equivalent implementation
+		int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
+		int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
+		int coeff_w_col = (1 - stride * height_col * width_col);
+		for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+			for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+				val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+			}
+		}
+		data_im[index] = val;
+	}
 }
-template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset); 
-template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset); 
+template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset);
+template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset);
 
 template <class T>
-__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum){
-    int index = get_global_id(0);
-    data_col = data_col + col_offset;
-    data_im = data_im + img_offset;
-    if(index < n){
-      T val = 0;
-      int w = index % width + pad;
-      int h = (index / width) % height + pad;
-      int c = index / (width * height) % channels;
-      int im = index / width / height / channels;
-      // compute the start and end of the output
-      int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-      int w_col_end = min(w / stride + 1, width_col);
-      int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-      int h_col_end = min(h / stride + 1, height_col);
-      // equivalent implementation
-      int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
-      int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
-      int coeff_w_col = (1 - stride * height_col * width_col * optnum);
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-        }
-      }
-      data_im[index] = val;
-  }
+__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) {
+	int index = get_global_id(0);
+	data_col = data_col + col_offset;
+	data_im = data_im + img_offset;
+	if(index < n) {
+		T val = 0;
+		int w = index % width + pad;
+		int h = (index / width) % height + pad;
+		int c = index / (width * height) % channels;
+		int im = index / width / height / channels;
+		// compute the start and end of the output
+		int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+		int w_col_end = min(w / stride + 1, width_col);
+		int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+		int h_col_end = min(h / stride + 1, height_col);
+		// equivalent implementation
+		int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
+		int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
+		int coeff_w_col = (1 - stride * height_col * width_col * optnum);
+		for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+			for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+				val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+			}
+		}
+		data_im[index] = val;
+	}
 }
-template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); 
-template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); 
+template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum);
+template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum);
 
 template <class T>
-__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum){
+__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum) {
 
-    int index = get_global_id(0);
-    data_opt = data_opt + opt_offset;
-    data_im = data_im + im_offset;
-    if(index < n){
-      int w = index % width;
-      int h = (index / width) % height;
-      int c = index / (width * height) % channels;
-      int im = index / width / height / channels;
+	int index = get_global_id(0);
+	data_opt = data_opt + opt_offset;
+	data_im = data_im + im_offset;
+	if(index < n) {
+		int w = index % width;
+		int h = (index / width) % height;
+		int c = index / (width * height) % channels;
+		int im = index / width / height / channels;
 
-      int opt_index = c * height * optnum * width + h * optnum * width + im * width + w;
-      data_opt[opt_index] = data_im[index];
-    }
+		int opt_index = c * height * optnum * width + h * optnum * width + im * width + w;
+		data_opt[opt_index] = data_im[index];
+	}
 }
-template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); 
-template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); 
+template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum);
+template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum);
 
 template <class T>
-__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum){
-     int gidx = get_global_id(0);
-     int gidy = get_global_id(1);
-     int gidyy = gidy;
-     int index = gidy / height;
-     int offset = index * width * height;
-     gidy = gidy % height;
-     if( gidx < width && gidyy < height * optnum )
-         dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
+__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum) {
+	int gidx = get_global_id(0);
+	int gidy = get_global_id(1);
+	int gidyy = gidy;
+	int index = gidy / height;
+	int offset = index * width * height;
+	gidy = gidy % height;
+	if( gidx < width && gidyy < height * optnum )
+	dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
 }
-template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); 
+template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum);
 template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
 
 template <class T>
-__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum){
-     int gidx = get_global_id(0);
-     int index;
-     index = (optnum==1) ? 0: gidx % optnum;
-     dst = dst + top_offset; // now we point at (*top)[n]
-     int offset = gidx / optnum;
-     int i = 0;
-     for(i = 0 ; i < width; i++)
-         dst[(index * height + offset)* width + i] = src[gidx * width + i];
+__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum) {
+	int gidx = get_global_id(0);
+	int index;
+	index = (optnum==1) ? 0: gidx % optnum;
+	dst = dst + top_offset; // now we point at (*top)[n]
+	int offset = gidx / optnum;
+	int i = 0;
+	for(i = 0; i < width; i++)
+	dst[(index * height + offset)* width + i] = src[gidx * width + i];
 }
-template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); 
-template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); 
+template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum);
+template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum);
diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl
index ae1c9269..620bad72 100644
--- a/src/caffe/ocl/lrn_layer.cl
+++ b/src/caffe/ocl/lrn_layer.cl
@@ -26,113 +26,113 @@
 
 template <class T>
 __kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
-  int index = get_global_id(0);
-  int tmp = get_global_size(0);
-  for(index; index < nthreads; index += tmp) 
-    out[index] = in[index] * pow(scale[index], negative_beta);
+	int index = get_global_id(0);
+	int tmp = get_global_size(0);
+	for(index; index < nthreads; index += tmp)
+	out[index] = in[index] * pow(scale[index], negative_beta);
 }
 template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out);
 template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out);
 
 template <class T>
-__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k,  __global T* scale) {
-  int index = get_global_id(0);
-  int tmp = get_global_size(0);
-  for(index; index < nthreads; index += tmp) {
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    in = in + offset;
-    scale = scale + offset;
-    int head = 0;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    T accum_scale = 0;
-    // fill the scale at [n, :, h, w]
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_scale += in[head * step] * in[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_scale += in[head * step] * in[head * step];
-      if (head - size >= 0) {
-        accum_scale -= in[(head - size) * step]
-                       * in[(head - size) * step];
-      }
-      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_scale -= in[(head - size) * step]
-                       * in[(head - size) * step];
-      }
-      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-      ++head;
-    }
-  }
+__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {
+	int index = get_global_id(0);
+	int tmp = get_global_size(0);
+	for(index; index < nthreads; index += tmp) {
+		// find out the local offset
+		const int w = index % width;
+		const int h = (index / width) % height;
+		const int n = index / width / height;
+		const int offset = (n * channels * height + h) * width + w;
+		const int step = height * width;
+		in = in + offset;
+		scale = scale + offset;
+		int head = 0;
+		const int pre_pad = (size - 1) / 2;
+		const int post_pad = size - pre_pad - 1;
+		T accum_scale = 0;
+		// fill the scale at [n, :, h, w]
+		// accumulate values
+		while (head < post_pad && head < channels) {
+			accum_scale += in[head * step] * in[head * step];
+			++head;
+		}
+		// both add and subtract
+		while (head < channels) {
+			accum_scale += in[head * step] * in[head * step];
+			if (head - size >= 0) {
+				accum_scale -= in[(head - size) * step]
+				* in[(head - size) * step];
+			}
+			scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+			++head;
+		}
+		// subtract only
+		while (head < channels + post_pad) {
+			if (head - size >= 0) {
+				accum_scale -= in[(head - size) * step]
+				* in[(head - size) * step];
+			}
+			scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+			++head;
+		}
+	}
 }
-template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k,  __global float* scale);
+template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale);
 template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale);
 
 template <class T>
 __kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) {
-  int index = get_global_id(0);
-  int tmp = get_global_size(0);
-  for(index; index < nthreads; index += tmp) {
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int n = index / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    const int step = height * width;
-    bottom_data += offset;
-    top_data += offset;
-    scale += offset;
-    top_diff += offset;
-    bottom_diff += offset;
-    int head = 0;
-    const int pre_pad = size - (size + 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-    T accum_ratio = 0;
-    // accumulate values
-    while (head < post_pad && head < channels) {
-      accum_ratio += top_diff[head * step] * top_data[head * step] /
-          scale[head * step];
-      ++head;
-    }
-    // both add and subtract
-    while (head < channels) {
-      accum_ratio += top_diff[head * step] * top_data[head * step] /
-          scale[head * step];
-      if (head - size >= 0) {
-        accum_ratio -= top_diff[(head - size) * step] *
-            top_data[(head - size) * step] / scale[(head - size) * step];
-      }
-      bottom_diff[(head - post_pad) * step] =
-          top_diff[(head - post_pad) * step]
-            * pow(scale[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-    // subtract only
-    while (head < channels + post_pad) {
-      if (head - size >= 0) {
-        accum_ratio -= top_diff[(head - size) * step] *
-            top_data[(head - size) * step] / scale[(head - size) * step];
-      }
-      bottom_diff[(head - post_pad) * step] =
-          top_diff[(head - post_pad) * step]
-            * pow(scale[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
-      ++head;
-    }
-}
+	int index = get_global_id(0);
+	int tmp = get_global_size(0);
+	for(index; index < nthreads; index += tmp) {
+		const int w = index % width;
+		const int h = (index / width) % height;
+		const int n = index / width / height;
+		const int offset = (n * channels * height + h) * width + w;
+		const int step = height * width;
+		bottom_data += offset;
+		top_data += offset;
+		scale += offset;
+		top_diff += offset;
+		bottom_diff += offset;
+		int head = 0;
+		const int pre_pad = size - (size + 1) / 2;
+		const int post_pad = size - pre_pad - 1;
+		T accum_ratio = 0;
+		// accumulate values
+		while (head < post_pad && head < channels) {
+			accum_ratio += top_diff[head * step] * top_data[head * step] /
+			scale[head * step];
+			++head;
+		}
+		// both add and subtract
+		while (head < channels) {
+			accum_ratio += top_diff[head * step] * top_data[head * step] /
+			scale[head * step];
+			if (head - size >= 0) {
+				accum_ratio -= top_diff[(head - size) * step] *
+				top_data[(head - size) * step] / scale[(head - size) * step];
+			}
+			bottom_diff[(head - post_pad) * step] =
+			top_diff[(head - post_pad) * step]
+			* pow(scale[(head - post_pad) * step], negative_beta)
+			- cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
+			++head;
+		}
+		// subtract only
+		while (head < channels + post_pad) {
+			if (head - size >= 0) {
+				accum_ratio -= top_diff[(head - size) * step] *
+				top_data[(head - size) * step] / scale[(head - size) * step];
+			}
+			bottom_diff[(head - post_pad) * step] =
+			top_diff[(head - post_pad) * step]
+			* pow(scale[(head - post_pad) * step], negative_beta)
+			- cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
+			++head;
+		}
+	}
 }
 
 template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
index 10d3b9f5..11352e16 100644
--- a/src/caffe/ocl/pooling_layer.cl
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -25,220 +25,220 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask){
-     int index = get_global_id(0);
-     int tmp = get_global_size(0);
-     for(index; index < nthreads; index += tmp){
-         int pw = index % pooled_width;
-         int ph = (index / pooled_width) % pooled_height;
-         int c = (index / pooled_width / pooled_height) % channels;
-         int n = index / pooled_width / pooled_height / channels;
-         int hstart = ph * stride_h - pad_h;
-         int wstart = pw * stride_w - pad_w;
-         const int hend = min(hstart + kernel_h, height);
-         const int wend = min(wstart + kernel_w, width);
-         hstart = max(hstart, 0);
-         wstart = max(wstart, 0);
-        T maxval = -FLT_MAX;
-        int maxidx = -1;
-        bottom_data =
-        bottom_data + (n * channels + c) * height * width;
-        for (int h = hstart; h < hend; ++h) {
-          for (int w = wstart; w < wend; ++w) {
-           if (bottom_data[h * width + w] > maxval) {
-             maxidx = h * width + w;
-             maxval = bottom_data[maxidx];
-        }
-      }
-    }
-    top_data[index] = maxval;
-    if (mask) {
-      mask[index] = maxidx;
-    } else {
-      top_mask[index] = maxidx;
-    }
-  }
+__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask) {
+	int index = get_global_id(0);
+	int tmp = get_global_size(0);
+	for(index; index < nthreads; index += tmp) {
+		int pw = index % pooled_width;
+		int ph = (index / pooled_width) % pooled_height;
+		int c = (index / pooled_width / pooled_height) % channels;
+		int n = index / pooled_width / pooled_height / channels;
+		int hstart = ph * stride_h - pad_h;
+		int wstart = pw * stride_w - pad_w;
+		const int hend = min(hstart + kernel_h, height);
+		const int wend = min(wstart + kernel_w, width);
+		hstart = max(hstart, 0);
+		wstart = max(wstart, 0);
+		T maxval = -FLT_MAX;
+		int maxidx = -1;
+		bottom_data =
+		bottom_data + (n * channels + c) * height * width;
+		for (int h = hstart; h < hend; ++h) {
+			for (int w = wstart; w < wend; ++w) {
+				if (bottom_data[h * width + w] > maxval) {
+					maxidx = h * width + w;
+					maxval = bottom_data[maxidx];
+				}
+			}
+		}
+		top_data[index] = maxval;
+		if (mask) {
+			mask[index] = maxidx;
+		} else {
+			top_mask[index] = maxidx;
+		}
+	}
 }
 template __attribute__((mangled_name(MaxPoolForward_float))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
-template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,  const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
+template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
 
 template <class T>
-__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index; index < nthreads; index+=tmp){
-        int pw = index % pooled_width;
-        int ph = (index / pooled_width) % pooled_height;
-        int c = (index / pooled_width / pooled_height) % channels;
-        int n = index / pooled_width / pooled_height / channels;            int hstart = ph * stride_h - pad_h;            int wstart = pw * stride_w - pad_w;
-            int hend = min(hstart + kernel_h, height + pad_h);
-            int wend = min(wstart + kernel_w, width + pad_w);
-            const int pool_size = (hend - hstart) * (wend - wstart);
-            hstart = max(hstart, 0);
-            wstart = max(wstart, 0);
-            hend = min(hend, height);
-            wend = min(wend, width);
-            T aveval = 0;
-            bottom_data =
-                bottom_data + (n * channels + c) * height * width;
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                aveval += bottom_data[h * width + w];
-              }
-            }
-            top_data[index] = aveval / pool_size;
-          }
+__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data) {
+	int index = get_global_id(0);
+	int tmp = get_global_size(0);
+	for(index; index < nthreads; index+=tmp) {
+		int pw = index % pooled_width;
+		int ph = (index / pooled_width) % pooled_height;
+		int c = (index / pooled_width / pooled_height) % channels;
+		int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w;
+		int hend = min(hstart + kernel_h, height + pad_h);
+		int wend = min(wstart + kernel_w, width + pad_w);
+		const int pool_size = (hend - hstart) * (wend - wstart);
+		hstart = max(hstart, 0);
+		wstart = max(wstart, 0);
+		hend = min(hend, height);
+		wend = min(wend, width);
+		T aveval = 0;
+		bottom_data =
+		bottom_data + (n * channels + c) * height * width;
+		for (int h = hstart; h < hend; ++h) {
+			for (int w = wstart; w < wend; ++w) {
+				aveval += bottom_data[h * width + w];
+			}
+		}
+		top_data[index] = aveval / pool_size;
+	}
 
 }
 template __attribute__((mangled_name(AvePoolForward_float))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
 template __attribute__((mangled_name(AvePoolForward_double))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data);
 
 template <class T>
-__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index; index < nthreads; index+=tmp){
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
-    T cumsum = 0.;
-    bottom_data = bottom_data + (n * channels + c) * height * width;
-    // First pass: get sum
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_data[h * width + w];
-      }
-    }
-    const float thres = rand_idx[index] * cumsum;
-    // Second pass: get value, and set index.
-    cumsum = 0;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_data[h * width + w];
-        if (cumsum >= thres) {
-          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
-          top_data[index] = bottom_data[h * width + w];
-          return;
-        }
-      }
-    }
-    }
+__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data) {
+	int index = get_global_id(0);
+	int tmp = get_global_size(0);
+	for(index; index < nthreads; index+=tmp) {
+		const int pw = index % pooled_width;
+		const int ph = (index / pooled_width) % pooled_height;
+		const int c = (index / pooled_width / pooled_height) % channels;
+		const int n = index / pooled_width / pooled_height / channels;
+		const int hstart = ph * stride_h;
+		const int hend = min(hstart + kernel_h, height);
+		const int wstart = pw * stride_w;
+		const int wend = min(wstart + kernel_w, width);
+		T cumsum = 0.;
+		bottom_data = bottom_data + (n * channels + c) * height * width;
+		// First pass: get sum
+		for (int h = hstart; h < hend; ++h) {
+			for (int w = wstart; w < wend; ++w) {
+				cumsum += bottom_data[h * width + w];
+			}
+		}
+		const float thres = rand_idx[index] * cumsum;
+		// Second pass: get value, and set index.
+		cumsum = 0;
+		for (int h = hstart; h < hend; ++h) {
+			for (int w = wstart; w < wend; ++w) {
+				cumsum += bottom_data[h * width + w];
+				if (cumsum >= thres) {
+					rand_idx[index] = ((n * channels + c) * height + h) * width + w;
+					top_data[index] = bottom_data[h * width + w];
+					return;
+				}
+			}
+		}
+	}
 }
 template __attribute__((mangled_name(StoPoolForwardTrain_float))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
 template __attribute__((mangled_name(StoPoolForwardTrain_double))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
 
 template <class T>
-__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,  const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data){
-    int index = get_global_id(0);
-    int tmp = get_global_size(0);
-    for(index; index < count; index+=tmp){
-    const int pw = index % pooled_width; 
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    const int hstart = ph * stride_h;
-    const int hend = min(hstart + kernel_h, height);
-    const int wstart = pw * stride_w;
-    const int wend = min(wstart + kernel_w, width);
-    // We set cumsum to be 0 to avoid divide-by-zero problems    T cumsum = FLT_MIN;
-    T cumsum = FLT_MIN;
-    T cumvalues = 0.;
-    bottom_data =        bottom_data + (n * channels + c) * height * width;
-    // First pass: get sum
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        cumsum += bottom_data[h * width + w];
-        cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w];
-      }
-    }
-    top_data[index] = cumvalues / cumsum;  }
+__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data) {
+	int index = get_global_id(0);
+	int tmp = get_global_size(0);
+	for(index; index < count; index+=tmp) {
+		const int pw = index % pooled_width;
+		const int ph = (index / pooled_width) % pooled_height;
+		const int c = (index / pooled_width / pooled_height) % channels;
+		const int n = index / pooled_width / pooled_height / channels;
+		const int hstart = ph * stride_h;
+		const int hend = min(hstart + kernel_h, height);
+		const int wstart = pw * stride_w;
+		const int wend = min(wstart + kernel_w, width);
+		// We set cumsum to be 0 to avoid divide-by-zero problems    T cumsum = FLT_MIN;
+		T cumsum = FLT_MIN;
+		T cumvalues = 0.;
+		bottom_data = bottom_data + (n * channels + c) * height * width;
+		// First pass: get sum
+		for (int h = hstart; h < hend; ++h) {
+			for (int w = wstart; w < wend; ++w) {
+				cumsum += bottom_data[h * width + w];
+				cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w];
+			}
+		}
+		top_data[index] = cumvalues / cumsum;}
 }
 template __attribute__((mangled_name(StoPoolForwardTest_float))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
 template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
 
 template <class T>
 __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
-    __global int* mask, __global T* top_mask, const int num,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-    const int pad_w, __global T* const bottom_diff) {
-     int index = get_global_id(0);
-     int total = get_global_size(0);
-     for(index; index < nthreads; index += total){
-    // find out the local index
-    // find out the local offset
-    const int w = index % width;
-    const int h = (index / width) % height;
-    const int c = (index / width / height) % channels;
-    const int n = index / width / height / channels;
-    const int phstart =
-         (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
-    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
-    const int pwstart =
-         (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
-    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
-    T gradient = 0;
-    const int offset = (n * channels + c) * pooled_height * pooled_width;
-    top_diff += offset;
-    if (mask) {
-      mask = mask + offset;
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (mask[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff[ph * pooled_width + pw];
-          }
-        }
-      }
-    } else {
-      top_mask = top_mask + offset;
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (top_mask[ph * pooled_width + pw] == h * width + w) {
-            gradient += top_diff[ph * pooled_width + pw];
-          }
-        }
-      }
-    }
-    bottom_diff[index] = gradient;
-  }
+	__global int* mask, __global T* top_mask, const int num,
+	const int channels, const int height, const int width,
+	const int pooled_height, const int pooled_width, const int kernel_h,
+	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+	const int pad_w, __global T* const bottom_diff) {
+	int index = get_global_id(0);
+	int total = get_global_size(0);
+	for(index; index < nthreads; index += total) {
+		// find out the local index
+		// find out the local offset
+		const int w = index % width;
+		const int h = (index / width) % height;
+		const int c = (index / width / height) % channels;
+		const int n = index / width / height / channels;
+		const int phstart =
+		(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
+		const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
+		const int pwstart =
+		(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
+		const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
+		T gradient = 0;
+		const int offset = (n * channels + c) * pooled_height * pooled_width;
+		top_diff += offset;
+		if (mask) {
+			mask = mask + offset;
+			for (int ph = phstart; ph < phend; ++ph) {
+				for (int pw = pwstart; pw < pwend; ++pw) {
+					if (mask[ph * pooled_width + pw] == h * width + w) {
+						gradient += top_diff[ph * pooled_width + pw];
+					}
+				}
+			}
+		} else {
+			top_mask = top_mask + offset;
+			for (int ph = phstart; ph < phend; ++ph) {
+				for (int pw = pwstart; pw < pwend; ++pw) {
+					if (top_mask[ph * pooled_width + pw] == h * width + w) {
+						gradient += top_diff[ph * pooled_width + pw];
+					}
+				}
+			}
+		}
+		bottom_diff[index] = gradient;
+	}
 }
 template __attribute__((mangled_name(MaxPoolBackward_float))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
 template __attribute__((mangled_name(MaxPoolBackward_double))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
 
 template <class T>
-__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff){
-     int index = get_global_id(0);
-     int total = get_global_size(0);
-     for(index; index < nthreads; index += total){
-            int w = index % width + pad_w;
-            int h = (index / width) % height + pad_h;
-            int c = (index / width / height) % channels;
-            int n = index / width / height / channels;
-            const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-            const int phend = min(h / stride_h + 1, pooled_height);
-            const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-            const int pwend = min(w / stride_w + 1, pooled_width);
-            T gradient = 0;
-            top_diff += (n * channels + c) * pooled_height * pooled_width;
-            for (int ph = phstart; ph < phend; ++ph) {
-              for (int pw = pwstart; pw < pwend; ++pw) {
-                // figure out the pooling size
-                int hstart = ph * stride_h - pad_h;
-                int wstart = pw * stride_w - pad_w;
-                int hend = min(hstart + kernel_h, height + pad_h);
-                int wend = min(wstart + kernel_w, width + pad_w);
-                int pool_size = (hend - hstart) * (wend - wstart);
-                gradient += top_diff[ph * pooled_width + pw] / pool_size;
-              }
-    }
-    bottom_diff[index] = gradient;
-   }
+__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff) {
+	int index = get_global_id(0);
+	int total = get_global_size(0);
+	for(index; index < nthreads; index += total) {
+		int w = index % width + pad_w;
+		int h = (index / width) % height + pad_h;
+		int c = (index / width / height) % channels;
+		int n = index / width / height / channels;
+		const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+		const int phend = min(h / stride_h + 1, pooled_height);
+		const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+		const int pwend = min(w / stride_w + 1, pooled_width);
+		T gradient = 0;
+		top_diff += (n * channels + c) * pooled_height * pooled_width;
+		for (int ph = phstart; ph < phend; ++ph) {
+			for (int pw = pwstart; pw < pwend; ++pw) {
+				// figure out the pooling size
+				int hstart = ph * stride_h - pad_h;
+				int wstart = pw * stride_w - pad_w;
+				int hend = min(hstart + kernel_h, height + pad_h);
+				int wend = min(wstart + kernel_w, width + pad_w);
+				int pool_size = (hend - hstart) * (wend - wstart);
+				gradient += top_diff[ph * pooled_width + pw] / pool_size;
+			}
+		}
+		bottom_diff[index] = gradient;
+	}
 }
 
 template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
@@ -246,48 +246,48 @@ template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void Ave
 
 template <class Dtype>
 __kernel void StoPoolBackward(const int nthreads,
-    __global Dtype* rand_idx, __global Dtype* top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global Dtype* bottom_diff) {
-      int index = get_global_id(0);
-      int total = get_global_size(0);
-      for(index; index < nthreads; index += total){
-            // find out the local index
-            // find out the local offset
-            const int w = index % width;
-            const int h = (index / width) % height;
-            const int c = (index / width / height) % channels;
-            const int n = index / width / height / channels;
-            const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-            const int phend = min(h / stride_h + 1, pooled_height);
-            const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-            const int pwend = min(w / stride_w + 1, pooled_width);
-            Dtype gradient = 0;
-            rand_idx =
-                rand_idx + (n * channels + c) * pooled_height * pooled_width;
-            top_diff =
-                top_diff + (n * channels + c) * pooled_height * pooled_width;
-            for (int ph = phstart; ph < phend; ++ph) {
-              for (int pw = pwstart; pw < pwend; ++pw) {
-                gradient += top_diff[ph * pooled_width + pw] *
-                    (index == static_cast<int>(rand_idx[ph * pooled_width + pw]));
-              }
-            }
-            bottom_diff[index] = gradient;
+	__global Dtype* rand_idx, __global Dtype* top_diff,
+	const int num, const int channels, const int height,
+	const int width, const int pooled_height, const int pooled_width,
+	const int kernel_h, const int kernel_w, const int stride_h,
+	const int stride_w, __global Dtype* bottom_diff) {
+	int index = get_global_id(0);
+	int total = get_global_size(0);
+	for(index; index < nthreads; index += total) {
+		// find out the local index
+		// find out the local offset
+		const int w = index % width;
+		const int h = (index / width) % height;
+		const int c = (index / width / height) % channels;
+		const int n = index / width / height / channels;
+		const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+		const int phend = min(h / stride_h + 1, pooled_height);
+		const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+		const int pwend = min(w / stride_w + 1, pooled_width);
+		Dtype gradient = 0;
+		rand_idx =
+		rand_idx + (n * channels + c) * pooled_height * pooled_width;
+		top_diff =
+		top_diff + (n * channels + c) * pooled_height * pooled_width;
+		for (int ph = phstart; ph < phend; ++ph) {
+			for (int pw = pwstart; pw < pwend; ++pw) {
+				gradient += top_diff[ph * pooled_width + pw] *
+				(index == static_cast<int>(rand_idx[ph * pooled_width + pw]));
+			}
+		}
+		bottom_diff[index] = gradient;
 
-	  }
+	}
 }
-template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel  void StoPoolBackward<float>(const int nthreads,
-    __global float* rand_idx, __global float* top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global float* bottom_diff);
+template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward<float>(const int nthreads,
+	__global float* rand_idx, __global float* top_diff,
+	const int num, const int channels, const int height,
+	const int width, const int pooled_height, const int pooled_width,
+	const int kernel_h, const int kernel_w, const int stride_h,
+	const int stride_w, __global float* bottom_diff);
 template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward<double>(const int nthreads,
-    __global double* rand_idx, __global double* top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global double* bottom_diff);
+	__global double* rand_idx, __global double* top_diff,
+	const int num, const int channels, const int height,
+	const int width, const int pooled_height, const int pooled_width,
+	const int kernel_h, const int kernel_w, const int stride_h,
+	const int stride_w, __global double* bottom_diff);
diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl
index 6a45ea03..5fbea781 100644
--- a/src/caffe/ocl/prelu_layer.cl
+++ b/src/caffe/ocl/prelu_layer.cl
@@ -26,35 +26,35 @@
 
 template <class T>
 __kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) {
-  int index = get_global_id(0);
-  if(index < count){
-    int c = (index / dim) % channels / div_factor;
-    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		int c = (index / dim) % channels / div_factor;
+		out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
+	}
 }
 template __attribute__ ((mangled_name(PReLUForward_float))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global float* in, __global float* out, __global float* slope_data, const int div_factor);
 template __attribute__ ((mangled_name(PReLUForward_double))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global double* in, __global double* out, __global double* slope_data, const int div_factor);
 
 template <class T>
 __kernel void PReLUBackward(const int count, const int channels, const int dim, __global T* in_diff, __global T* in_data, __global T* out_diff, __global T* slope_data, const int div_factor) {
-  int index = get_global_id(0);
-  if(index < count){
-    int c = (index / dim) % channels / div_factor;
-    out_diff[index] = in_diff[index] * ((in_data[index] > 0)
-        + (in_data[index] <= 0) * slope_data[c]);
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		int c = (index / dim) % channels / div_factor;
+		out_diff[index] = in_diff[index] * ((in_data[index] > 0)
+			+ (in_data[index] <= 0) * slope_data[c]);
+	}
 }
 template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor);
 template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor);
 
 template <class T>
 __kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_out, __global T* in_data, const int offset_in, __global T* out_diff) {
-  int index = get_global_id(0);
-  if(index < count){
-    in_diff += offset_out;
-    out_diff +=  offset_in;
-    out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		in_diff += offset_out;
+		out_diff += offset_in;
+		out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
+	}
 }
-template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out,  __global float* in_data, const int offset_in, __global float* out_diff);
+template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out, __global float* in_data, const int offset_in, __global float* out_diff);
 template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_out, __global double* in_data, const int offset_in, __global double* out_diff);
diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl
index f5a7a4db..94a41db4 100644
--- a/src/caffe/ocl/random.cl
+++ b/src/caffe/ocl/random.cl
@@ -30,23 +30,26 @@
 //we use the open sourced threefry's GPU implementation
 typedef uint uint32_t;
 
-struct r123array4x32 {	uint32_t v[4]; };
+struct r123array4x32 {
+		uint32_t v[4];
+};
 
-enum r123_enum_threefry32x4 
+enum r123_enum_threefry32x4
 {
 	R_32x4_0_0 = 10, R_32x4_0_1 = 26,
 	R_32x4_1_0 = 11, R_32x4_1_1 = 21,
 	R_32x4_2_0 = 13, R_32x4_2_1 = 27,
-	R_32x4_3_0 = 23, R_32x4_3_1 =  5,
-	R_32x4_4_0 =  6, R_32x4_4_1 = 20,
+	R_32x4_3_0 = 23, R_32x4_3_1 = 5,
+	R_32x4_4_0 = 6, R_32x4_4_1 = 20,
 	R_32x4_5_0 = 17, R_32x4_5_1 = 11,
 	R_32x4_6_0 = 25, R_32x4_6_1 = 10,
 	R_32x4_7_0 = 18, R_32x4_7_1 = 20
 };
 
-inline uint32_t	RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline));
 inline uint32_t RotL_32(uint32_t x, unsigned int N)
-{
+	__attribute__((always_inline));
+inline uint32_t RotL_32(uint32_t x, unsigned int N)
+	{
 	return (x << (N & 31)) | (x >> ((32 - N) & 31));
 }
 
@@ -54,20 +57,22 @@ typedef struct r123array4x32 threefry4x32_ctr_t;
 typedef struct r123array4x32 threefry4x32_key_t;
 typedef struct r123array4x32 threefry4x32_ukey_t;
 
-inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline));
-inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)
-{
-	threefry4x32_ctr_t	X;
-	uint32_t			ks[4 + 1];
-	int					i;
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
+	threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline));
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
+	threefry4x32_ctr_t in, threefry4x32_key_t k)
+	{
+	threefry4x32_ctr_t X;
+	uint32_t ks[4 + 1];
+	int i;
 	ks[4] = 0x1BD11BDA;
 	/*
-	for (i = 0; i < 4; i++)
-	{
-		ks[i] = k.v[i];
-		X.v[i] = in.v[i];
-		ks[4] ^= k.v[i];
-	}*/ 
+	 for (i = 0; i < 4; i++)
+	 {
+	 ks[i] = k.v[i];
+	 X.v[i] = in.v[i];
+	 ks[4] ^= k.v[i];
+	 }*/
 	{
 		ks[0] = k.v[0];
 		X.v[0] = in.v[0];
@@ -89,660 +94,748 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_
 	X.v[1] += ks[1];
 	X.v[2] += ks[2];
 	X.v[3] += ks[3];
-	if (Nrounds > 0) 
-	{
+	if (Nrounds > 0)
+		{
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 1) {
+	}
+	if (Nrounds > 1) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 2) {
+	}
+	if (Nrounds > 2) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 3) {
+	}
+	if (Nrounds > 3) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 3) {
+	}
+	if (Nrounds > 3) {
 		X.v[0] += ks[1];
 		X.v[1] += ks[2];
 		X.v[2] += ks[3];
 		X.v[3] += ks[4];
 		X.v[4 - 1] += 1;
-	} if (Nrounds > 4) {
+	}
+	if (Nrounds > 4) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 5) {
+	}
+	if (Nrounds > 5) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 6) {
+	}
+	if (Nrounds > 6) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 7) {
+	}
+	if (Nrounds > 7) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 7) {
+	}
+	if (Nrounds > 7) {
 		X.v[0] += ks[2];
 		X.v[1] += ks[3];
 		X.v[2] += ks[4];
 		X.v[3] += ks[0];
 		X.v[4 - 1] += 2;
-	} if (Nrounds > 8) {
+	}
+	if (Nrounds > 8) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 9) {
+	}
+	if (Nrounds > 9) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 10) {
+	}
+	if (Nrounds > 10) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 11) {
+	}
+	if (Nrounds > 11) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 11) {
+	}
+	if (Nrounds > 11) {
 		X.v[0] += ks[3];
 		X.v[1] += ks[4];
 		X.v[2] += ks[0];
 		X.v[3] += ks[1];
 		X.v[4 - 1] += 3;
-	} if (Nrounds > 12) {
+	}
+	if (Nrounds > 12) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 13) {
+	}
+	if (Nrounds > 13) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 14) {
+	}
+	if (Nrounds > 14) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 15) {
+	}
+	if (Nrounds > 15) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 15) {
+	}
+	if (Nrounds > 15) {
 		X.v[0] += ks[4];
 		X.v[1] += ks[0];
 		X.v[2] += ks[1];
 		X.v[3] += ks[2];
 		X.v[4 - 1] += 4;
-	} if (Nrounds > 16) {
+	}
+	if (Nrounds > 16) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 17) {
+	}
+	if (Nrounds > 17) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 18) {
+	}
+	if (Nrounds > 18) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 19) {
+	}
+	if (Nrounds > 19) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 19) {
+	}
+	if (Nrounds > 19) {
 		X.v[0] += ks[0];
 		X.v[1] += ks[1];
 		X.v[2] += ks[2];
 		X.v[3] += ks[3];
 		X.v[4 - 1] += 5;
-	} if (Nrounds > 20) {
+	}
+	if (Nrounds > 20) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 21) {
+	}
+	if (Nrounds > 21) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 22) {
+	}
+	if (Nrounds > 22) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 23) {
+	}
+	if (Nrounds > 23) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 23) {
+	}
+	if (Nrounds > 23) {
 		X.v[0] += ks[1];
 		X.v[1] += ks[2];
 		X.v[2] += ks[3];
 		X.v[3] += ks[4];
 		X.v[4 - 1] += 6;
-	} if (Nrounds > 24) {
+	}
+	if (Nrounds > 24) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 25) {
+	}
+	if (Nrounds > 25) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 26) {
+	}
+	if (Nrounds > 26) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 27) {
+	}
+	if (Nrounds > 27) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 27) {
+	}
+	if (Nrounds > 27) {
 		X.v[0] += ks[2];
 		X.v[1] += ks[3];
 		X.v[2] += ks[4];
 		X.v[3] += ks[0];
 		X.v[4 - 1] += 7;
-	} if (Nrounds > 28) {
+	}
+	if (Nrounds > 28) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 29) {
+	}
+	if (Nrounds > 29) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 30) {
+	}
+	if (Nrounds > 30) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 31) {
+	}
+	if (Nrounds > 31) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 31) {
+	}
+	if (Nrounds > 31) {
 		X.v[0] += ks[3];
 		X.v[1] += ks[4];
 		X.v[2] += ks[0];
 		X.v[3] += ks[1];
 		X.v[4 - 1] += 8;
-	} if (Nrounds > 32) {
+	}
+	if (Nrounds > 32) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 33) {
+	}
+	if (Nrounds > 33) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 34) {
+	}
+	if (Nrounds > 34) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 35) {
+	}
+	if (Nrounds > 35) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 35) {
+	}
+	if (Nrounds > 35) {
 		X.v[0] += ks[4];
 		X.v[1] += ks[0];
 		X.v[2] += ks[1];
 		X.v[3] += ks[2];
 		X.v[4 - 1] += 9;
-	} if (Nrounds > 36) {
+	}
+	if (Nrounds > 36) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 37) {
+	}
+	if (Nrounds > 37) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 38) {
+	}
+	if (Nrounds > 38) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 39) {
+	}
+	if (Nrounds > 39) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 39) {
+	}
+	if (Nrounds > 39) {
 		X.v[0] += ks[0];
 		X.v[1] += ks[1];
 		X.v[2] += ks[2];
 		X.v[3] += ks[3];
 		X.v[4 - 1] += 10;
-	} if (Nrounds > 40) {
+	}
+	if (Nrounds > 40) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 41) {
+	}
+	if (Nrounds > 41) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 42) {
+	}
+	if (Nrounds > 42) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 43) {
+	}
+	if (Nrounds > 43) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 43) {
+	}
+	if (Nrounds > 43) {
 		X.v[0] += ks[1];
 		X.v[1] += ks[2];
 		X.v[2] += ks[3];
 		X.v[3] += ks[4];
 		X.v[4 - 1] += 11;
-	} if (Nrounds > 44) {
+	}
+	if (Nrounds > 44) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 45) {
+	}
+	if (Nrounds > 45) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 46) {
+	}
+	if (Nrounds > 46) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 47) {
+	}
+	if (Nrounds > 47) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 47) {
+	}
+	if (Nrounds > 47) {
 		X.v[0] += ks[2];
 		X.v[1] += ks[3];
 		X.v[2] += ks[4];
 		X.v[3] += ks[0];
 		X.v[4 - 1] += 12;
-	} if (Nrounds > 48) {
+	}
+	if (Nrounds > 48) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 49) {
+	}
+	if (Nrounds > 49) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 50) {
+	}
+	if (Nrounds > 50) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 51) {
+	}
+	if (Nrounds > 51) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 51) {
+	}
+	if (Nrounds > 51) {
 		X.v[0] += ks[3];
 		X.v[1] += ks[4];
 		X.v[2] += ks[0];
 		X.v[3] += ks[1];
 		X.v[4 - 1] += 13;
-	} if (Nrounds > 52) {
+	}
+	if (Nrounds > 52) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 53) {
+	}
+	if (Nrounds > 53) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 54) {
+	}
+	if (Nrounds > 54) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 55) {
+	}
+	if (Nrounds > 55) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 55) {
+	}
+	if (Nrounds > 55) {
 		X.v[0] += ks[4];
 		X.v[1] += ks[0];
 		X.v[2] += ks[1];
 		X.v[3] += ks[2];
 		X.v[4 - 1] += 14;
-	} if (Nrounds > 56) {
+	}
+	if (Nrounds > 56) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 57) {
+	}
+	if (Nrounds > 57) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 58) {
+	}
+	if (Nrounds > 58) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 59) {
+	}
+	if (Nrounds > 59) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 59) {
+	}
+	if (Nrounds > 59) {
 		X.v[0] += ks[0];
 		X.v[1] += ks[1];
 		X.v[2] += ks[2];
 		X.v[3] += ks[3];
 		X.v[4 - 1] += 15;
-	} if (Nrounds > 60) {
+	}
+	if (Nrounds > 60) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 61) {
+	}
+	if (Nrounds > 61) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 62) {
+	}
+	if (Nrounds > 62) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 63) {
+	}
+	if (Nrounds > 63) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 63) {
+	}
+	if (Nrounds > 63) {
 		X.v[0] += ks[1];
 		X.v[1] += ks[2];
 		X.v[2] += ks[3];
 		X.v[3] += ks[4];
 		X.v[4 - 1] += 16;
-	} if (Nrounds > 64) {
+	}
+	if (Nrounds > 64) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 65) {
+	}
+	if (Nrounds > 65) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 66) {
+	}
+	if (Nrounds > 66) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 67) {
+	}
+	if (Nrounds > 67) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 67) {
+	}
+	if (Nrounds > 67) {
 		X.v[0] += ks[2];
 		X.v[1] += ks[3];
 		X.v[2] += ks[4];
 		X.v[3] += ks[0];
 		X.v[4 - 1] += 17;
-	} if (Nrounds > 68) {
+	}
+	if (Nrounds > 68) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 69) {
+	}
+	if (Nrounds > 69) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 70) {
+	}
+	if (Nrounds > 70) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	} if (Nrounds > 71) {
+	}
+	if (Nrounds > 71) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	} if (Nrounds > 71) {
+	}
+	if (Nrounds > 71) {
 		X.v[0] += ks[3];
 		X.v[1] += ks[4];
 		X.v[2] += ks[0];
 		X.v[3] += ks[1];
 		X.v[4 - 1] += 18;
-	} 
+	}
 	return X;
-} 
+}
 
 template <class T>
 __kernel void PRNG_threefry4x32(
-        __global uint4 *randomnumber,
-        threefry4x32_ctr_t ctr_i,
-        T inf,
-        T sup,
-        T threshold,
-        uint nrounds,
-        uint numrandom
-){
-        size_t  gdx = get_global_id(0);
+	__global uint4 *randomnumber,
+	threefry4x32_ctr_t ctr_i,
+	T inf,
+	T sup,
+	T threshold,
+	uint nrounds,
+	uint numrandom
+) {
+	size_t gdx = get_global_id(0);
 
-        uint maxUint = 0;
-        maxUint--;
-        float r = (float)maxUint;
+	uint maxUint = 0;
+	maxUint--;
+	float r = (float)maxUint;
 
-        threefry4x32_ctr_t      ctr = ctr_i; 
-        threefry4x32_ukey_t ukey;
+	threefry4x32_ctr_t ctr = ctr_i;
+	threefry4x32_ukey_t ukey;
 
-        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+	ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
 
-        threefry4x32_ctr_t  random4;
+	threefry4x32_ctr_t random4;
 
-        if ( gdx < numrandom )
-        {
-                random4 = threefry4x32_R(nrounds, ctr, ukey);
-                uint4 frnd;
-				
-                frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-				
-                randomnumber[gdx] = frnd;
-        }
-}
+	if ( gdx < numrandom )
+	{
+		random4 = threefry4x32_R(nrounds, ctr, ukey);
+		uint4 frnd;
 
+		frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+		frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+		frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+		frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+
+		randomnumber[gdx] = frnd;
+	}
+}
 
 template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
 
diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl
index b7865838..cf9302d5 100644
--- a/src/caffe/ocl/relu_layer.cl
+++ b/src/caffe/ocl/relu_layer.cl
@@ -25,21 +25,21 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope){
+__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope) {
 	int index = get_global_id(0);
 	if(index < count)
-		out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
+	out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
 }
 
 template __attribute__ ((mangled_name(ReLUForward_float))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
 template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope);
 
 template <class T>
-__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope){
+__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope) {
 	int index = get_global_id(0);
-        if(index < count) {
-            out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope);
-        }
+	if(index < count) {
+		out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope);
+	}
 }
 
 template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
diff --git a/src/caffe/ocl/sigmoid_layer.cl b/src/caffe/ocl/sigmoid_layer.cl
index eb952e6f..a3a9345f 100644
--- a/src/caffe/ocl/sigmoid_layer.cl
+++ b/src/caffe/ocl/sigmoid_layer.cl
@@ -25,21 +25,21 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void SigmoidForward(const int count, __global T* in, __global T* out){
+__kernel void SigmoidForward(const int count, __global T* in, __global T* out) {
 	int index = get_global_id(0);
 	if(index < count)
-		out[index] = 1. / (1. + exp(-in[index]));
+	out[index] = 1. / (1. + exp(-in[index]));
 }
 
 template __attribute__ ((mangled_name(SigmoidForward_float))) __kernel void SigmoidForward(const int count, __global float* in, __global float* out);
 template __attribute__ ((mangled_name(SigmoidForward_double))) __kernel void SigmoidForward(const int count, __global double* in, __global double* out);
 
 template <class T>
-__kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff){
+__kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) {
 	int index = get_global_id(0);
-        const T sigmoid_x = out_data[index];
-        if(index < count)
-		out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
+	const T sigmoid_x = out_data[index];
+	if(index < count)
+	out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
 }
 
 template __attribute__ ((mangled_name(SigmoidBackward_float))) __kernel void SigmoidBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff);
diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl
index 6b225283..4069ce16 100644
--- a/src/caffe/ocl/softmax_layer.cl
+++ b/src/caffe/ocl/softmax_layer.cl
@@ -25,49 +25,49 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch){
-    
-    int gid = get_global_id(0);
-    int size = get_global_size(0);
-    
-    resultScratch[gid] = 0.0;
-    for(int i = gid; i < num; i += size){
-    	resultScratch[gid] += -log(prob_data[i * dim + static_cast<int>(label[i])]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    
-    if(gid < 128)
-    	resultScratch[gid] += resultScratch[gid + 128];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(gid < 64)
-    	resultScratch[gid] += resultScratch[gid + 64];
-    if(gid < 32)
-    	resultScratch[gid] += resultScratch[gid + 32];
-    if(gid < 16)
-    	resultScratch[gid] += resultScratch[gid + 16];
-    if(gid < 8)
-    	resultScratch[gid] += resultScratch[gid + 8];
-    if(gid < 4)
-    	resultScratch[gid] += resultScratch[gid + 4];
-    if(gid < 2)
-    	resultScratch[gid] += resultScratch[gid + 2];
-    if(gid < 1){
-    	resultScratch[gid] += resultScratch[gid + 1];
-    	loss[0] = resultScratch[gid];
-    }
+__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch) {
+
+	int gid = get_global_id(0);
+	int size = get_global_size(0);
+
+	resultScratch[gid] = 0.0;
+	for(int i = gid; i < num; i += size) {
+		resultScratch[gid] += -log(prob_data[i * dim + static_cast<int>(label[i])]);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(gid < 128)
+	resultScratch[gid] += resultScratch[gid + 128];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(gid < 64)
+	resultScratch[gid] += resultScratch[gid + 64];
+	if(gid < 32)
+	resultScratch[gid] += resultScratch[gid + 32];
+	if(gid < 16)
+	resultScratch[gid] += resultScratch[gid + 16];
+	if(gid < 8)
+	resultScratch[gid] += resultScratch[gid + 8];
+	if(gid < 4)
+	resultScratch[gid] += resultScratch[gid + 4];
+	if(gid < 2)
+	resultScratch[gid] += resultScratch[gid + 2];
+	if(gid < 1) {
+		resultScratch[gid] += resultScratch[gid + 1];
+		loss[0] = resultScratch[gid];
+	}
 }
 template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
 template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
 
 template <class T>
-__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data){
-        //printf("softmax_div\n");
-        int index = get_global_id(0);
-        int total = get_global_size(0);
-        for(index; index < num*dim; index +=  total){
-        int n = index / dim;
-        data[index] /= scale[n];
-        }
+__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data) {
+	//printf("softmax_div\n");
+	int index = get_global_id(0);
+	int total = get_global_size(0);
+	for(index; index < num*dim; index += total) {
+		int n = index / dim;
+		data[index] /= scale[n];
+	}
 }
 
 template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data);
@@ -75,97 +75,97 @@ template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softma
 
 template <class T>
 __kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const T* data, __global T* out) {
-    int index = get_global_id(0);
-    if(index < num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    T maxval = -FLT_MAX;
-    for (int c = 0; c < channels; ++c) {
-      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
-    }
-    out[index] = maxval;
-  }
+	const int spatial_dim, __global const T* data, __global T* out) {
+	int index = get_global_id(0);
+	if(index < num * spatial_dim) {
+		int n = index / spatial_dim;
+		int s = index % spatial_dim;
+		T maxval = -FLT_MAX;
+		for (int c = 0; c < channels; ++c) {
+			maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
+		}
+		out[index] = maxval;
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const float* data, __global float* out);
+	const int spatial_dim, __global const float* data, __global float* out);
 template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, __global const double* data, __global  double* out);
+	const int spatial_dim, __global const double* data, __global double* out);
 
 template <class T>
 __kernel void kernel_channel_subtract(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const T* channel_max, __global T* data) {
-    int index = get_global_id(0);
-    if(index < count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] -= channel_max[n * spatial_dim + s]; 
-  }
+	const int num, const int channels,
+	const int spatial_dim, __global const T* channel_max, __global T* data) {
+	int index = get_global_id(0);
+	if(index < count) {
+		int n = index / channels / spatial_dim;
+		int s = index % spatial_dim;
+		data[index] -= channel_max[n * spatial_dim + s];
+	}
 }
 template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data);
 template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data);
 
 template <class T>
 __kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const T* data, __global T* channel_sum) {
-  int index = get_global_id(0);
-   if(index < num * spatial_dim) {
-    int n = index / spatial_dim;
-    int s = index % spatial_dim;
-    T sum = 0;
-    for (int c = 0; c < channels; ++c) {
-      sum += data[(n * channels + c) * spatial_dim + s];
-    }
-    channel_sum[index] = sum;
-  }
+	const int spatial_dim, __global const T* data, __global T* channel_sum) {
+	int index = get_global_id(0);
+	if(index < num * spatial_dim) {
+		int n = index / spatial_dim;
+		int s = index % spatial_dim;
+		T sum = 0;
+		for (int c = 0; c < channels; ++c) {
+			sum += data[(n * channels + c) * spatial_dim + s];
+		}
+		channel_sum[index] = sum;
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const float* data, __global float* channel_sum);
+	const int spatial_dim, __global const float* data, __global float* channel_sum);
 template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, __global const double* data, __global double* channel_sum);
+	const int spatial_dim, __global const double* data, __global double* channel_sum);
 
 template <class T>
 __kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const T* channel_sum, __global T* data) {
-    int index = get_global_id(0);
-   if(index < count) {
-    int n = index / channels / spatial_dim;
-    int s = index % spatial_dim;
-    data[index] /= channel_sum[n * spatial_dim + s];
-  }
+	const int num, const int channels,
+	const int spatial_dim, __global const T* channel_sum, __global T* data) {
+	int index = get_global_id(0);
+	if(index < count) {
+		int n = index / channels / spatial_dim;
+		int s = index % spatial_dim;
+		data[index] /= channel_sum[n * spatial_dim + s];
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const float* channel_sum, __global float* data);
+	const int num, const int channels,
+	const int spatial_dim, __global const float* channel_sum, __global float* data);
 template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count,
-    const int num, const int channels,
-    const int spatial_dim, __global const double* channel_sum, __global double* data);
-                                                                                         
+	const int num, const int channels,
+	const int spatial_dim, __global const double* channel_sum, __global double* data);
+
 template <class T>
 __kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const T* data_1, __global const T* data_2,
-    __global T* channel_dot) {
-    int index = get_global_id(0);
-    if(index < num * spatial_dim) {
-        int n = index / spatial_dim;
-        int s = index % spatial_dim;
-        T dot = 0;
-        for (int c = 0; c < channels; ++c) {
-            dot += (data_1[(n * channels + c) * spatial_dim + s]
-                 * data_2[(n * channels + c) * spatial_dim + s]);
-        }   
-        channel_dot[index] = dot;
-    }   
+	const int spatial_dim, __global const T* data_1, __global const T* data_2,
+	__global T* channel_dot) {
+	int index = get_global_id(0);
+	if(index < num * spatial_dim) {
+		int n = index / spatial_dim;
+		int s = index % spatial_dim;
+		T dot = 0;
+		for (int c = 0; c < channels; ++c) {
+			dot += (data_1[(n * channels + c) * spatial_dim + s]
+				* data_2[(n * channels + c) * spatial_dim + s]);
+		}
+		channel_dot[index] = dot;
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const float* data_1, __global const float* data_2,
-    __global float* channel_dot);
+	const int spatial_dim, __global const float* data_1, __global const float* data_2,
+	__global float* channel_dot);
 template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, __global const double* data_1, __global const double* data_2,
-    __global double* channel_dot);
+	const int spatial_dim, __global const double* data_1, __global const double* data_2,
+	__global double* channel_dot);
diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl
index 9dbe284f..025f59ac 100644
--- a/src/caffe/ocl/softmaxwithloss_layer.cl
+++ b/src/caffe/ocl/softmaxwithloss_layer.cl
@@ -26,78 +26,78 @@
 
 template <class T>
 __kernel void SoftmaxLossForwardGPU(const int nthreads,
-          __global T* prob_data, __global T* label,__global T* loss,
-          int num, int dim, int spatial_dim,
-          bool has_ignore_label_, int ignore_label_,
-          __global T* counts) {
-    int index = get_global_id(0);
-    if(index < nthreads) {
-        const int n = index / spatial_dim;
-        const int s = index % spatial_dim;
-        const int label_value = static_cast<int>(label[n * spatial_dim + s]);
-        if (has_ignore_label_ && label_value == ignore_label_) {
-           loss[index] = 0;
-           counts[index] = 0;
-        } else {
-           loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
-                      T(FLT_MIN)));
-        counts[index] = 1;
-    }
-  }
+	__global T* prob_data, __global T* label,__global T* loss,
+	int num, int dim, int spatial_dim,
+	bool has_ignore_label_, int ignore_label_,
+	__global T* counts) {
+	int index = get_global_id(0);
+	if(index < nthreads) {
+		const int n = index / spatial_dim;
+		const int s = index % spatial_dim;
+		const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+		if (has_ignore_label_ && label_value == ignore_label_) {
+			loss[index] = 0;
+			counts[index] = 0;
+		} else {
+			loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
+					T(FLT_MIN)));
+			counts[index] = 1;
+		}
+	}
 }
 
 template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
-          __global float* prob_data, __global float* label,__global float* loss,
-          int num, int dim, int spatial_dim,
-          bool has_ignore_label_, int ignore_label_,
-          __global float* counts);
+	__global float* prob_data, __global float* label,__global float* loss,
+	int num, int dim, int spatial_dim,
+	bool has_ignore_label_, int ignore_label_,
+	__global float* counts);
 template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
-          __global double* prob_data, __global double* label,__global double* loss,
-          int num, int dim, int spatial_dim,
-          bool has_ignore_label_, int ignore_label_,
-          __global double* counts);
+	__global double* prob_data, __global double* label,__global double* loss,
+	int num, int dim, int spatial_dim,
+	bool has_ignore_label_, int ignore_label_,
+	__global double* counts);
 
 template <class T>
 __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
-          __global T* label,__global T* bottom_diff, int num, int dim,
-          int spatial_dim, bool has_ignore_label_,
-          int ignore_label_, T* counts) {
-    const int channels = dim / spatial_dim;
-   int index  = get_global_id(0);
-   if(index <  nthreads) {
-       const int n = index / spatial_dim;
-       const int s = index % spatial_dim;
-       const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+	__global T* label,__global T* bottom_diff, int num, int dim,
+	int spatial_dim, bool has_ignore_label_,
+	int ignore_label_, T* counts) {
+	const int channels = dim / spatial_dim;
+	int index = get_global_id(0);
+	if(index < nthreads) {
+		const int n = index / spatial_dim;
+		const int s = index % spatial_dim;
+		const int label_value = static_cast<int>(label[n * spatial_dim + s]);
 
-      if (has_ignore_label_ && label_value == ignore_label_) {
-          for (int c = 0; c < channels; ++c) {
-              bottom_diff[n * dim + c * spatial_dim + s] = 0;
-          }
-          counts[index] = 0;
-    } else {
-      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
-      counts[index] = 1;
-    }
-  }
+		if (has_ignore_label_ && label_value == ignore_label_) {
+			for (int c = 0; c < channels; ++c) {
+				bottom_diff[n * dim + c * spatial_dim + s] = 0;
+			}
+			counts[index] = 0;
+		} else {
+			bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
+			counts[index] = 1;
+		}
+	}
 }
 template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
-          __global float* label,__global float* bottom_diff, int num, int dim,
-          int spatial_dim, bool has_ignore_label_,
-          int ignore_label_, float* counts);
+	__global float* label,__global float* bottom_diff, int num, int dim,
+	int spatial_dim, bool has_ignore_label_,
+	int ignore_label_, float* counts);
 
-template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double)))  __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
-          __global double* label,__global double* bottom_diff, int num, int dim,
-          int spatial_dim, bool has_ignore_label_,
-          int ignore_label_, double* counts);
+template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
+	__global double* label,__global double* bottom_diff, int num, int dim,
+	int spatial_dim, bool has_ignore_label_,
+	int ignore_label_, double* counts);
 
 template <class T>
-__kernel void scal (const int num, const T alpha, __global T* data){
-        int index = get_global_id(0);
-        int total = get_global_size(0);
-        for(index; index < num; index +=  total){
-        data[index] = data[index] * alpha;
-        }
+__kernel void scal (const int num, const T alpha, __global T* data) {
+	int index = get_global_id(0);
+	int total = get_global_size(0);
+	for(index; index < num; index += total) {
+		data[index] = data[index] * alpha;
+	}
 }
 
-template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha,  __global float* data);
-template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha,  __global double* data);
+template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data);
+template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data);
diff --git a/src/caffe/ocl/tanh_layer.cl b/src/caffe/ocl/tanh_layer.cl
index 2f0a08c6..a8bd05c9 100644
--- a/src/caffe/ocl/tanh_layer.cl
+++ b/src/caffe/ocl/tanh_layer.cl
@@ -25,21 +25,21 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void TanHForward(const int count, __global T* in, __global T* out){
+__kernel void TanHForward(const int count, __global T* in, __global T* out) {
 	int index = get_global_id(0);
 	if(index < count)
-		out[index] =tanh(in[index]);
+	out[index] =tanh(in[index]);
 }
 
 template __attribute__ ((mangled_name(TanHForward_float))) __kernel void TanHForward(const int count, __global float* in, __global float* out);
 template __attribute__ ((mangled_name(TanHForward_double))) __kernel void TanHForward(const int count, __global double* in, __global double* out);
 
 template <class T>
-__kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff){
+__kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) {
 	int index = get_global_id(0);
-        const T tanhx = out_data[index];
-        if(index < count)
-		out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx);
+	const T tanhx = out_data[index];
+	if(index < count)
+	out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx);
 }
 
 template __attribute__ ((mangled_name(TanHBackward_float))) __kernel void TanHBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff);
diff --git a/src/caffe/ocl/threshold_layer.cl b/src/caffe/ocl/threshold_layer.cl
index 40d55f1c..19df83e2 100644
--- a/src/caffe/ocl/threshold_layer.cl
+++ b/src/caffe/ocl/threshold_layer.cl
@@ -25,10 +25,10 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out){
+__kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out) {
 	int index = get_global_id(0);
 	if(index < count)
-		out[index] =in[index] > threshold ? 1 : 0;
+	out[index] =in[index] > threshold ? 1 : 0;
 }
 
 template __attribute__ ((mangled_name(ThresholdForward_float))) __kernel void ThresholdForward(const int count, const float threshold, __global float* in, __global float* out);
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index d15f168c..07a16fbd 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -27,10 +27,10 @@
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 
 template <class T>
-__kernel void OCL_memset(__global T* buffer, const T value, const int size){
+__kernel void OCL_memset(__global T* buffer, const T value, const int size) {
 	int gdx = get_global_id(0);
-	if(gdx < size){
-		buffer[gdx] = value;	
+	if(gdx < size) {
+		buffer[gdx] = value;
 	}
 }
 
@@ -38,19 +38,19 @@ template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__gl
 template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
 template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
 
-__kernel void OCL_memset2(__global int* buffer, const int value, const int size){
-        int gdx = get_global_id(0);
-        if(gdx < size){
-                buffer[gdx] = value;    
-        }
+__kernel void OCL_memset2(__global int* buffer, const int value, const int size) {
+	int gdx = get_global_id(0);
+	if(gdx < size) {
+		buffer[gdx] = value;
+	}
 }
 
 template <class T>
-__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y){
-     int gdx = get_global_id(0);
-     if(gdx < N){
-          Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0));
-     }
+__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) {
+	int gdx = get_global_id(0);
+	if(gdx < N) {
+		Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0));
+	}
 }
 
 template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
@@ -58,33 +58,33 @@ template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caff
 
 template <class T>
 __kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) {
-    int index = get_global_id(0);
-    if(index < n) {
-        y[index] = fabs(a[index]);
-    }
+	int index = get_global_id(0);
+	if(index < n) {
+		y[index] = fabs(a[index]);
+	}
 }
 template __attribute__((mangled_name(caffe_gpu_abs_float))) __kernel void caffe_gpu_abs(const int n, __global float* a, __global float* Y);
 template __attribute__((mangled_name(caffe_gpu_abs_double))) __kernel void caffe_gpu_abs(const int n, __global double* a, __global double* Y);
 
 template <class T>
-__kernel void get_max(const int num, const int dim, __global T* data, __global T* out){
-     int index = get_global_id(0);
-     if (index < num) {
-	T maxval = -FLT_MAX;
-        for (int i = 0; i <  dim; i++)
-	maxval = max( data[index*dim + i], maxval );
-        out[index] = maxval;
-      }
+__kernel void get_max(const int num, const int dim, __global T* data, __global T* out) {
+	int index = get_global_id(0);
+	if (index < num) {
+		T maxval = -FLT_MAX;
+		for (int i = 0; i < dim; i++)
+		maxval = max( data[index*dim + i], maxval );
+		out[index] = maxval;
+	}
 }
 
 template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out);
 template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(const int num, const int dim, __global double* data, __global double* out);
 
 template <class T>
-__kernel void exp (const int num, __global T* data, __global T* out){
-        int index = get_global_id(0);
-        if (index < num) 
-        out[index] = exp(data[index]);
+__kernel void exp (const int num, __global T* data, __global T* out) {
+	int index = get_global_id(0);
+	if (index < num)
+	out[index] = exp(data[index]);
 }
 
 template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out);
@@ -92,10 +92,10 @@ template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int
 
 template <class T>
 __kernel void kernel_sub(const int count, __global const T* a, __global const T* b, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-    out[index] = a[index] - b[index];
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		out[index] = a[index] - b[index];
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_sub_float))) __kernel void kernel_sub(const int count, __global const float* a, __global const float* b, __global float* out);
@@ -103,10 +103,10 @@ template __attribute__ ((mangled_name(kernel_sub_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_add(const int count, __global const T* a, __global const T* b, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-    out[index] = a[index] + b[index];
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		out[index] = a[index] + b[index];
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_add_float))) __kernel void kernel_add(const int count, __global const float* a, __global const float* b, __global float* out);
@@ -114,10 +114,10 @@ template __attribute__ ((mangled_name(kernel_add_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_div(const int count, __global const T* a, __global const T* b, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-    out[index] = a[index] / b[index];
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		out[index] = a[index] / b[index];
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_div_float))) __kernel void kernel_div(const int count, __global const float* a, __global const float* b, __global float* out);
@@ -125,34 +125,32 @@ template __attribute__ ((mangled_name(kernel_div_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_mul(const int count, __global const T* a, __global const T* b, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-    out[index] = a[index] * b[index];
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		out[index] = a[index] * b[index];
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_mul_float))) __kernel void kernel_mul(const int count, __global const float* a, __global const float* b, __global float* out);
 template __attribute__ ((mangled_name(kernel_mul_double))) __kernel void kernel_mul(const int count, __global const double* a, __global const double* b, __global double* out);
 
-
 template <class T>
 __kernel void kernel_powx(const int count, __global const T* data, const T alpha, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-    out[index] = pow(data[index], alpha);
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		out[index] = pow(data[index], alpha);
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_powx_float))) __kernel void kernel_powx(const int count, __global const float* data, const float alpha, __global float* out);
 template __attribute__ ((mangled_name(kernel_powx_double))) __kernel void kernel_powx(const int count, __global const double* data, const double alpha, __global double* out);
 
-
 template <class T>
 __kernel void kernel_exp(const int count, __global const T* data, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-    out[index] = exp(data[index]);
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		out[index] = exp(data[index]);
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out);
@@ -160,10 +158,10 @@ template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_add_scalar(const int count, const T data, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-     out[index] = out[index] + data;
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		out[index] = out[index] + data;
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out);
@@ -171,79 +169,76 @@ template __attribute__ ((mangled_name(kernel_add_scalar_double))) __kernel void
 
 template <class T>
 __kernel void kernel_log(const int count, __global const T* data, __global T* out) {
- int index = get_global_id(0);
-   if(index < count) {
-     out[index] = log(data[index]);
-  }
+	int index = get_global_id(0);
+	if(index < count) {
+		out[index] = log(data[index]);
+	}
 }
 
 template __attribute__ ((mangled_name(kernel_log_float))) __kernel void kernel_log(const int count, __global const float* data, __global float* out);
 template __attribute__ ((mangled_name(kernel_log_double))) __kernel void kernel_log(const int count, __global const double* data, __global double* out);
 
 template <class T>
-__kernel void diff (const int num, const int dim, __global T* data, __global T* label){
-        int index = get_global_id(0);
-        int total = get_global_size(0);
-        int offset;
-	for(index; index < num; index +=  total){
-  	offset = (int) label[index];
-        data[index * dim + offset] -= 1;
-        }
+__kernel void diff (const int num, const int dim, __global T* data, __global T* label) {
+	int index = get_global_id(0);
+	int total = get_global_size(0);
+	int offset;
+	for(index; index < num; index += total) {
+		offset = (int) label[index];
+		data[index * dim + offset] -= 1;
+	}
 }
 
 template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label);
 template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label);
 
-
 template <class T>
-__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y){
+__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y) {
 	int index = get_global_id(0);
-        if (index < n)
-        y[index] = a[index] / b[index];
+	if (index < n)
+	y[index] = a[index] / b[index];
 }
 
 template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y);
 //template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y);
 
 template <class T>
-__kernel void add_scalar (const int n, const T alpha, __global T* y){
-        int index = get_global_id(0);
-        if (index < n)
-        y[index] += alpha;
+__kernel void add_scalar (const int n, const T alpha, __global T* y) {
+	int index = get_global_id(0);
+	if (index < n)
+	y[index] += alpha;
 }
 
 template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y);
 template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y);
 
 template <typename Dtype>
-__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
-        int index = get_global_id(0);
-        if (index < n)
-        y[index] = in1[index] + in2[index] ;
+__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) {
+	int index = get_global_id(0);
+	if (index < n)
+	y[index] = in1[index] + in2[index];
 }
 template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y);
 template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y);
 
 template <class T>
-__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y){
-        int index = get_global_id(0);
-       if (index < n)
-        y[index] = a[index] * b[index];
+__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y) {
+	int index = get_global_id(0);
+	if (index < n)
+	y[index] = a[index] * b[index];
 }
 
 template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y);
 template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y);
 
-
 template <class T>
-__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y){
-        int index = get_global_id(0);
-        if (index < n)
+__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y) {
+	int index = get_global_id(0);
+	if (index < n)
 //           y[index] = a[index] + alpha;
-           y[index] = pow(a[index], alpha);
+	y[index] = pow(a[index], alpha);
 }
 
-template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); 
-template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); 
-
+template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y);
+template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y);
 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index cd9d2ef5..ae675500 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -13,398 +13,397 @@
 #include "caffe/util/ocl_wrapper.hpp"
 namespace caffe {
 
-template <typename Dtype>
+template<typename Dtype>
 Solver<Dtype>::Solver(const SolverParameter& param)
-    : net_() {
-  Init(param);
+	: net_() {
+	Init(param);
 }
 
-template <typename Dtype>
-void Solver<Dtype>::ocl_setup(){
-   scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
-   add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL);
-   div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
-   powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
+template<typename Dtype>
+void Solver<Dtype>::ocl_setup() {
+	scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
+	add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL);
+	div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
+	powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Solver<Dtype>::Solver(const string& param_file)
-    : net_() {
-  SolverParameter param;
-  ReadProtoFromTextFileOrDie(param_file, &param);
-  Init(param);
+	: net_() {
+	SolverParameter param;
+	ReadProtoFromTextFileOrDie(param_file, &param);
+	Init(param);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::Init(const SolverParameter& param) {
-  LOG(INFO) << "Initializing solver from parameters: " << std::endl
-            << param.DebugString();
-  param_ = param;
-  CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
-
-  ocl_setup();
-
-  if (param_.random_seed() >= 0) {
-    Caffe::set_random_seed(param_.random_seed());
-  }
-  // Scaffolding code
-  InitTrainNet();
-  InitTestNets();
-  LOG(INFO) << "Solver scaffolding done.";
-  iter_ = 0;
-  current_step_ = 0;
+	LOG(INFO) << "Initializing solver from parameters: " << std::endl
+		<< param.DebugString();
+	param_ = param;
+	CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
+
+	ocl_setup();
+
+	if (param_.random_seed() >= 0) {
+		Caffe::set_random_seed(param_.random_seed());
+	}
+	// Scaffolding code
+	InitTrainNet();
+	InitTestNets();
+	LOG(INFO) << "Solver scaffolding done.";
+	iter_ = 0;
+	current_step_ = 0;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::InitTrainNet() {
-  const int num_train_nets = param_.has_net() + param_.has_net_param() +
-      param_.has_train_net() + param_.has_train_net_param();
-  const string& field_names = "net, net_param, train_net, train_net_param";
-  CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
-      << "using one of these fields: " << field_names;
-  CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
-      << "one of these fields specifying a train_net: " << field_names;
-  NetParameter net_param;
-  if (param_.has_train_net_param()) {
-    LOG(INFO) << "Creating training net specified in train_net_param.";
-    net_param.CopyFrom(param_.train_net_param());
-  } else if (param_.has_train_net()) {
-    LOG(INFO) << "Creating training net from train_net file: "
-              << param_.train_net();
-    ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
-  }
-  if (param_.has_net_param()) {
-    LOG(INFO) << "Creating training net specified in net_param.";
-    net_param.CopyFrom(param_.net_param());
-  }
-  if (param_.has_net()) {
-    LOG(INFO) << "Creating training net from net file: " << param_.net();
-    ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);
-  }
-  // Set the correct NetState.  We start with the solver defaults (lowest
-  // precedence); then, merge in any NetState specified by the net_param itself;
-  // finally, merge in any NetState specified by the train_state (highest
-  // precedence).
-  NetState net_state;
-  net_state.set_phase(TRAIN);
-  net_state.MergeFrom(net_param.state());
-  net_state.MergeFrom(param_.train_state());
-  net_param.mutable_state()->CopyFrom(net_state);
-  net_.reset(new Net<Dtype>(net_param));
+	const int num_train_nets = param_.has_net() + param_.has_net_param() +
+		param_.has_train_net() + param_.has_train_net_param();
+	const string& field_names = "net, net_param, train_net, train_net_param";
+	CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
+		<< "using one of these fields: " << field_names;
+	CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
+		<< "one of these fields specifying a train_net: " << field_names;
+	NetParameter net_param;
+	if (param_.has_train_net_param()) {
+		LOG(INFO) << "Creating training net specified in train_net_param.";
+		net_param.CopyFrom(param_.train_net_param());
+	} else if (param_.has_train_net()) {
+		LOG(INFO) << "Creating training net from train_net file: "
+			<< param_.train_net();
+		ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
+	}
+	if (param_.has_net_param()) {
+		LOG(INFO) << "Creating training net specified in net_param.";
+		net_param.CopyFrom(param_.net_param());
+	}
+	if (param_.has_net()) {
+		LOG(INFO) << "Creating training net from net file: " << param_.net();
+		ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);
+	}
+	// Set the correct NetState.  We start with the solver defaults (lowest
+	// precedence); then, merge in any NetState specified by the net_param itself;
+	// finally, merge in any NetState specified by the train_state (highest
+	// precedence).
+	NetState net_state;
+	net_state.set_phase(TRAIN);
+	net_state.MergeFrom(net_param.state());
+	net_state.MergeFrom(param_.train_state());
+	net_param.mutable_state()->CopyFrom(net_state);
+	net_.reset(new Net<Dtype>(net_param));
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::InitTestNets() {
-  const bool has_net_param = param_.has_net_param();
-  const bool has_net_file = param_.has_net();
-  const int num_generic_nets = has_net_param + has_net_file;
-  CHECK_LE(num_generic_nets, 1)
-      << "Both net_param and net_file may not be specified.";
-  const int num_test_net_params = param_.test_net_param_size();
-  const int num_test_net_files = param_.test_net_size();
-  const int num_test_nets = num_test_net_params + num_test_net_files;
-  if (num_generic_nets) {
-      CHECK_GE(param_.test_iter_size(), num_test_nets)
-          << "test_iter must be specified for each test network.";
-  } else {
-      CHECK_EQ(param_.test_iter_size(), num_test_nets)
-          << "test_iter must be specified for each test network.";
-  }
-  // If we have a generic net (specified by net or net_param, rather than
-  // test_net or test_net_param), we may have an unlimited number of actual
-  // test networks -- the actual number is given by the number of remaining
-  // test_iters after any test nets specified by test_net_param and/or test_net
-  // are evaluated.
-  const int num_generic_net_instances = param_.test_iter_size() - num_test_nets;
-  const int num_test_net_instances = num_test_nets + num_generic_net_instances;
-  if (param_.test_state_size()) {
-    CHECK_EQ(param_.test_state_size(), num_test_net_instances)
-        << "test_state must be unspecified or specified once per test net.";
-  }
-  if (num_test_net_instances) {
-    CHECK_GT(param_.test_interval(), 0);
-  }
-  int test_net_id = 0;
-  vector<string> sources(num_test_net_instances);
-  vector<NetParameter> net_params(num_test_net_instances);
-  for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) {
-      sources[test_net_id] = "test_net_param";
-      net_params[test_net_id].CopyFrom(param_.test_net_param(i));
-  }
-  for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) {
-      sources[test_net_id] = "test_net file: " + param_.test_net(i);
-      ReadNetParamsFromTextFileOrDie(param_.test_net(i),
-          &net_params[test_net_id]);
-  }
-  const int remaining_test_nets = param_.test_iter_size() - test_net_id;
-  if (has_net_param) {
-    for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
-      sources[test_net_id] = "net_param";
-      net_params[test_net_id].CopyFrom(param_.net_param());
-    }
-  }
-  if (has_net_file) {
-    for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
-      sources[test_net_id] = "net file: " + param_.net();
-      ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]);
-    }
-  }
-  test_nets_.resize(num_test_net_instances);
-  for (int i = 0; i < num_test_net_instances; ++i) {
-    // Set the correct NetState.  We start with the solver defaults (lowest
-    // precedence); then, merge in any NetState specified by the net_param
-    // itself; finally, merge in any NetState specified by the test_state
-    // (highest precedence).
-    NetState net_state;
-    net_state.set_phase(TEST);
-    net_state.MergeFrom(net_params[i].state());
-    if (param_.test_state_size()) {
-      net_state.MergeFrom(param_.test_state(i));
-    }
-    net_params[i].mutable_state()->CopyFrom(net_state);
-    LOG(INFO)
-        << "Creating test net (#" << i << ") specified by " << sources[i];
-    test_nets_[i].reset(new Net<Dtype>(net_params[i]));
-    test_nets_[i]->set_debug_info(param_.debug_info());
-  }
+	const bool has_net_param = param_.has_net_param();
+	const bool has_net_file = param_.has_net();
+	const int num_generic_nets = has_net_param + has_net_file;
+	CHECK_LE(num_generic_nets, 1)
+		<< "Both net_param and net_file may not be specified.";
+	const int num_test_net_params = param_.test_net_param_size();
+	const int num_test_net_files = param_.test_net_size();
+	const int num_test_nets = num_test_net_params + num_test_net_files;
+	if (num_generic_nets) {
+		CHECK_GE(param_.test_iter_size(), num_test_nets)
+			<< "test_iter must be specified for each test network.";
+	} else {
+		CHECK_EQ(param_.test_iter_size(), num_test_nets)
+			<< "test_iter must be specified for each test network.";
+	}
+	// If we have a generic net (specified by net or net_param, rather than
+	// test_net or test_net_param), we may have an unlimited number of actual
+	// test networks -- the actual number is given by the number of remaining
+	// test_iters after any test nets specified by test_net_param and/or test_net
+	// are evaluated.
+	const int num_generic_net_instances = param_.test_iter_size() - num_test_nets;
+	const int num_test_net_instances = num_test_nets + num_generic_net_instances;
+	if (param_.test_state_size()) {
+		CHECK_EQ(param_.test_state_size(), num_test_net_instances)
+			<< "test_state must be unspecified or specified once per test net.";
+	}
+	if (num_test_net_instances) {
+		CHECK_GT(param_.test_interval(), 0);
+	}
+	int test_net_id = 0;
+	vector < string > sources(num_test_net_instances);
+	vector < NetParameter > net_params(num_test_net_instances);
+	for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) {
+		sources[test_net_id] = "test_net_param";
+		net_params[test_net_id].CopyFrom(param_.test_net_param(i));
+	}
+	for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) {
+		sources[test_net_id] = "test_net file: " + param_.test_net(i);
+		ReadNetParamsFromTextFileOrDie(param_.test_net(i),
+			&net_params[test_net_id]);
+	}
+	const int remaining_test_nets = param_.test_iter_size() - test_net_id;
+	if (has_net_param) {
+		for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
+			sources[test_net_id] = "net_param";
+			net_params[test_net_id].CopyFrom(param_.net_param());
+		}
+	}
+	if (has_net_file) {
+		for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
+			sources[test_net_id] = "net file: " + param_.net();
+			ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]);
+		}
+	}
+	test_nets_.resize(num_test_net_instances);
+	for (int i = 0; i < num_test_net_instances; ++i) {
+		// Set the correct NetState.  We start with the solver defaults (lowest
+		// precedence); then, merge in any NetState specified by the net_param
+		// itself; finally, merge in any NetState specified by the test_state
+		// (highest precedence).
+		NetState net_state;
+		net_state.set_phase(TEST);
+		net_state.MergeFrom(net_params[i].state());
+		if (param_.test_state_size()) {
+			net_state.MergeFrom(param_.test_state(i));
+		}
+		net_params[i].mutable_state()->CopyFrom(net_state);
+		LOG(INFO)
+			<< "Creating test net (#" << i << ") specified by " << sources[i];
+		test_nets_[i].reset(new Net<Dtype>(net_params[i]));
+		test_nets_[i]->set_debug_info(param_.debug_info());
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::Step(int iters) {
-  vector<Blob<Dtype>*> bottom_vec;
-  const int start_iter = iter_;
-  const int stop_iter = iter_ + iters;
-  int average_loss = this->param_.average_loss();
-  vector<Dtype> losses;
-  Dtype smoothed_loss = 0;
-
-  while (iter_ < stop_iter) {
-    // zero-init the params
-    for (int i = 0; i < net_->params().size(); ++i) {
-      shared_ptr<Blob<Dtype> > blob = net_->params()[i];
-      switch (Caffe::mode()) {
-      case Caffe::CPU:
-        caffe_set(blob->count(), static_cast<Dtype>(0),
-            blob->mutable_cpu_diff());
-        break;
-      case Caffe::GPU:
-#ifndef CPU_ONLY
-        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
-            blob->mutable_gpu_diff());
+	vector<Blob<Dtype>*> bottom_vec;
+	const int start_iter = iter_;
+	const int stop_iter = iter_ + iters;
+	int average_loss = this->param_.average_loss();
+	vector < Dtype > losses;
+	Dtype smoothed_loss = 0;
+
+	while (iter_ < stop_iter) {
+		// zero-init the params
+		for (int i = 0; i < net_->params().size(); ++i) {
+			shared_ptr < Blob<Dtype> > blob = net_->params()[i];
+			switch (Caffe::mode()) {
+				case Caffe::CPU:
+					caffe_set(blob->count(), static_cast<Dtype>(0),
+						blob->mutable_cpu_diff());
+					break;
+				case Caffe::GPU:
+					#ifndef CPU_ONLY
+					caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+						blob->mutable_gpu_diff());
 #else
-        NO_GPU;
+					NO_GPU;
 #endif
-      case Caffe::APU:
-#ifndef CPU_ONLY
-        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
-            blob->mutable_gpu_diff());
+				case Caffe::APU:
+					#ifndef CPU_ONLY
+					caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+						blob->mutable_gpu_diff());
 #else
-        NO_GPU;
+					NO_GPU;
 #endif
-        break;
-      }
-    }
-
-    if (param_.test_interval() && iter_ % param_.test_interval() == 0
-        && (iter_ > 0 || param_.test_initialization())) {
-      TestAll();
-    }
-
-    const bool display = param_.display() && iter_ % param_.display() == 0;
-    net_->set_debug_info(display && param_.debug_info());
-    // accumulate the loss and gradient
-    Dtype loss = 0;
-    for (int i = 0; i < param_.iter_size(); ++i) {
-      loss += net_->ForwardBackward(bottom_vec);
-    }
-    loss /= param_.iter_size();
-    // average the loss across iterations for smoothed reporting
-    if (losses.size() < average_loss) {
-      losses.push_back(loss);
-      int size = losses.size();
-      smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
-    } else {
-      int idx = (iter_ - start_iter) % average_loss;
-      smoothed_loss += (loss - losses[idx]) / average_loss;
-      losses[idx] = loss;
-      printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss, losses[idx], idx);
-    }
-      printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n", smoothed_loss,average_loss, losses.size());
-    if (display) {
-      LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
-      const vector<Blob<Dtype>*>& result = net_->output_blobs();
-      int score_index = 0;
-      for (int j = 0; j < result.size(); ++j) {
-        const Dtype* result_vec = result[j]->cpu_data();
-        const string& output_name =
-            net_->blob_names()[net_->output_blob_indices()[j]];
-        const Dtype loss_weight =
-            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
-        for (int k = 0; k < result[j]->count(); ++k) {
-          ostringstream loss_msg_stream;
-          if (loss_weight) {
-            loss_msg_stream << " (* " << loss_weight
-                            << " = " << loss_weight * result_vec[k] << " loss)";
-          }
-          LOG(INFO) << "    Train net output #"
-              << score_index++ << ": " << output_name << " = "
-              << result_vec[k] << loss_msg_stream.str();
-        }
-      }
-    }
-    ApplyUpdate();
-
-    // Increment the internal iter_ counter -- its value should always indicate
-    // the number of times the weights have been updated.
-    ++iter_;
-
-    // Save a snapshot if needed.
-    if (param_.snapshot() && iter_ % param_.snapshot() == 0) {
-      Snapshot();
-    }
-  }
+					break;
+			}
+		}
+
+		if (param_.test_interval() && iter_ % param_.test_interval() == 0
+			&& (iter_ > 0 || param_.test_initialization())) {
+			TestAll();
+		}
+
+		const bool display = param_.display() && iter_ % param_.display() == 0;
+		net_->set_debug_info(display && param_.debug_info());
+		// accumulate the loss and gradient
+		Dtype loss = 0;
+		for (int i = 0; i < param_.iter_size(); ++i) {
+			loss += net_->ForwardBackward(bottom_vec);
+		}
+		loss /= param_.iter_size();
+		// average the loss across iterations for smoothed reporting
+		if (losses.size() < average_loss) {
+			losses.push_back(loss);
+			int size = losses.size();
+			smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
+		} else {
+			int idx = (iter_ - start_iter) % average_loss;
+			smoothed_loss += (loss - losses[idx]) / average_loss;
+			losses[idx] = loss;
+			printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss,
+				losses[idx], idx);
+		}
+		printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n",
+			smoothed_loss, average_loss, losses.size());
+		if (display) {
+			LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
+			const vector<Blob<Dtype>*>& result = net_->output_blobs();
+			int score_index = 0;
+			for (int j = 0; j < result.size(); ++j) {
+				const Dtype* result_vec = result[j]->cpu_data();
+				const string& output_name =
+					net_->blob_names()[net_->output_blob_indices()[j]];
+				const Dtype loss_weight =
+					net_->blob_loss_weights()[net_->output_blob_indices()[j]];
+				for (int k = 0; k < result[j]->count(); ++k) {
+					ostringstream loss_msg_stream;
+					if (loss_weight) {
+						loss_msg_stream << " (* " << loss_weight
+							<< " = " << loss_weight * result_vec[k] << " loss)";
+					}
+					LOG(INFO) << "    Train net output #"
+						<< score_index++ << ": " << output_name << " = "
+						<< result_vec[k] << loss_msg_stream.str();
+				}
+			}
+		}
+		ApplyUpdate();
+
+		// Increment the internal iter_ counter -- its value should always indicate
+		// the number of times the weights have been updated.
+		++iter_;
+
+		// Save a snapshot if needed.
+		if (param_.snapshot() && iter_ % param_.snapshot() == 0) {
+			Snapshot();
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::Solve(const char* resume_file) {
-  LOG(INFO) << "Solving " << net_->name();
-  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
-
-  if (resume_file) {
-    LOG(INFO) << "Restoring previous solver status from " << resume_file;
-    Restore(resume_file);
-  }
-
-  // For a network that is trained by the solver, no bottom or top vecs
-  // should be given, and we will just provide dummy vecs.
-  Step(param_.max_iter() - iter_);
-  // If we haven't already, save a snapshot after optimization, unless
-  // overridden by setting snapshot_after_train := false
-  if (param_.snapshot_after_train()
-      && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
-    Snapshot();
-  }
-  // After the optimization is done, run an additional train and test pass to
-  // display the train and test loss/outputs if appropriate (based on the
-  // display and test_interval settings, respectively).  Unlike in the rest of
-  // training, for the train net we only run a forward pass as we've already
-  // updated the parameters "max_iter" times -- this final pass is only done to
-  // display the loss, which is computed in the forward pass.
-  if (param_.display() && iter_ % param_.display() == 0) {
-    Dtype loss;
-    net_->ForwardPrefilled(&loss);
-    LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
-  }
-  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
-    TestAll();
-  }
-  LOG(INFO) << "Optimization Done.";
+	LOG(INFO) << "Solving " << net_->name();
+	LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
+
+	if (resume_file) {
+		LOG(INFO) << "Restoring previous solver status from " << resume_file;
+		Restore(resume_file);
+	}
+
+	// For a network that is trained by the solver, no bottom or top vecs
+	// should be given, and we will just provide dummy vecs.
+	Step(param_.max_iter() - iter_);
+	// If we haven't already, save a snapshot after optimization, unless
+	// overridden by setting snapshot_after_train := false
+	if (param_.snapshot_after_train()
+		&& (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
+		Snapshot();
+	}
+	// After the optimization is done, run an additional train and test pass to
+	// display the train and test loss/outputs if appropriate (based on the
+	// display and test_interval settings, respectively).  Unlike in the rest of
+	// training, for the train net we only run a forward pass as we've already
+	// updated the parameters "max_iter" times -- this final pass is only done to
+	// display the loss, which is computed in the forward pass.
+	if (param_.display() && iter_ % param_.display() == 0) {
+		Dtype loss;
+		net_->ForwardPrefilled(&loss);
+		LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
+	}
+	if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
+		TestAll();
+	}
+	LOG(INFO) << "Optimization Done.";
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::TestAll() {
-  for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) {
-    Test(test_net_id);
-  }
+	for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) {
+		Test(test_net_id);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::Test(const int test_net_id) {
-  LOG(INFO) << "Iteration " << iter_
-            << ", Testing net (#" << test_net_id << ")";
-  CHECK_NOTNULL(test_nets_[test_net_id].get())->
-      ShareTrainedLayersWith(net_.get());
-  vector<Dtype> test_score;
-  vector<int> test_score_output_id;
-  vector<Blob<Dtype>*> bottom_vec;
-  const shared_ptr<Net<Dtype> >& test_net = test_nets_[test_net_id];
-  Dtype loss = 0;
-  for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
-    Dtype iter_loss;
-    const vector<Blob<Dtype>*>& result =
-        test_net->Forward(bottom_vec, &iter_loss);
-    if (param_.test_compute_loss()) {
-      loss += iter_loss;
-    }
-    if (i == 0) {
-      for (int j = 0; j < result.size(); ++j) {
-        const Dtype* result_vec = result[j]->cpu_data();
-        for (int k = 0; k < result[j]->count(); ++k) {
-          test_score.push_back(result_vec[k]);
-          test_score_output_id.push_back(j);
-        }
-      }
-    } else {
-      int idx = 0;
-      for (int j = 0; j < result.size(); ++j) {
-        const Dtype* result_vec = result[j]->cpu_data();
-        for (int k = 0; k < result[j]->count(); ++k) {
-          test_score[idx++] += result_vec[k];
-        }
-      }
-    }
-  }
-  if (param_.test_compute_loss()) {
-    loss /= param_.test_iter(test_net_id);
-    LOG(INFO) << "Test loss: " << loss;
-  }
-  for (int i = 0; i < test_score.size(); ++i) {
-    const int output_blob_index =
-        test_net->output_blob_indices()[test_score_output_id[i]];
-    const string& output_name = test_net->blob_names()[output_blob_index];
-    const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
-    ostringstream loss_msg_stream;
-    const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
-    if (loss_weight) {
-      loss_msg_stream << " (* " << loss_weight
-                      << " = " << loss_weight * mean_score << " loss)";
-    }
-    LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
-        << mean_score << loss_msg_stream.str();
-  }
+	LOG(INFO) << "Iteration " << iter_
+		<< ", Testing net (#" << test_net_id << ")";
+	CHECK_NOTNULL(test_nets_[test_net_id].get())->
+		ShareTrainedLayersWith(net_.get());
+	vector < Dtype > test_score;
+	vector<int> test_score_output_id;
+	vector<Blob<Dtype>*> bottom_vec;
+	const shared_ptr<Net<Dtype> >& test_net = test_nets_[test_net_id];
+	Dtype loss = 0;
+	for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
+		Dtype iter_loss;
+		const vector<Blob<Dtype>*>& result =
+			test_net->Forward(bottom_vec, &iter_loss);
+		if (param_.test_compute_loss()) {
+			loss += iter_loss;
+		}
+		if (i == 0) {
+			for (int j = 0; j < result.size(); ++j) {
+				const Dtype* result_vec = result[j]->cpu_data();
+				for (int k = 0; k < result[j]->count(); ++k) {
+					test_score.push_back(result_vec[k]);
+					test_score_output_id.push_back(j);
+				}
+			}
+		} else {
+			int idx = 0;
+			for (int j = 0; j < result.size(); ++j) {
+				const Dtype* result_vec = result[j]->cpu_data();
+				for (int k = 0; k < result[j]->count(); ++k) {
+					test_score[idx++] += result_vec[k];
+				}
+			}
+		}
+	}
+	if (param_.test_compute_loss()) {
+		loss /= param_.test_iter(test_net_id);
+		LOG(INFO) << "Test loss: " << loss;
+	}
+	for (int i = 0; i < test_score.size(); ++i) {
+		const int output_blob_index =
+			test_net->output_blob_indices()[test_score_output_id[i]];
+		const string& output_name = test_net->blob_names()[output_blob_index];
+		const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
+		ostringstream loss_msg_stream;
+		const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
+		if (loss_weight) {
+			loss_msg_stream << " (* " << loss_weight
+				<< " = " << loss_weight * mean_score << " loss)";
+		}
+		LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
+			<< mean_score << loss_msg_stream.str();
+	}
 }
 
-
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::Snapshot() {
-  NetParameter net_param;
-  // For intermediate results, we will also dump the gradient values.
-  net_->ToProto(&net_param, param_.snapshot_diff());
-  string filename(param_.snapshot_prefix());
-  string model_filename, snapshot_filename;
-  const int kBufferSize = 20;
-  char iter_str_buffer[kBufferSize];
-  snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_);
-  filename += iter_str_buffer;
-  model_filename = filename + ".caffemodel";
-  LOG(INFO) << "Snapshotting to " << model_filename;
-  WriteProtoToBinaryFile(net_param, model_filename.c_str());
-  SolverState state;
-  SnapshotSolverState(&state);
-  state.set_iter(iter_);
-  state.set_learned_net(model_filename);
-  state.set_current_step(current_step_);
-  snapshot_filename = filename + ".solverstate";
-  LOG(INFO) << "Snapshotting solver state to " << snapshot_filename;
-  WriteProtoToBinaryFile(state, snapshot_filename.c_str());
+	NetParameter net_param;
+	// For intermediate results, we will also dump the gradient values.
+	net_->ToProto(&net_param, param_.snapshot_diff());
+	string filename(param_.snapshot_prefix());
+	string model_filename, snapshot_filename;
+	const int kBufferSize = 20;
+	char iter_str_buffer[kBufferSize];
+	snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_);
+	filename += iter_str_buffer;
+	model_filename = filename + ".caffemodel";
+	LOG(INFO) << "Snapshotting to " << model_filename;
+	WriteProtoToBinaryFile(net_param, model_filename.c_str());
+	SolverState state;
+	SnapshotSolverState(&state);
+	state.set_iter(iter_);
+	state.set_learned_net(model_filename);
+	state.set_current_step(current_step_);
+	snapshot_filename = filename + ".solverstate";
+	LOG(INFO) << "Snapshotting solver state to " << snapshot_filename;
+	WriteProtoToBinaryFile(state, snapshot_filename.c_str());
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void Solver<Dtype>::Restore(const char* state_file) {
-  SolverState state;
-  NetParameter net_param;
-  ReadProtoFromBinaryFile(state_file, &state);
-  if (state.has_learned_net()) {
-    ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param);
-    net_->CopyTrainedLayersFrom(net_param);
-  }
-  iter_ = state.iter();
-  current_step_ = state.current_step();
-  RestoreSolverState(state);
+	SolverState state;
+	NetParameter net_param;
+	ReadProtoFromBinaryFile(state_file, &state);
+	if (state.has_learned_net()) {
+		ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param);
+		net_->CopyTrainedLayersFrom(net_param);
+	}
+	iter_ = state.iter();
+	current_step_ = state.current_step();
+	RestoreSolverState(state);
 }
 
-
 // Return the current learning rate. The currently implemented learning rate
 // policies are as follows:
 //    - fixed: always return base_lr.
@@ -420,385 +419,389 @@ void Solver<Dtype>::Restore(const char* state_file) {
 //
 // where base_lr, max_iter, gamma, step, stepvalue and power are defined
 // in the solver parameter protocol buffer, and iter is the current iteration.
-template <typename Dtype>
+template<typename Dtype>
 Dtype SGDSolver<Dtype>::GetLearningRate() {
-  Dtype rate;
-  const string& lr_policy = this->param_.lr_policy();
-  if (lr_policy == "fixed") {
-    rate = this->param_.base_lr();
-  } else if (lr_policy == "step") {
-    this->current_step_ = this->iter_ / this->param_.stepsize();
-    rate = this->param_.base_lr() *
-        pow(this->param_.gamma(), this->current_step_);
-  } else if (lr_policy == "exp") {
-    rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
-  } else if (lr_policy == "inv") {
-    rate = this->param_.base_lr() *
-        pow(Dtype(1) + this->param_.gamma() * this->iter_,
-            - this->param_.power());
-  } else if (lr_policy == "multistep") {
-    if (this->current_step_ < this->param_.stepvalue_size() &&
-          this->iter_ >= this->param_.stepvalue(this->current_step_)) {
-      this->current_step_++;
-      LOG(INFO) << "MultiStep Status: Iteration " <<
-      this->iter_ << ", step = " << this->current_step_;
-    }
-    rate = this->param_.base_lr() *
-        pow(this->param_.gamma(), this->current_step_);
-  } else if (lr_policy == "poly") {
-    rate = this->param_.base_lr() * pow(Dtype(1.) -
-        (Dtype(this->iter_) / Dtype(this->param_.max_iter())),
-        this->param_.power());
-  } else if (lr_policy == "sigmoid") {
-    rate = this->param_.base_lr() * (Dtype(1.) /
-        (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) -
-          Dtype(this->param_.stepsize())))));
-  } else {
-    LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
-  }
-  return rate;
+	Dtype rate;
+	const string& lr_policy = this->param_.lr_policy();
+	if (lr_policy == "fixed") {
+		rate = this->param_.base_lr();
+	} else if (lr_policy == "step") {
+		this->current_step_ = this->iter_ / this->param_.stepsize();
+		rate = this->param_.base_lr() *
+			pow(this->param_.gamma(), this->current_step_);
+	} else if (lr_policy == "exp") {
+		rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
+	} else if (lr_policy == "inv") {
+		rate = this->param_.base_lr() *
+			pow(Dtype(1) + this->param_.gamma() * this->iter_,
+				-this->param_.power());
+	} else if (lr_policy == "multistep") {
+		if (this->current_step_ < this->param_.stepvalue_size() &&
+			this->iter_ >= this->param_.stepvalue(this->current_step_)) {
+			this->current_step_++;
+			LOG(INFO) << "MultiStep Status: Iteration " <<
+				this->iter_ << ", step = " << this->current_step_;
+		}
+		rate = this->param_.base_lr() *
+			pow(this->param_.gamma(), this->current_step_);
+	} else if (lr_policy == "poly") {
+		rate = this->param_.base_lr() * pow(Dtype(1.) -
+			(Dtype(this->iter_) / Dtype(this->param_.max_iter())),
+			this->param_.power());
+	} else if (lr_policy == "sigmoid") {
+		rate = this->param_.base_lr() * (Dtype(1.) /
+			(Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) -
+				Dtype(this->param_.stepsize())))));
+	} else {
+		LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
+	}
+	return rate;
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SGDSolver<Dtype>::PreSolve() {
-  // Initialize the history
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  history_.clear();
-  update_.clear();
-  temp_.clear();
-  for (int i = 0; i < net_params.size(); ++i) {
-    const vector<int>& shape = net_params[i]->shape();
-    history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-    update_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-    temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-  }
+	// Initialize the history
+	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+	history_.clear();
+	update_.clear();
+	temp_.clear();
+	for (int i = 0; i < net_params.size(); ++i) {
+		const vector<int>& shape = net_params[i]->shape();
+		history_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+		update_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+		temp_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SGDSolver<Dtype>::ClipGradients() {
-  const Dtype clip_gradients = this->param_.clip_gradients();
-  if (clip_gradients < 0) { return; }
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  Dtype sumsq_diff = 0;
-  for (int i = 0; i < net_params.size(); ++i) {
-    if (this->net_->param_owners()[i] < 0) {
-      sumsq_diff += net_params[i]->sumsq_diff();
-    }
-  }
-  const Dtype l2norm_diff = std::sqrt(sumsq_diff);
-  if (l2norm_diff > clip_gradients) {
-    Dtype scale_factor = clip_gradients / l2norm_diff;
-    LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
-        << l2norm_diff << " > " << clip_gradients << ") "
-        << "by scale factor " << scale_factor;
-    for (int i = 0; i < net_params.size(); ++i) {
-      if (this->net_->param_owners()[i] < 0) {
-        net_params[i]->scale_diff(scale_factor);
-      }
-    }
-  }
+	const Dtype clip_gradients = this->param_.clip_gradients();
+	if (clip_gradients < 0) {
+		return;
+	}
+	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+	Dtype sumsq_diff = 0;
+	for (int i = 0; i < net_params.size(); ++i) {
+		if (this->net_->param_owners()[i] < 0) {
+			sumsq_diff += net_params[i]->sumsq_diff();
+		}
+	}
+	const Dtype l2norm_diff = std::sqrt(sumsq_diff);
+	if (l2norm_diff > clip_gradients) {
+		Dtype scale_factor = clip_gradients / l2norm_diff;
+		LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
+			<< l2norm_diff << " > " << clip_gradients << ") "
+			<< "by scale factor " << scale_factor;
+		for (int i = 0; i < net_params.size(); ++i) {
+			if (this->net_->param_owners()[i] < 0) {
+				net_params[i]->scale_diff(scale_factor);
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SGDSolver<Dtype>::ApplyUpdate() {
-  Dtype rate = GetLearningRate();
-  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
-    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
-  }
-  ClipGradients();
-  for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
-    Normalize(param_id);
-    Regularize(param_id);
-    ComputeUpdateValue(param_id, rate);
-  }
-  this->net_->Update();
+	Dtype rate = GetLearningRate();
+	if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
+		LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
+	}
+	ClipGradients();
+	for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
+		Normalize(param_id);
+		Regularize(param_id);
+		ComputeUpdateValue(param_id, rate);
+	}
+	this->net_->Update();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SGDSolver<Dtype>::Normalize(int param_id) {
-  if (this->param_.iter_size() == 1) { return; }
-  // Scale gradient to counterbalance accumulation.
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    caffe_scal(net_params[param_id]->count(), accum_normalization,
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
+	if (this->param_.iter_size() == 1) {
+		return;
+	}
+	// Scale gradient to counterbalance accumulation.
+	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+	const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
+	switch (Caffe::mode()) {
+		case Caffe::CPU: {
+			caffe_scal(net_params[param_id]->count(), accum_normalization,
+				net_params[param_id]->mutable_cpu_diff());
+			break;
+		}
+		case Caffe::GPU: {
 #ifndef CPU_ONLY
-    caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
-        net_params[param_id]->mutable_gpu_diff());
+			caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
+				net_params[param_id]->mutable_gpu_diff());
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
+			break;
+		}
+		default:
+			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SGDSolver<Dtype>::Regularize(int param_id) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  const vector<float>& net_params_weight_decay =
-      this->net_->params_weight_decay();
-  Dtype weight_decay = this->param_.weight_decay();
-  string regularization_type = this->param_.regularization_type();
-  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
- 
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    if (local_decay) {
-      if (regularization_type == "L2") {
-        // add weight decay
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
-            net_params[param_id]->cpu_data(),
-            net_params[param_id]->mutable_cpu_diff());
-      } else if (regularization_type == "L1") {
-        caffe_cpu_sign(net_params[param_id]->count(),
-            net_params[param_id]->cpu_data(),
-            temp_[param_id]->mutable_cpu_data());
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
-            temp_[param_id]->cpu_data(),
-            net_params[param_id]->mutable_cpu_diff());
-      } else {
-        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-      }
-    }
-    break;
-  }
-  case Caffe::GPU: {
+	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+	const vector<float>& net_params_weight_decay =
+		this->net_->params_weight_decay();
+	Dtype weight_decay = this->param_.weight_decay();
+	string regularization_type = this->param_.regularization_type();
+	Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+	switch (Caffe::mode()) {
+		case Caffe::CPU: {
+			if (local_decay) {
+				if (regularization_type == "L2") {
+					// add weight decay
+					caffe_axpy(net_params[param_id]->count(),
+						local_decay,
+						net_params[param_id]->cpu_data(),
+						net_params[param_id]->mutable_cpu_diff());
+				} else if (regularization_type == "L1") {
+					caffe_cpu_sign(net_params[param_id]->count(),
+						net_params[param_id]->cpu_data(),
+						temp_[param_id]->mutable_cpu_data());
+					caffe_axpy(net_params[param_id]->count(),
+						local_decay,
+						temp_[param_id]->cpu_data(),
+						net_params[param_id]->mutable_cpu_diff());
+				} else {
+					LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+				}
+			}
+			break;
+		}
+		case Caffe::GPU: {
 #ifndef CPU_ONLY
-    if (local_decay) {
-      if (regularization_type == "L2") {
-        // add weight decay
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
-            net_params[param_id]->gpu_data(),
-            net_params[param_id]->mutable_gpu_diff());
-      } else if (regularization_type == "L1") {
-        caffe_gpu_sign(net_params[param_id]->count(),
-            net_params[param_id]->gpu_data(),
-            temp_[param_id]->mutable_gpu_data());
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
-            temp_[param_id]->gpu_data(),
-            net_params[param_id]->mutable_gpu_diff());
-      } else {
-        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-      }
-    }
+			if (local_decay) {
+				if (regularization_type == "L2") {
+					// add weight decay
+					caffe_gpu_axpy(net_params[param_id]->count(),
+						local_decay,
+						net_params[param_id]->gpu_data(),
+						net_params[param_id]->mutable_gpu_diff());
+				} else if (regularization_type == "L1") {
+					caffe_gpu_sign(net_params[param_id]->count(),
+						net_params[param_id]->gpu_data(),
+						temp_[param_id]->mutable_gpu_data());
+					caffe_gpu_axpy(net_params[param_id]->count(),
+						local_decay,
+						temp_[param_id]->gpu_data(),
+						net_params[param_id]->mutable_gpu_diff());
+				} else {
+					LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+				}
+			}
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
+			break;
+		}
+		default:
+			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype momentum = this->param_.momentum();
-  Dtype local_rate = rate * net_params_lr[param_id];
-  // Compute the update to history, then copy it to the parameter diff.
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->cpu_diff(), momentum,
-              history_[param_id]->mutable_cpu_data());
-    caffe_copy(net_params[param_id]->count(),
-        history_[param_id]->cpu_data(),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
+	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+	const vector<float>& net_params_lr = this->net_->params_lr();
+	Dtype momentum = this->param_.momentum();
+	Dtype local_rate = rate * net_params_lr[param_id];
+	// Compute the update to history, then copy it to the parameter diff.
+	switch (Caffe::mode()) {
+		case Caffe::CPU: {
+			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+				net_params[param_id]->cpu_diff(), momentum,
+				history_[param_id]->mutable_cpu_data());
+			caffe_copy(net_params[param_id]->count(),
+				history_[param_id]->cpu_data(),
+				net_params[param_id]->mutable_cpu_diff());
+			break;
+		}
+		case Caffe::GPU: {
 #ifndef CPU_ONLY
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->gpu_diff(), momentum,
-              history_[param_id]->mutable_gpu_data());
-    caffe_gpu_copy(net_params[param_id]->count(),
-        history_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
+			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+				net_params[param_id]->gpu_diff(), momentum,
+				history_[param_id]->mutable_gpu_data());
+			caffe_gpu_copy(net_params[param_id]->count(),
+				history_[param_id]->gpu_data(),
+				net_params[param_id]->mutable_gpu_diff());
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
+			break;
+		}
+		default:
+			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SGDSolver<Dtype>::SnapshotSolverState(SolverState* state) {
-  state->clear_history();
-  for (int i = 0; i < history_.size(); ++i) {
-    // Add history
-    BlobProto* history_blob = state->add_history();
-    history_[i]->ToProto(history_blob);
-  }
+	state->clear_history();
+	for (int i = 0; i < history_.size(); ++i) {
+		// Add history
+		BlobProto* history_blob = state->add_history();
+		history_[i]->ToProto(history_blob);
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void SGDSolver<Dtype>::RestoreSolverState(const SolverState& state) {
-  CHECK_EQ(state.history_size(), history_.size())
-      << "Incorrect length of history blobs.";
-  LOG(INFO) << "SGDSolver: restoring history";
-  for (int i = 0; i < history_.size(); ++i) {
-    history_[i]->FromProto(state.history(i));
-  }
+	CHECK_EQ(state.history_size(), history_.size())
+		<< "Incorrect length of history blobs.";
+	LOG(INFO) << "SGDSolver: restoring history";
+	for (int i = 0; i < history_.size(); ++i) {
+		history_[i]->FromProto(state.history(i));
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype momentum = this->param_.momentum();
-  Dtype local_rate = rate * net_params_lr[param_id];
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
-        this->history_[param_id]->cpu_data(),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // update history
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->cpu_diff(), momentum,
-              this->history_[param_id]->mutable_cpu_data());
-
-    // compute update: step back then over step
-    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-        this->history_[param_id]->cpu_data(), -momentum,
-        this->update_[param_id]->mutable_cpu_data());
-
-    // copy
-    caffe_copy(net_params[param_id]->count(),
-        this->update_[param_id]->cpu_data(),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
+	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+	const vector<float>& net_params_lr = this->net_->params_lr();
+	Dtype momentum = this->param_.momentum();
+	Dtype local_rate = rate * net_params_lr[param_id];
+	switch (Caffe::mode()) {
+		case Caffe::CPU: {
+			// save history momentum for stepping back
+			caffe_copy(net_params[param_id]->count(),
+				this->history_[param_id]->cpu_data(),
+				this->update_[param_id]->mutable_cpu_data());
+
+			// update history
+			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+				net_params[param_id]->cpu_diff(), momentum,
+				this->history_[param_id]->mutable_cpu_data());
+
+			// compute update: step back then over step
+			caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
+				this->history_[param_id]->cpu_data(), -momentum,
+				this->update_[param_id]->mutable_cpu_data());
+
+			// copy
+			caffe_copy(net_params[param_id]->count(),
+				this->update_[param_id]->cpu_data(),
+				net_params[param_id]->mutable_cpu_diff());
+			break;
+		}
+		case Caffe::GPU: {
 #ifndef CPU_ONLY
-    // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
-        this->history_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->gpu_diff(), momentum,
-              this->history_[param_id]->mutable_gpu_data());
-
-    // compute update: step back then over step
-    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-        this->history_[param_id]->gpu_data(), -momentum,
-        this->update_[param_id]->mutable_gpu_data());
-
-    // copy
-    caffe_gpu_copy(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
+			// save history momentum for stepping back
+			caffe_copy(net_params[param_id]->count(),
+				this->history_[param_id]->gpu_data(),
+				this->update_[param_id]->mutable_gpu_data());
+
+			// update history
+			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+				net_params[param_id]->gpu_diff(), momentum,
+				this->history_[param_id]->mutable_gpu_data());
+
+			// compute update: step back then over step
+			caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
+				this->history_[param_id]->gpu_data(), -momentum,
+				this->update_[param_id]->mutable_gpu_data());
+
+			// copy
+			caffe_gpu_copy(net_params[param_id]->count(),
+				this->update_[param_id]->gpu_data(),
+				net_params[param_id]->mutable_gpu_diff());
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
+			break;
+		}
+		default:
+			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  const vector<float>& net_params_lr = this->net_->params_lr();
-  Dtype delta = this->param_.delta();
-  Dtype local_rate = rate * net_params_lr[param_id];
-  switch (Caffe::mode()) {
-  case Caffe::CPU: {
-    // compute square of gradient in update
-    caffe_powx(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_cpu_data());
-
-    // update history
-    caffe_add(net_params[param_id]->count(),
-        this->update_[param_id]->cpu_data(),
-        this->history_[param_id]->cpu_data(),
-        this->history_[param_id]->mutable_cpu_data());
-
-    // prepare update
-    caffe_powx(net_params[param_id]->count(),
-              this->history_[param_id]->cpu_data(), Dtype(0.5),
-              this->update_[param_id]->mutable_cpu_data());
-
-    caffe_add_scalar(net_params[param_id]->count(),
-              delta, this->update_[param_id]->mutable_cpu_data());
-
-    caffe_div(net_params[param_id]->count(),
-              net_params[param_id]->cpu_diff(),
-              this->update_[param_id]->cpu_data(),
-              this->update_[param_id]->mutable_cpu_data());
-
-    // scale and copy
-    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->cpu_data(), Dtype(0),
-        net_params[param_id]->mutable_cpu_diff());
-    break;
-  }
-  case Caffe::GPU: {
+	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+	const vector<float>& net_params_lr = this->net_->params_lr();
+	Dtype delta = this->param_.delta();
+	Dtype local_rate = rate * net_params_lr[param_id];
+	switch (Caffe::mode()) {
+		case Caffe::CPU: {
+			// compute square of gradient in update
+			caffe_powx(net_params[param_id]->count(),
+				net_params[param_id]->cpu_diff(), Dtype(2),
+				this->update_[param_id]->mutable_cpu_data());
+
+			// update history
+			caffe_add(net_params[param_id]->count(),
+				this->update_[param_id]->cpu_data(),
+				this->history_[param_id]->cpu_data(),
+				this->history_[param_id]->mutable_cpu_data());
+
+			// prepare update
+			caffe_powx(net_params[param_id]->count(),
+				this->history_[param_id]->cpu_data(), Dtype(0.5),
+				this->update_[param_id]->mutable_cpu_data());
+
+			caffe_add_scalar(net_params[param_id]->count(),
+				delta, this->update_[param_id]->mutable_cpu_data());
+
+			caffe_div(net_params[param_id]->count(),
+				net_params[param_id]->cpu_diff(),
+				this->update_[param_id]->cpu_data(),
+				this->update_[param_id]->mutable_cpu_data());
+
+			// scale and copy
+			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+				this->update_[param_id]->cpu_data(), Dtype(0),
+				net_params[param_id]->mutable_cpu_diff());
+			break;
+		}
+		case Caffe::GPU: {
 #ifndef CPU_ONLY
-    // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_add(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        this->history_[param_id]->gpu_data(),
-        this->history_[param_id]->mutable_gpu_data());
-
-    // prepare update
-    caffe_gpu_powx( net_params[param_id]->count(),
-              this->history_[param_id]->gpu_data(), Dtype(0.5),
-              this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add_scalar<Dtype>(net_params[param_id]->count(),
-             delta, this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_div(net_params[param_id]->count(),
-              net_params[param_id]->gpu_diff(),
-              this->update_[param_id]->gpu_data(),
-              this->update_[param_id]->mutable_gpu_data());
-
-    // scale and copy
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->gpu_data(), Dtype(0),
-        net_params[param_id]->mutable_gpu_diff());
+			// compute square of gradient in update
+			caffe_gpu_powx(net_params[param_id]->count(),
+				net_params[param_id]->gpu_diff(), Dtype(2),
+				this->update_[param_id]->mutable_gpu_data());
+
+			// update history
+			caffe_gpu_add(net_params[param_id]->count(),
+				this->update_[param_id]->gpu_data(),
+				this->history_[param_id]->gpu_data(),
+				this->history_[param_id]->mutable_gpu_data());
+
+			// prepare update
+			caffe_gpu_powx(net_params[param_id]->count(),
+				this->history_[param_id]->gpu_data(), Dtype(0.5),
+				this->update_[param_id]->mutable_gpu_data());
+
+			caffe_gpu_add_scalar < Dtype > (net_params[param_id]->count(),
+				delta, this->update_[param_id]->mutable_gpu_data());
+
+			caffe_gpu_div(net_params[param_id]->count(),
+				net_params[param_id]->gpu_diff(),
+				this->update_[param_id]->gpu_data(),
+				this->update_[param_id]->mutable_gpu_data());
+
+			// scale and copy
+			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+				this->update_[param_id]->gpu_data(), Dtype(0),
+				net_params[param_id]->mutable_gpu_diff());
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
-    break;
-  }
-  default:
-    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-  }
+			break;
+		}
+		default:
+			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+	}
 }
 
-INSTANTIATE_CLASS(Solver);
-INSTANTIATE_CLASS(SGDSolver);
-INSTANTIATE_CLASS(NesterovSolver);
-INSTANTIATE_CLASS(AdaGradSolver);
+INSTANTIATE_CLASS (Solver);
+INSTANTIATE_CLASS (SGDSolver);
+INSTANTIATE_CLASS (NesterovSolver);
+INSTANTIATE_CLASS (AdaGradSolver);
 
 }  // namespace caffe
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 8cf9bc7b..94d62e0e 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -36,143 +36,156 @@
 namespace caffe {
 
 SyncedMemory::~SyncedMemory() {
-if (cpu_ptr_ && own_cpu_data_) {
-    OCL_CHECK( clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, cpu_ptr_, 0, NULL, NULL) );
-    clFinish(amdDevice.CommandQueue);
-  }
-  if(gpu_cache_ptr_ && own_cpu_data_)  {
-    OCL_CHECK( clReleaseMemObject((cl_mem)gpu_cache_ptr_) );
-  }
-  if (gpu_ptr_) {
-    OCL_CHECK( clReleaseMemObject((cl_mem)gpu_ptr_) );
-  }
-
-  clReleaseKernel(oclmem_kernel);
-}	
+	if (cpu_ptr_ && own_cpu_data_) {
+		OCL_CHECK(
+			clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+				cpu_ptr_, 0, NULL, NULL));
+		clFinish(amdDevice.CommandQueue);
+	}
+	if (gpu_cache_ptr_ && own_cpu_data_) {
+		OCL_CHECK(clReleaseMemObject((cl_mem) gpu_cache_ptr_));
+	}
+	if (gpu_ptr_) {
+		OCL_CHECK(clReleaseMemObject((cl_mem) gpu_ptr_));
+	}
+
+	clReleaseKernel (oclmem_kernel);
+}
 
 void SyncedMemory::ocl_setup() {
-  cl_int err=0;
-  oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
-  OCL_CHECK(err);
+	cl_int err = 0;
+	oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
+	OCL_CHECK(err);
 }
 
 inline void SyncedMemory::to_cpu() {
-switch (head_) {
-  case UNINITIALIZED:
-    gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
-    //}
-    cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL);
-    memset(cpu_ptr_, 0, size_);
-    head_ = HEAD_AT_CPU;
-    own_cpu_data_ = true;
-    break;
-  case HEAD_AT_GPU:{
+	switch (head_) {
+		case UNINITIALIZED:
+			gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+				size_, NULL, NULL);
+			//}
+			cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
+				(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_,
+				0, NULL, NULL, NULL);
+			memset(cpu_ptr_, 0, size_);
+			head_ = HEAD_AT_CPU;
+			own_cpu_data_ = true;
+			break;
+		case HEAD_AT_GPU: {
 #ifndef CPU_ONLY
-    if (cpu_ptr_ == NULL) {
-      gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
-      cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_, 0, NULL, NULL, NULL);
-      own_cpu_data_ = true;
-    }
-    OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)gpu_ptr_, (cl_mem)gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
-    clFinish(amdDevice.CommandQueue);
-    head_ = SYNCED;
+			if (cpu_ptr_ == NULL) {
+				gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context,
+					CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
+				cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
+					(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+					size_, 0, NULL, NULL, NULL);
+				own_cpu_data_ = true;
+			}
+			OCL_CHECK(
+				clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_,
+					(cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
+			clFinish(amdDevice.CommandQueue);
+			head_ = SYNCED;
 #else
-    NO_GPU;
+			NO_GPU;
 #endif
 #ifdef Track_data_transfer
-    LOG(WARNING) << "sync: data from GPU to CPU";
+			LOG(WARNING) << "sync: data from GPU to CPU";
 #endif
-    break;
-  }
-  case HEAD_AT_CPU:
-  case SYNCED:
-    break;
-  }
+			break;
+		}
+		case HEAD_AT_CPU:
+			case SYNCED:
+			break;
+	}
 }
 
 inline void SyncedMemory::to_gpu() {
 #ifndef CPU_ONLY
-switch (head_) {
-  case UNINITIALIZED:{
-    cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, NULL, NULL);
-    if(NULL == tmpMem){
-      fprintf(stderr,"Failed to create memory object\n");
-      break;
-    }
-    ocl_memset(oclmem_kernel, tmpMem, (int)0, (int)(size_/sizeof(int)));
-    gpu_ptr_ = (void*)tmpMem;
-    head_ = HEAD_AT_GPU;
-    break;
-  }
-  case HEAD_AT_CPU:{
-    if (gpu_ptr_ == NULL) {
-      cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_, NULL, NULL);
-      if(NULL == tmpMem){
-        fprintf(stderr,"Failed to create memory object\n");
-      }
-      gpu_ptr_ = (void*)tmpMem;
-    }
-    OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)gpu_cache_ptr_, (cl_mem)gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
-    clFinish(amdDevice.CommandQueue);
-    head_ = SYNCED;
+	switch (head_) {
+		case UNINITIALIZED: {
+			cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+				size_, NULL, NULL);
+			if (NULL == tmpMem) {
+				fprintf(stderr, "Failed to create memory object\n");
+				break;
+			}
+			ocl_memset(oclmem_kernel, tmpMem, (int) 0, (int) (size_ / sizeof(int)));
+			gpu_ptr_ = (void*) tmpMem;
+			head_ = HEAD_AT_GPU;
+			break;
+		}
+		case HEAD_AT_CPU: {
+			if (gpu_ptr_ == NULL) {
+				cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+					size_, NULL, NULL);
+				if (NULL == tmpMem) {
+					fprintf(stderr, "Failed to create memory object\n");
+				}
+				gpu_ptr_ = (void*) tmpMem;
+			}
+			OCL_CHECK(
+				clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+					(cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
+			clFinish(amdDevice.CommandQueue);
+			head_ = SYNCED;
 #ifdef Track_data_transfer
-    LOG(WARNING) << "sync: data from CPU to GPU";
+			LOG(WARNING) << "sync: data from CPU to GPU";
 #endif
-    break;
-  }
-  case HEAD_AT_GPU:
-  case SYNCED:
-    break;
-  }
+			break;
+		}
+		case HEAD_AT_GPU:
+			case SYNCED:
+			break;
+	}
 #else
-  NO_GPU;
+	NO_GPU;
 #endif
 }
 
 const void* SyncedMemory::cpu_data() {
-  to_cpu();
-  return (const void*)cpu_ptr_;
+	to_cpu();
+	return (const void*) cpu_ptr_;
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
-  CHECK(data);
-  if (own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
-  }
-  cpu_ptr_ = data;
-  head_ = HEAD_AT_CPU;
-  own_cpu_data_ = false;
+	CHECK(data);
+	if (own_cpu_data_) {
+		CaffeFreeHost (cpu_ptr_);
+	}
+	cpu_ptr_ = data;
+	head_ = HEAD_AT_CPU;
+	own_cpu_data_ = false;
 }
 
 const void* SyncedMemory::gpu_data() {
 #ifndef CPU_ONLY
-  to_gpu();
-  return (const void*)gpu_ptr_;
+	to_gpu();
+	return (const void*) gpu_ptr_;
 #else
-  NO_GPU;
+	NO_GPU;
 #endif
 }
 
 void* SyncedMemory::mutable_cpu_data() {
-  to_cpu();
-  head_ = HEAD_AT_CPU;
-  return cpu_ptr_;
+	to_cpu();
+	head_ = HEAD_AT_CPU;
+	return cpu_ptr_;
 }
 
 void* SyncedMemory::mutable_gpu_data() {
 #ifndef CPU_ONLY
-  to_gpu();
-  head_ = HEAD_AT_GPU;
-  return gpu_ptr_;
+	to_gpu();
+	head_ = HEAD_AT_GPU;
+	return gpu_ptr_;
 #else
-  NO_GPU;
+	NO_GPU;
 #endif
 }
 
 const void *SyncedMemory::gpu_cache_data() {
-  return 0;
+	return 0;
 }
 
-
 }  // namespace caffe
 
diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp
index 0383fd27..7d0a85aa 100644
--- a/src/caffe/util/benchmark.cpp
+++ b/src/caffe/util/benchmark.cpp
@@ -6,114 +6,113 @@
 namespace caffe {
 
 Timer::Timer()
-    : initted_(false),
-      running_(false),
-      has_run_at_least_once_(false) {
-  Init();
+	: initted_(false),
+		running_(false),
+		has_run_at_least_once_(false) {
+	Init();
 }
 
 Timer::~Timer() {
 }
 
 void Timer::Start() {
-  if (!running()) {
-    start_cpu_ = boost::posix_time::microsec_clock::local_time();
-    running_ = true;
-    has_run_at_least_once_ = true;
-  }
+	if (!running()) {
+		start_cpu_ = boost::posix_time::microsec_clock::local_time();
+		running_ = true;
+		has_run_at_least_once_ = true;
+	}
 }
 
 void Timer::Stop() {
-  if (running()) {
-    stop_cpu_ = boost::posix_time::microsec_clock::local_time();
-    running_ = false;
-  }
+	if (running()) {
+		stop_cpu_ = boost::posix_time::microsec_clock::local_time();
+		running_ = false;
+	}
 }
 
-
 float Timer::MicroSeconds() {
-  if (!has_run_at_least_once()) {
-    LOG(WARNING) << "Timer has never been run before reading time.";
-    return 0;
-  }
-  if (running()) {
-    Stop();
-  }
-  
-  elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
-  return elapsed_microseconds_;
+	if (!has_run_at_least_once()) {
+		LOG(WARNING) << "Timer has never been run before reading time.";
+		return 0;
+	}
+	if (running()) {
+		Stop();
+	}
+
+	elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
+	return elapsed_microseconds_;
 }
 
 float Timer::MilliSeconds() {
-  if (!has_run_at_least_once()) {
-    LOG(WARNING) << "Timer has never been run before reading time.";
-    return 0;
-  }
-  if (running()) {
-    Stop();
-  }
- 
-  elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
-  return elapsed_milliseconds_;
+	if (!has_run_at_least_once()) {
+		LOG(WARNING) << "Timer has never been run before reading time.";
+		return 0;
+	}
+	if (running()) {
+		Stop();
+	}
+
+	elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
+	return elapsed_milliseconds_;
 }
 
 float Timer::Seconds() {
-  return MilliSeconds() / 1000.;
+	return MilliSeconds() / 1000.;
 }
 
 void Timer::Init() {
-  if (!initted()) {
-    if (Caffe::mode() == Caffe::GPU) {
-    }
-    initted_ = true;
-  }
+	if (!initted()) {
+		if (Caffe::mode() == Caffe::GPU) {
+		}
+		initted_ = true;
+	}
 }
 
 CPUTimer::CPUTimer() {
-  this->initted_ = true;
-  this->running_ = false;
-  this->has_run_at_least_once_ = false;
+	this->initted_ = true;
+	this->running_ = false;
+	this->has_run_at_least_once_ = false;
 }
 
 void CPUTimer::Start() {
-  if (!running()) {
-    this->start_cpu_ = boost::posix_time::microsec_clock::local_time();
-    this->running_ = true;
-    this->has_run_at_least_once_ = true;
-  }
+	if (!running()) {
+		this->start_cpu_ = boost::posix_time::microsec_clock::local_time();
+		this->running_ = true;
+		this->has_run_at_least_once_ = true;
+	}
 }
 
 void CPUTimer::Stop() {
-  if (running()) {
-    this->stop_cpu_ = boost::posix_time::microsec_clock::local_time();
-    this->running_ = false;
-  }
+	if (running()) {
+		this->stop_cpu_ = boost::posix_time::microsec_clock::local_time();
+		this->running_ = false;
+	}
 }
 
 float CPUTimer::MilliSeconds() {
-  if (!has_run_at_least_once()) {
-    LOG(WARNING) << "Timer has never been run before reading time.";
-    return 0;
-  }
-  if (running()) {
-    Stop();
-  }
-  this->elapsed_milliseconds_ = (this->stop_cpu_ -
-                                this->start_cpu_).total_milliseconds();
-  return this->elapsed_milliseconds_;
+	if (!has_run_at_least_once()) {
+		LOG(WARNING) << "Timer has never been run before reading time.";
+		return 0;
+	}
+	if (running()) {
+		Stop();
+	}
+	this->elapsed_milliseconds_ = (this->stop_cpu_ -
+		this->start_cpu_).total_milliseconds();
+	return this->elapsed_milliseconds_;
 }
 
 float CPUTimer::MicroSeconds() {
-  if (!has_run_at_least_once()) {
-    LOG(WARNING) << "Timer has never been run before reading time.";
-    return 0;
-  }
-  if (running()) {
-    Stop();
-  }
-  this->elapsed_microseconds_ = (this->stop_cpu_ -
-                                this->start_cpu_).total_microseconds();
-  return this->elapsed_microseconds_;
+	if (!has_run_at_least_once()) {
+		LOG(WARNING) << "Timer has never been run before reading time.";
+		return 0;
+	}
+	if (running()) {
+		Stop();
+	}
+	this->elapsed_microseconds_ = (this->stop_cpu_ -
+		this->start_cpu_).total_microseconds();
+	return this->elapsed_microseconds_;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/cudnn.cpp b/src/caffe/util/cudnn.cpp
index 1772f009..43492ce7 100644
--- a/src/caffe/util/cudnn.cpp
+++ b/src/caffe/util/cudnn.cpp
@@ -2,22 +2,22 @@
 #include "caffe/util/cudnn.hpp"
 
 namespace caffe {
-namespace cudnn {
+	namespace cudnn {
 
-float dataType<float>::oneval = 1.0;
-float dataType<float>::zeroval = 0.0;
-const void* dataType<float>::one =
-    static_cast<void *>(&dataType<float>::oneval);
-const void* dataType<float>::zero =
-    static_cast<void *>(&dataType<float>::zeroval);
+		float dataType<float>::oneval = 1.0;
+		float dataType<float>::zeroval = 0.0;
+		const void* dataType<float>::one =
+		static_cast<void *>(&dataType<float>::oneval);
+		const void* dataType<float>::zero =
+		static_cast<void *>(&dataType<float>::zeroval);
 
-double dataType<double>::oneval = 1.0;
-double dataType<double>::zeroval = 0.0;
-const void* dataType<double>::one =
-    static_cast<void *>(&dataType<double>::oneval);
-const void* dataType<double>::zero =
-    static_cast<void *>(&dataType<double>::zeroval);
+		double dataType<double>::oneval = 1.0;
+		double dataType<double>::zeroval = 0.0;
+		const void* dataType<double>::one =
+		static_cast<void *>(&dataType<double>::oneval);
+		const void* dataType<double>::zero =
+		static_cast<void *>(&dataType<double>::zeroval);
 
-}  // namespace cudnn
+	}  // namespace cudnn
 }  // namespace caffe
 #endif
diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp
index f55420e9..50d8cbf7 100644
--- a/src/caffe/util/db.cpp
+++ b/src/caffe/util/db.cpp
@@ -4,27 +4,28 @@
 
 #include <string>
 
-namespace caffe { namespace db {
+namespace caffe {
+namespace db {
 
 DB* GetDB(DataParameter::DB backend) {
-  switch (backend) {
-  case DataParameter_DB_LEVELDB:
-    return new LevelDB();
-  case DataParameter_DB_LMDB:
-    return new LMDB();
-  default:
-    LOG(FATAL) << "Unknown database backend";
-  }
+	switch (backend) {
+		case DataParameter_DB_LEVELDB:
+			return new LevelDB();
+		case DataParameter_DB_LMDB:
+			return new LMDB();
+		default:
+			LOG(FATAL) << "Unknown database backend";
+	}
 }
 
 DB* GetDB(const string& backend) {
-  if (backend == "leveldb") {
-    return new LevelDB();
-  } else if (backend == "lmdb") {
-    return new LMDB();
-  } else {
-    LOG(FATAL) << "Unknown database backend";
-  }
+	if (backend == "leveldb") {
+		return new LevelDB();
+	} else if (backend == "lmdb") {
+		return new LMDB();
+	} else {
+		LOG(FATAL) << "Unknown database backend";
+	}
 }
 
 }  // namespace db
diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp
index 06c46627..aec747af 100644
--- a/src/caffe/util/db_leveldb.cpp
+++ b/src/caffe/util/db_leveldb.cpp
@@ -2,19 +2,20 @@
 
 #include <string>
 
-namespace caffe { namespace db {
+namespace caffe {
+namespace db {
 
 void LevelDB::Open(const string& source, Mode mode) {
-  leveldb::Options options;
-  options.block_size = 65536;
-  options.write_buffer_size = 268435456;
-  options.max_open_files = 100;
-  options.error_if_exists = mode == NEW;
-  options.create_if_missing = mode != READ;
-  leveldb::Status status = leveldb::DB::Open(options, source, &db_);
-  CHECK(status.ok()) << "Failed to open leveldb " << source
-                     << std::endl << status.ToString();
-  LOG(INFO) << "Opened leveldb " << source;
+	leveldb::Options options;
+	options.block_size = 65536;
+	options.write_buffer_size = 268435456;
+	options.max_open_files = 100;
+	options.error_if_exists = mode == NEW;
+	options.create_if_missing = mode != READ;
+	leveldb::Status status = leveldb::DB::Open(options, source, &db_);
+	CHECK(status.ok()) << "Failed to open leveldb " << source
+		<< std::endl << status.ToString();
+	LOG(INFO) << "Opened leveldb " << source;
 }
 
 }  // namespace db
diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp
index a054b796..bc1a0da1 100644
--- a/src/caffe/util/db_lmdb.cpp
+++ b/src/caffe/util/db_lmdb.cpp
@@ -4,47 +4,48 @@
 
 #include <string>
 
-namespace caffe { namespace db {
+namespace caffe {
+namespace db {
 
 const size_t LMDB_MAP_SIZE = 1099511627776;  // 1 TB
 
 void LMDB::Open(const string& source, Mode mode) {
-  MDB_CHECK(mdb_env_create(&mdb_env_));
-  MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));
-  if (mode == NEW) {
-    CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed";
-  }
-  int flags = 0;
-  if (mode == READ) {
-    flags = MDB_RDONLY | MDB_NOTLS;
-  }
-  MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
-  LOG(INFO) << "Opened lmdb " << source;
+	MDB_CHECK(mdb_env_create(&mdb_env_));
+	MDB_CHECK (mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));
+	if(mode == NEW) {
+		CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed";
+	}
+	int flags = 0;
+	if (mode == READ) {
+		flags = MDB_RDONLY | MDB_NOTLS;
+	}
+	MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
+	LOG(INFO) << "Opened lmdb " << source;
 }
 
 LMDBCursor* LMDB::NewCursor() {
-  MDB_txn* mdb_txn;
-  MDB_cursor* mdb_cursor;
-  MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn));
-  MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_));
-  MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor));
-  return new LMDBCursor(mdb_txn, mdb_cursor);
+	MDB_txn* mdb_txn;
+	MDB_cursor* mdb_cursor;
+	MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn));
+	MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_));
+	MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor));
+	return new LMDBCursor(mdb_txn, mdb_cursor);
 }
 
 LMDBTransaction* LMDB::NewTransaction() {
-  MDB_txn* mdb_txn;
-  MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn));
-  MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_));
-  return new LMDBTransaction(&mdb_dbi_, mdb_txn);
+	MDB_txn* mdb_txn;
+	MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn));
+	MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_));
+	return new LMDBTransaction(&mdb_dbi_, mdb_txn);
 }
 
 void LMDBTransaction::Put(const string& key, const string& value) {
-  MDB_val mdb_key, mdb_value;
-  mdb_key.mv_data = const_cast<char*>(key.data());
-  mdb_key.mv_size = key.size();
-  mdb_value.mv_data = const_cast<char*>(value.data());
-  mdb_value.mv_size = value.size();
-  MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0));
+	MDB_val mdb_key, mdb_value;
+	mdb_key.mv_data = const_cast<char*>(key.data());
+	mdb_key.mv_size = key.size();
+	mdb_value.mv_data = const_cast<char*>(value.data());
+	mdb_value.mv_size = value.size();
+	MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0));
 }
 
 }  // namespace db
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 089023b7..69cc47bc 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -34,330 +34,352 @@
 
 namespace caffe {
 
-template <typename dtype> extern std::string get_dtype_suffix();
+template<typename dtype> extern std::string get_dtype_suffix();
 
-template <typename Dtype>
+template<typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    Dtype* data_col) {
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  int channels_col = channels * kernel_h * kernel_w;
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % kernel_w;
-    int h_offset = (c / kernel_w) % kernel_h;
-    int c_im = c / kernel_h / kernel_w;
-    for (int h = 0; h < height_col; ++h) {
-      for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride_h - pad_h + h_offset;
-        int w_pad = w * stride_w - pad_w + w_offset;
-        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-          data_col[(c * height_col + h) * width_col + w] =
-            data_im[(c_im * height + h_pad) * width + w_pad];
-        else
-          data_col[(c * height_col + h) * width_col + w] = 0;
-      }
-    }
-  }
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	Dtype* data_col) {
+	int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+	int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+	int channels_col = channels * kernel_h * kernel_w;
+	for (int c = 0; c < channels_col; ++c) {
+		int w_offset = c % kernel_w;
+		int h_offset = (c / kernel_w) % kernel_h;
+		int c_im = c / kernel_h / kernel_w;
+		for (int h = 0; h < height_col; ++h) {
+			for (int w = 0; w < width_col; ++w) {
+				int h_pad = h * stride_h - pad_h + h_offset;
+				int w_pad = w * stride_w - pad_w + w_offset;
+				if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+					data_col[(c * height_col + h) * width_col + w] =
+						data_im[(c_im * height + h_pad) * width + w_pad];
+				else
+					data_col[(c * height_col + h) * width_col + w] = 0;
+			}
+		}
+	}
 }
 
 template void im2col_cpu<float>(const float* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, float* data_col);
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, float* data_col);
 template void im2col_cpu<double>(const double* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, double* data_col);
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, double* data_col);
 
-template <typename Dtype>
+template<typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    Dtype* data_im) {
-  caffe_set(height * width * channels, Dtype(0), data_im);
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-  int channels_col = channels * patch_h * patch_w;
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % patch_w;
-    int h_offset = (c / patch_w) % patch_h;
-    int c_im = c / patch_h / patch_w;
-    for (int h = 0; h < height_col; ++h) {
-      for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride_h - pad_h + h_offset;
-        int w_pad = w * stride_w - pad_w + w_offset;
-        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-          data_im[(c_im * height + h_pad) * width + w_pad] +=
-              data_col[(c * height_col + h) * width_col + w];
-      }
-    }
-  }
+	const int height, const int width, const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	Dtype* data_im) {
+	caffe_set(height * width * channels, Dtype(0), data_im);
+	int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+	int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+	int channels_col = channels * patch_h * patch_w;
+	for (int c = 0; c < channels_col; ++c) {
+		int w_offset = c % patch_w;
+		int h_offset = (c / patch_w) % patch_h;
+		int c_im = c / patch_h / patch_w;
+		for (int h = 0; h < height_col; ++h) {
+			for (int w = 0; w < width_col; ++w) {
+				int h_pad = h * stride_h - pad_h + h_offset;
+				int w_pad = w * stride_w - pad_w + w_offset;
+				if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+					data_im[(c_im * height + h_pad) * width + w_pad] +=
+						data_col[(c * height_col + h) * width_col + w];
+			}
+		}
+	}
 }
 
 template void col2im_cpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, float* data_im);
+	const int height, const int width, const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, float* data_im);
 template void col2im_cpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, double* data_im);
-
-
-template <typename Dtype>
-void col2im_gpu_opt(const Dtype* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im, const int img_offset, int optnum){
-    std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int num_kernels = channels * height * width;
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride);
-    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col);
-    ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset);
-    ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+	const int height, const int width, const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, double* data_im);
+
+template<typename Dtype>
+void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_im, const int img_offset, int optnum) {
+	std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	int height_col = (height + 2 * pad - ksize) / stride + 1;
+	int width_col = (width + 2 * pad - ksize) / stride + 1;
+	int num_kernels = channels * height * width;
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
-template void col2im_gpu_opt<float>(const float* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_im, const int img_offset, int optnum);
-template void col2im_gpu_opt<double>(const double* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_im, const int img_offset, int optnum);
-
-template <typename Dtype>
-void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, 
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    Dtype* data_col, const int col_offset)
-{
-    std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-    int num_kernels = channels * height_col * width_col;
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&img_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&kernel_h);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&kernel_w);
-
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad_h);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&pad_w);
-    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&stride_h);
-    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&stride_w);
-    ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&width_col);
-    ret|=clSetKernelArg(Kernel,13,sizeof(cl_mem),(void*)&data_col);
-    ret|=clSetKernelArg(Kernel,14,sizeof(cl_int),(void*)&col_offset);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+template void col2im_gpu_opt<float>(const float* data_col, const int col_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, float* data_im, const int img_offset, int optnum);
+template void col2im_gpu_opt<double>(const double* data_col,
+	const int col_offset, const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, double* data_im, const int img_offset, int optnum);
+
+template<typename Dtype>
+void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	Dtype* data_col, const int col_offset)
+	{
+	std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+	int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+	int num_kernels = channels * height_col * width_col;
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &kernel_h);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_w);
+
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad_h);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_w);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_h);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_w);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &height_col);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &width_col);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &data_col);
+	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &col_offset);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 
 }
 
-template void im2col_gpu<float>(const float* data_im, const int img_offset, const int channels,       
-    				const int height, const int width, const int kernel_h, const int kernel_w,
-    				const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    				float* data_col, const int col_offset);
-template void im2col_gpu<double>(const double* data_im, const int img_offset, const int channels,       
-    				const int height, const int width, const int kernel_h, const int kernel_w,
-    				const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    				double* data_col, const int col_offset);
-
-template <typename Dtype>
+template void im2col_gpu<float>(const float* data_im, const int img_offset,
+	const int channels,
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+	float* data_col, const int col_offset);
+template void im2col_gpu<double>(const double* data_im, const int img_offset,
+	const int channels,
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+	double* data_col, const int col_offset);
+
+template<typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset,
-    const int height, const int width, const int channels,
-    const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    Dtype* data_im, const int img_offset)
-{
-    std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-    int num_kernels = channels * height * width;
-    
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&patch_h);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&patch_w);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&pad_h);
-    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&pad_w);
-    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&stride_h);
-    ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&stride_w);
-    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&width_col);
-    ret|=clSetKernelArg(Kernel,14,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,15,sizeof(cl_int),(void*)&img_offset);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+	const int height, const int width, const int channels,
+	const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	Dtype* data_im, const int img_offset)
+	{
+	std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+	int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+	int num_kernels = channels * height * width;
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &patch_h);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &patch_w);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col);
+	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im);
+	ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void col2im_gpu<float>(const float* data_col, const int col_offset,
-    				const int height, const int width, const int channels,
-    				const int patch_h, const int patch_w, const int pad_h, const int pad_w,
-    				const int stride_h, const int stride_w, float* data_im, const int img_offset);
+	const int height, const int width, const int channels,
+	const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w, float* data_im, const int img_offset);
 template void col2im_gpu<double>(const double* data_col, const int col_offset,
-    				const int height, const int width, const int channels,
-    				const int patch_h, const int patch_w,
-    				const int pad_h, const int pad_w,const int stride_h, const int stride_w,
-    				double* data_im, const int img_offset);
-
-template <typename Dtype>
-void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, const int col_offset) {
-
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int num_kernels = channels * height_col * width_col;
-    
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&img_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&ksize);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&pad);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&stride);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&width_col);
-    ret|=clSetKernelArg(Kernel,10,sizeof(cl_mem),(void*)&data_col);
-    ret|=clSetKernelArg(Kernel,11,sizeof(cl_int),(void*)&col_offset);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
-    clFinish(amdDevice.CommandQueue);
+	const int height, const int width, const int channels,
+	const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+	double* data_im, const int img_offset);
+
+template<typename Dtype>
+void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_col, const int col_offset) {
+
+	int height_col = (height + 2 * pad - ksize) / stride + 1;
+	int width_col = (width + 2 * pad - ksize) / stride + 1;
+	int num_kernels = channels * height_col * width_col;
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &ksize);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pad);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &stride);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &height_col);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &width_col);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &data_col);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &col_offset);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+	clFinish(amdDevice.CommandQueue);
 }
 
-template void im2col_gpu<float>(cl_kernel Kernel, const float* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col, const int col_offset);
-template void im2col_gpu<double>(cl_kernel Kernel, const double* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col, const int col_offset);
-
-template <typename Dtype>
-void im2col_gpu_opt(const Dtype* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_col, const int col_offset, int optnum) {
-
-    std::string kernel_name = "im2col_opt" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int num_kernels = optnum * channels * height_col * width_col;
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&img_offset);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride);
-    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col);
-    ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_col);
-    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&col_offset);
-    ret|=clSetKernelArg(Kernel,13,sizeof(cl_int),(void*)&optnum);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
-    size_t uiLocal_Work_Size[] = {(size_t)(256 - 256 % width_col)};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+template void im2col_gpu<float>(cl_kernel Kernel, const float* data_im,
+	const int img_offset, const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, float* data_col, const int col_offset);
+template void im2col_gpu<double>(cl_kernel Kernel, const double* data_im,
+	const int img_offset, const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, double* data_col, const int col_offset);
+
+template<typename Dtype>
+void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_col, const int col_offset, int optnum) {
+
+	std::string kernel_name = "im2col_opt" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	int height_col = (height + 2 * pad - ksize) / stride + 1;
+	int width_col = (width + 2 * pad - ksize) / stride + 1;
+	int num_kernels = optnum * channels * height_col * width_col;
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_col);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &col_offset);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+	size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
-template void im2col_gpu_opt<float>(const float* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col, const int col_offset, int optnum);
-template void im2col_gpu_opt<double>(const double* data_im, const int img_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, double* data_col, const int col_offset,  int optnum);
+template void im2col_gpu_opt<float>(const float* data_im, const int img_offset,
+	const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, float* data_col, const int col_offset, int optnum);
+template void im2col_gpu_opt<double>(const double* data_im,
+	const int img_offset, const int channels,
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, double* data_col, const int col_offset, int optnum);
 
-template <typename Dtype>
+template<typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im, const int img_offset) {
-    std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int num_kernels = channels * height * width;
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_col);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&col_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_int),(void*)&ksize);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&pad);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&stride);
-    ret|=clSetKernelArg(Kernel,9,sizeof(cl_int),(void*)&height_col);
-    ret|=clSetKernelArg(Kernel,10,sizeof(cl_int),(void*)&width_col);
-    ret|=clSetKernelArg(Kernel,11,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,12,sizeof(cl_int),(void*)&img_offset);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+	const int height, const int width, const int ksize, const int pad,
+	const int stride, Dtype* data_im, const int img_offset) {
+	std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	int height_col = (height + 2 * pad - ksize) / stride + 1;
+	int width_col = (width + 2 * pad - ksize) / stride + 1;
+	int num_kernels = channels * height * width;
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
-
-template void col2im_gpu<float>(const float* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, float* data_im, const int img_offset);
-template void col2im_gpu<double>(const double* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int psize, const int pad,
-    const int stride, double* data_im, const int img_offset);
-
+template void col2im_gpu<float>(const float* data_col, const int col_offset,
+	const int channels,
+	const int height, const int width, const int psize, const int pad,
+	const int stride, float* data_im, const int img_offset);
+template void col2im_gpu<double>(const double* data_col, const int col_offset,
+	const int channels,
+	const int height, const int width, const int psize, const int pad,
+	const int stride, double* data_im, const int img_offset);
 
 }  // namespace caffe
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index d52acb54..be0ce3b4 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -10,125 +10,124 @@ namespace caffe {
 
 template <typename Dtype>
 __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    const int height_col, const int width_col,
-    Dtype* data_col) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int w_out = index % width_col;
-    int h_index = index / width_col;
-    int h_out = h_index % height_col;
-    int channel_in = h_index / height_col;
-    int channel_out = channel_in * kernel_h * kernel_w;
-    int h_in = h_out * stride_h - pad_h;
-    int w_in = w_out * stride_w - pad_w;
-    Dtype* data_col_ptr = data_col;
-    data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
-    const Dtype* data_im_ptr = data_im;
-    data_im_ptr += (channel_in * height + h_in) * width + w_in;
-    for (int i = 0; i < kernel_h; ++i) {
-      for (int j = 0; j < kernel_w; ++j) {
-        int h = h_in + i;
-        int w = w_in + j;
-        *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
-            data_im_ptr[i * width + j] : 0;
-        data_col_ptr += height_col * width_col;
-      }
-    }
-  }
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	const int height_col, const int width_col,
+	Dtype* data_col) {
+	CUDA_KERNEL_LOOP(index, n) {
+		int w_out = index % width_col;
+		int h_index = index / width_col;
+		int h_out = h_index % height_col;
+		int channel_in = h_index / height_col;
+		int channel_out = channel_in * kernel_h * kernel_w;
+		int h_in = h_out * stride_h - pad_h;
+		int w_in = w_out * stride_w - pad_w;
+		Dtype* data_col_ptr = data_col;
+		data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+		const Dtype* data_im_ptr = data_im;
+		data_im_ptr += (channel_in * height + h_in) * width + w_in;
+		for (int i = 0; i < kernel_h; ++i) {
+			for (int j = 0; j < kernel_w; ++j) {
+				int h = h_in + i;
+				int w = w_in + j;
+				*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+				data_im_ptr[i * width + j] : 0;
+				data_col_ptr += height_col * width_col;
+			}
+		}
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    Dtype* data_col) {
-  // We are going to launch channels * height_col * width_col kernels, each
-  // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  int num_kernels = channels * height_col * width_col;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
-                             CAFFE_CUDA_NUM_THREADS>>>(
-      num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
-      pad_w, stride_h, stride_w, height_col,
-      width_col, data_col);
-  CUDA_POST_KERNEL_CHECK;
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	Dtype* data_col) {
+	// We are going to launch channels * height_col * width_col kernels, each
+	// kernel responsible for copying a single-channel grid.
+	int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+	int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+	int num_kernels = channels * height_col * width_col;
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
+	CAFFE_CUDA_NUM_THREADS>>>(
+		num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
+		pad_w, stride_h, stride_w, height_col,
+		width_col, data_col);
+	CUDA_POST_KERNEL_CHECK;
 }
 
-
 // Explicit instantiation
 template void im2col_gpu<float>(const float* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_col);
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+	float* data_col);
 template void im2col_gpu<double>(const double* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    double* data_col);
+	const int height, const int width, const int kernel_h, const int kernel_w,
+	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+	double* data_col);
 
 template <typename Dtype>
 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
-    const int height, const int width, const int channels,
-    const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    const int height_col, const int width_col,
-    Dtype* data_im) {
-  CUDA_KERNEL_LOOP(index, n) {
-    Dtype val = 0;
-    int w = index % width + pad_w;
-    int h = (index / width) % height + pad_h;
-    int c = index / (width * height);
-    // compute the start and end of the output
-    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-    int w_col_end = min(w / stride_w + 1, width_col);
-    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-    int h_col_end = min(h / stride_h + 1, height_col);
-    // equivalent implementation
-    int offset =
-        (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
-    int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
-    int coeff_w_col = (1 - stride_w * height_col * width_col);
-    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-      }
-    }
-    data_im[index] = val;
-  }
+	const int height, const int width, const int channels,
+	const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w,
+	const int stride_h, const int stride_w,
+	const int height_col, const int width_col,
+	Dtype* data_im) {
+	CUDA_KERNEL_LOOP(index, n) {
+		Dtype val = 0;
+		int w = index % width + pad_w;
+		int h = (index / width) % height + pad_h;
+		int c = index / (width * height);
+		// compute the start and end of the output
+		int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+		int w_col_end = min(w / stride_w + 1, width_col);
+		int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+		int h_col_end = min(h / stride_h + 1, height_col);
+		// equivalent implementation
+		int offset =
+		(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+		int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+		int coeff_w_col = (1 - stride_w * height_col * width_col);
+		for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+			for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+				val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+			}
+		}
+		data_im[index] = val;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_im) {
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-  int num_kernels = channels * height * width;
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
-                             CAFFE_CUDA_NUM_THREADS>>>(
-      num_kernels, data_col, height, width, channels, patch_h, patch_w,
-      pad_h, pad_w, stride_h, stride_w,
-      height_col, width_col, data_im);
-  CUDA_POST_KERNEL_CHECK;
+	const int height, const int width, const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, Dtype* data_im) {
+	int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+	int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+	int num_kernels = channels * height * width;
+	// To avoid involving atomic operations, we will launch one kernel per
+	// bottom dimension, and then in the kernel add up the top dimensions.
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
+	CAFFE_CUDA_NUM_THREADS>>>(
+		num_kernels, data_col, height, width, channels, patch_h, patch_w,
+		pad_h, pad_w, stride_h, stride_w,
+		height_col, width_col, data_im);
+	CUDA_POST_KERNEL_CHECK;
 }
 
 // Explicit instantiation
 template void col2im_gpu<float>(const float* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, float* data_im);
+	const int height, const int width, const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, float* data_im);
 template void col2im_gpu<double>(const double* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, double* data_im);
+	const int height, const int width, const int patch_h, const int patch_w,
+	const int pad_h, const int pad_w, const int stride_h,
+	const int stride_w, double* data_im);
 
 }  // namespace caffe
diff --git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp
index 416f80ab..2fbad3a9 100644
--- a/src/caffe/util/insert_splits.cpp
+++ b/src/caffe/util/insert_splits.cpp
@@ -10,135 +10,135 @@
 namespace caffe {
 
 void InsertSplits(const NetParameter& param, NetParameter* param_split) {
-  // Initialize by copying from the input NetParameter.
-  param_split->CopyFrom(param);
-  param_split->clear_layer();
-  map<string, pair<int, int> > blob_name_to_last_top_idx;
-  map<pair<int, int>, pair<int, int> > bottom_idx_to_source_top_idx;
-  map<pair<int, int>, int> top_idx_to_bottom_count;
-  map<pair<int, int>, float> top_idx_to_loss_weight;
-  map<pair<int, int>, int> top_idx_to_bottom_split_idx;
-  map<int, string> layer_idx_to_layer_name;
-  layer_idx_to_layer_name[-1] = "input";
-  // Determine the number of times each blob is used as an input (bottom) blob.
-  for (int i = 0; i < param.input_size(); ++i) {
-    const string& blob_name = param.input(i);
-    blob_name_to_last_top_idx[blob_name] = make_pair(-1, i);
-  }
-  for (int i = 0; i < param.layer_size(); ++i) {
-    const LayerParameter& layer_param = param.layer(i);
-    layer_idx_to_layer_name[i] = layer_param.name();
-    for (int j = 0; j < layer_param.bottom_size(); ++j) {
-      const string& blob_name = layer_param.bottom(j);
-      if (blob_name_to_last_top_idx.find(blob_name) ==
-          blob_name_to_last_top_idx.end()) {
-        LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
-      }
-      const pair<int, int>& bottom_idx = make_pair(i, j);
-      const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
-      bottom_idx_to_source_top_idx[bottom_idx] = top_idx;
-      ++top_idx_to_bottom_count[top_idx];
-    }
-    for (int j = 0; j < layer_param.top_size(); ++j) {
-      const string& blob_name = layer_param.top(j);
-      blob_name_to_last_top_idx[blob_name] = make_pair(i, j);
-    }
-    // A use of a top blob as a loss should be handled similarly to the use of
-    // a top blob as an input (bottom) blob to another layer.
-    const int last_loss =
-        std::min(layer_param.loss_weight_size(), layer_param.top_size());
-    for (int j = 0; j < last_loss; ++j) {
-      const string& blob_name = layer_param.top(j);
-      const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
-      top_idx_to_loss_weight[top_idx] = layer_param.loss_weight(j);
-      if (top_idx_to_loss_weight[top_idx]) {
-        ++top_idx_to_bottom_count[top_idx];
-      }
-    }
-  }
-  // Create split layer for any input blobs used by other layer as bottom
-  // blobs more than once.
-  for (int i = 0; i < param.input_size(); ++i) {
-    const int split_count = top_idx_to_bottom_count[make_pair(-1, i)];
-    if (split_count > 1) {
-      const string& layer_name = layer_idx_to_layer_name[-1];
-      const string& blob_name = param.input(i);
-      LayerParameter* split_layer_param = param_split->add_layer();
-      const float kZeroLossWeight = 0;
-      ConfigureSplitLayer(layer_name, blob_name, i, split_count,
-          kZeroLossWeight, split_layer_param);
-    }
-  }
-  for (int i = 0; i < param.layer_size(); ++i) {
-    LayerParameter* layer_param = param_split->add_layer();
-    layer_param->CopyFrom(param.layer(i));
-    // Replace any shared bottom blobs with split layer outputs.
-    for (int j = 0; j < layer_param->bottom_size(); ++j) {
-      const pair<int, int>& top_idx =
-          bottom_idx_to_source_top_idx[make_pair(i, j)];
-      const int split_count = top_idx_to_bottom_count[top_idx];
-      if (split_count > 1) {
-        const string& layer_name = layer_idx_to_layer_name[top_idx.first];
-        const string& blob_name = layer_param->bottom(j);
-        layer_param->set_bottom(j, SplitBlobName(layer_name,
-            blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++));
-      }
-    }
-    // Create split layer for any top blobs used by other layer as bottom
-    // blobs more than once.
-    for (int j = 0; j < layer_param->top_size(); ++j) {
-      const pair<int, int>& top_idx = make_pair(i, j);
-      const int split_count = top_idx_to_bottom_count[top_idx];
-      if (split_count > 1) {
-        const string& layer_name = layer_idx_to_layer_name[i];
-        const string& blob_name = layer_param->top(j);
-        LayerParameter* split_layer_param = param_split->add_layer();
-        const float loss_weight = top_idx_to_loss_weight[top_idx];
-        ConfigureSplitLayer(layer_name, blob_name, j, split_count,
-            loss_weight, split_layer_param);
-        if (loss_weight) {
-          layer_param->clear_loss_weight();
-          top_idx_to_bottom_split_idx[top_idx]++;
-        }
-      }
-    }
-  }
+	// Initialize by copying from the input NetParameter.
+	param_split->CopyFrom(param);
+	param_split->clear_layer();
+	map<string, pair<int, int> > blob_name_to_last_top_idx;
+	map<pair<int, int>, pair<int, int> > bottom_idx_to_source_top_idx;
+	map<pair<int, int>, int> top_idx_to_bottom_count;
+	map<pair<int, int>, float> top_idx_to_loss_weight;
+	map<pair<int, int>, int> top_idx_to_bottom_split_idx;
+	map<int, string> layer_idx_to_layer_name;
+	layer_idx_to_layer_name[-1] = "input";
+	// Determine the number of times each blob is used as an input (bottom) blob.
+	for (int i = 0; i < param.input_size(); ++i) {
+		const string& blob_name = param.input(i);
+		blob_name_to_last_top_idx[blob_name] = make_pair(-1, i);
+	}
+	for (int i = 0; i < param.layer_size(); ++i) {
+		const LayerParameter& layer_param = param.layer(i);
+		layer_idx_to_layer_name[i] = layer_param.name();
+		for (int j = 0; j < layer_param.bottom_size(); ++j) {
+			const string& blob_name = layer_param.bottom(j);
+			if (blob_name_to_last_top_idx.find(blob_name) ==
+				blob_name_to_last_top_idx.end()) {
+				LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
+			}
+			const pair<int, int>& bottom_idx = make_pair(i, j);
+			const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
+			bottom_idx_to_source_top_idx[bottom_idx] = top_idx;
+			++top_idx_to_bottom_count[top_idx];
+		}
+		for (int j = 0; j < layer_param.top_size(); ++j) {
+			const string& blob_name = layer_param.top(j);
+			blob_name_to_last_top_idx[blob_name] = make_pair(i, j);
+		}
+		// A use of a top blob as a loss should be handled similarly to the use of
+		// a top blob as an input (bottom) blob to another layer.
+		const int last_loss =
+			std::min(layer_param.loss_weight_size(), layer_param.top_size());
+		for (int j = 0; j < last_loss; ++j) {
+			const string& blob_name = layer_param.top(j);
+			const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
+			top_idx_to_loss_weight[top_idx] = layer_param.loss_weight(j);
+			if (top_idx_to_loss_weight[top_idx]) {
+				++top_idx_to_bottom_count[top_idx];
+			}
+		}
+	}
+	// Create split layer for any input blobs used by other layer as bottom
+	// blobs more than once.
+	for (int i = 0; i < param.input_size(); ++i) {
+		const int split_count = top_idx_to_bottom_count[make_pair(-1, i)];
+		if (split_count > 1) {
+			const string& layer_name = layer_idx_to_layer_name[-1];
+			const string& blob_name = param.input(i);
+			LayerParameter* split_layer_param = param_split->add_layer();
+			const float kZeroLossWeight = 0;
+			ConfigureSplitLayer(layer_name, blob_name, i, split_count,
+				kZeroLossWeight, split_layer_param);
+		}
+	}
+	for (int i = 0; i < param.layer_size(); ++i) {
+		LayerParameter* layer_param = param_split->add_layer();
+		layer_param->CopyFrom(param.layer(i));
+		// Replace any shared bottom blobs with split layer outputs.
+		for (int j = 0; j < layer_param->bottom_size(); ++j) {
+			const pair<int, int>& top_idx =
+				bottom_idx_to_source_top_idx[make_pair(i, j)];
+			const int split_count = top_idx_to_bottom_count[top_idx];
+			if (split_count > 1) {
+				const string& layer_name = layer_idx_to_layer_name[top_idx.first];
+				const string& blob_name = layer_param->bottom(j);
+				layer_param->set_bottom(j, SplitBlobName(layer_name,
+					blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++));
+			}
+		}
+		// Create split layer for any top blobs used by other layer as bottom
+		// blobs more than once.
+		for (int j = 0; j < layer_param->top_size(); ++j) {
+			const pair<int, int>& top_idx = make_pair(i, j);
+			const int split_count = top_idx_to_bottom_count[top_idx];
+			if (split_count > 1) {
+				const string& layer_name = layer_idx_to_layer_name[i];
+				const string& blob_name = layer_param->top(j);
+				LayerParameter* split_layer_param = param_split->add_layer();
+				const float loss_weight = top_idx_to_loss_weight[top_idx];
+				ConfigureSplitLayer(layer_name, blob_name, j, split_count,
+					loss_weight, split_layer_param);
+				if (loss_weight) {
+					layer_param->clear_loss_weight();
+					top_idx_to_bottom_split_idx[top_idx]++;
+				}
+			}
+		}
+	}
 }
 
 void ConfigureSplitLayer(const string& layer_name, const string& blob_name,
-    const int blob_idx, const int split_count, const float loss_weight,
-    LayerParameter* split_layer_param) {
-  split_layer_param->Clear();
-  split_layer_param->add_bottom(blob_name);
-  split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx));
-  split_layer_param->set_type("Split");
-  for (int k = 0; k < split_count; ++k) {
-    split_layer_param->add_top(
-        SplitBlobName(layer_name, blob_name, blob_idx, k));
-    if (loss_weight) {
-      if (k == 0) {
-        split_layer_param->add_loss_weight(loss_weight);
-      } else {
-        split_layer_param->add_loss_weight(0);
-      }
-    }
-  }
+	const int blob_idx, const int split_count, const float loss_weight,
+	LayerParameter* split_layer_param) {
+	split_layer_param->Clear();
+	split_layer_param->add_bottom(blob_name);
+	split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx));
+	split_layer_param->set_type("Split");
+	for (int k = 0; k < split_count; ++k) {
+		split_layer_param->add_top(
+			SplitBlobName(layer_name, blob_name, blob_idx, k));
+		if (loss_weight) {
+			if (k == 0) {
+				split_layer_param->add_loss_weight(loss_weight);
+			} else {
+				split_layer_param->add_loss_weight(0);
+			}
+		}
+	}
 }
 
 string SplitLayerName(const string& layer_name, const string& blob_name,
-    const int blob_idx) {
-  ostringstream split_layer_name;
-  split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx
-      << "_split";
-  return split_layer_name.str();
+	const int blob_idx) {
+	ostringstream split_layer_name;
+	split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx
+		<< "_split";
+	return split_layer_name.str();
 }
 
 string SplitBlobName(const string& layer_name, const string& blob_name,
-    const int blob_idx, const int split_idx) {
-  ostringstream split_blob_name;
-  split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx
-      << "_split_" << split_idx;
-  return split_blob_name.str();
+	const int blob_idx, const int split_idx) {
+	ostringstream split_blob_name;
+	split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx
+		<< "_split_" << split_idx;
+	return split_blob_name.str();
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index 77ef7f25..c3be8a76 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -30,277 +30,277 @@ using google::protobuf::io::CodedOutputStream;
 using google::protobuf::Message;
 
 bool ReadProtoFromTextFile(const char* filename, Message* proto) {
-  int fd = open(filename, O_RDONLY);
-  CHECK_NE(fd, -1) << "File not found: " << filename;
-  FileInputStream* input = new FileInputStream(fd);
-  bool success = google::protobuf::TextFormat::Parse(input, proto);
-  delete input;
-  close(fd);
-  return success;
+	int fd = open(filename, O_RDONLY);
+	CHECK_NE(fd, -1) << "File not found: " << filename;
+	FileInputStream* input = new FileInputStream(fd);
+	bool success = google::protobuf::TextFormat::Parse(input, proto);
+	delete input;
+	close(fd);
+	return success;
 }
 
 void WriteProtoToTextFile(const Message& proto, const char* filename) {
-  int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-  FileOutputStream* output = new FileOutputStream(fd);
-  CHECK(google::protobuf::TextFormat::Print(proto, output));
-  delete output;
-  close(fd);
+	int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	FileOutputStream* output = new FileOutputStream(fd);
+	CHECK(google::protobuf::TextFormat::Print(proto, output));
+	delete output;
+	close(fd);
 }
 
 bool ReadProtoFromBinaryFile(const char* filename, Message* proto) {
-  int fd = open(filename, O_RDONLY);
-  CHECK_NE(fd, -1) << "File not found: " << filename;
-  ZeroCopyInputStream* raw_input = new FileInputStream(fd);
-  CodedInputStream* coded_input = new CodedInputStream(raw_input);
-  coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912);
+	int fd = open(filename, O_RDONLY);
+	CHECK_NE(fd, -1) << "File not found: " << filename;
+	ZeroCopyInputStream* raw_input = new FileInputStream(fd);
+	CodedInputStream* coded_input = new CodedInputStream(raw_input);
+	coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912);
 
-  bool success = proto->ParseFromCodedStream(coded_input);
+	bool success = proto->ParseFromCodedStream(coded_input);
 
-  delete coded_input;
-  delete raw_input;
-  close(fd);
-  return success;
+	delete coded_input;
+	delete raw_input;
+	close(fd);
+	return success;
 }
 
 void WriteProtoToBinaryFile(const Message& proto, const char* filename) {
-  fstream output(filename, ios::out | ios::trunc | ios::binary);
-  CHECK(proto.SerializeToOstream(&output));
+	fstream output(filename, ios::out | ios::trunc | ios::binary);
+	CHECK(proto.SerializeToOstream(&output));
 }
 
 cv::Mat ReadImageToCVMat(const string& filename,
-    const int height, const int width, const bool is_color) {
-  cv::Mat cv_img;
-  int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
-    CV_LOAD_IMAGE_GRAYSCALE);
-  cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag);
-  if (!cv_img_origin.data) {
-    LOG(ERROR) << "Could not open or find file " << filename;
-    return cv_img_origin;
-  }
-  if (height > 0 && width > 0) {
-    cv::resize(cv_img_origin, cv_img, cv::Size(width, height));
-  } else {
-    cv_img = cv_img_origin;
-  }
-  return cv_img;
+	const int height, const int width, const bool is_color) {
+	cv::Mat cv_img;
+	int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
+																	CV_LOAD_IMAGE_GRAYSCALE);
+	cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag);
+	if (!cv_img_origin.data) {
+		LOG(ERROR) << "Could not open or find file " << filename;
+		return cv_img_origin;
+	}
+	if (height > 0 && width > 0) {
+		cv::resize(cv_img_origin, cv_img, cv::Size(width, height));
+	} else {
+		cv_img = cv_img_origin;
+	}
+	return cv_img;
 }
 
 cv::Mat ReadImageToCVMat(const string& filename,
-    const int height, const int width) {
-  return ReadImageToCVMat(filename, height, width, true);
+	const int height, const int width) {
+	return ReadImageToCVMat(filename, height, width, true);
 }
 
 cv::Mat ReadImageToCVMat(const string& filename,
-    const bool is_color) {
-  return ReadImageToCVMat(filename, 0, 0, is_color);
+	const bool is_color) {
+	return ReadImageToCVMat(filename, 0, 0, is_color);
 }
 
 cv::Mat ReadImageToCVMat(const string& filename) {
-  return ReadImageToCVMat(filename, 0, 0, true);
+	return ReadImageToCVMat(filename, 0, 0, true);
 }
 // Do the file extension and encoding match?
 static bool matchExt(const std::string & fn,
-                     std::string en) {
-  size_t p = fn.rfind('.');
-  std::string ext = p != fn.npos ? fn.substr(p) : fn;
-  std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
-  std::transform(en.begin(), en.end(), en.begin(), ::tolower);
-  if ( ext == en )
-    return true;
-  if ( en == "jpg" && ext == "jpeg" )
-    return true;
-  return false;
+	std::string en) {
+	size_t p = fn.rfind('.');
+	std::string ext = p != fn.npos ? fn.substr(p) : fn;
+	std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+	std::transform(en.begin(), en.end(), en.begin(), ::tolower);
+	if (ext == en)
+		return true;
+	if (en == "jpg" && ext == "jpeg")
+		return true;
+	return false;
 }
 bool ReadImageToDatum(const string& filename, const int label,
-    const int height, const int width, const bool is_color,
-    const std::string & encoding, Datum* datum) {
-  cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color);
-  if (cv_img.data) {
-    if (encoding.size()) {
-      if ( (cv_img.channels() == 3) == is_color && !height && !width &&
-          matchExt(filename, encoding) )
-        return ReadFileToDatum(filename, label, datum);
-      std::vector<uchar> buf;
-      cv::imencode("."+encoding, cv_img, buf);
-      datum->set_data(std::string(reinterpret_cast<char*>(&buf[0]),
-                      buf.size()));
-      datum->set_label(label);
-      datum->set_encoded(true);
-      return true;
-    }
-    CVMatToDatum(cv_img, datum);
-    datum->set_label(label);
-    return true;
-  } else {
-    return false;
-  }
+	const int height, const int width, const bool is_color,
+	const std::string & encoding, Datum* datum) {
+	cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color);
+	if (cv_img.data) {
+		if (encoding.size()) {
+			if ((cv_img.channels() == 3) == is_color && !height && !width &&
+				matchExt(filename, encoding))
+				return ReadFileToDatum(filename, label, datum);
+			std::vector < uchar > buf;
+			cv::imencode("." + encoding, cv_img, buf);
+			datum->set_data(std::string(reinterpret_cast<char*>(&buf[0]),
+				buf.size()));
+			datum->set_label(label);
+			datum->set_encoded(true);
+			return true;
+		}
+		CVMatToDatum(cv_img, datum);
+		datum->set_label(label);
+		return true;
+	} else {
+		return false;
+	}
 }
 
 bool ReadFileToDatum(const string& filename, const int label,
-    Datum* datum) {
-  std::streampos size;
+	Datum* datum) {
+	std::streampos size;
 
-  fstream file(filename.c_str(), ios::in|ios::binary|ios::ate);
-  if (file.is_open()) {
-    size = file.tellg();
-    std::string buffer(size, ' ');
-    file.seekg(0, ios::beg);
-    file.read(&buffer[0], size);
-    file.close();
-    datum->set_data(buffer);
-    datum->set_label(label);
-    datum->set_encoded(true);
-    return true;
-  } else {
-    return false;
-  }
+	fstream file(filename.c_str(), ios::in | ios::binary | ios::ate);
+	if (file.is_open()) {
+		size = file.tellg();
+		std::string buffer(size, ' ');
+		file.seekg(0, ios::beg);
+		file.read(&buffer[0], size);
+		file.close();
+		datum->set_data(buffer);
+		datum->set_label(label);
+		datum->set_encoded(true);
+		return true;
+	} else {
+		return false;
+	}
 }
 
 cv::Mat DecodeDatumToCVMatNative(const Datum& datum) {
-  cv::Mat cv_img;
-  CHECK(datum.encoded()) << "Datum not encoded";
-  const string& data = datum.data();
-  std::vector<char> vec_data(data.c_str(), data.c_str() + data.size());
-  cv_img = cv::imdecode(vec_data, -1);
-  if (!cv_img.data) {
-    LOG(ERROR) << "Could not decode datum ";
-  }
-  return cv_img;
+	cv::Mat cv_img;
+	CHECK(datum.encoded()) << "Datum not encoded";
+	const string& data = datum.data();
+	std::vector<char> vec_data(data.c_str(), data.c_str() + data.size());
+	cv_img = cv::imdecode(vec_data, -1);
+	if (!cv_img.data) {
+		LOG(ERROR) << "Could not decode datum ";
+	}
+	return cv_img;
 }
 cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color) {
-  cv::Mat cv_img;
-  CHECK(datum.encoded()) << "Datum not encoded";
-  const string& data = datum.data();
-  std::vector<char> vec_data(data.c_str(), data.c_str() + data.size());
-  int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
-    CV_LOAD_IMAGE_GRAYSCALE);
-  cv_img = cv::imdecode(vec_data, cv_read_flag);
-  if (!cv_img.data) {
-    LOG(ERROR) << "Could not decode datum ";
-  }
-  return cv_img;
+	cv::Mat cv_img;
+	CHECK(datum.encoded()) << "Datum not encoded";
+	const string& data = datum.data();
+	std::vector<char> vec_data(data.c_str(), data.c_str() + data.size());
+	int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
+																	CV_LOAD_IMAGE_GRAYSCALE);
+	cv_img = cv::imdecode(vec_data, cv_read_flag);
+	if (!cv_img.data) {
+		LOG(ERROR) << "Could not decode datum ";
+	}
+	return cv_img;
 }
 
 // If Datum is encoded will decoded using DecodeDatumToCVMat and CVMatToDatum
 // If Datum is not encoded will do nothing
 bool DecodeDatumNative(Datum* datum) {
-  if (datum->encoded()) {
-    cv::Mat cv_img = DecodeDatumToCVMatNative((*datum));
-    CVMatToDatum(cv_img, datum);
-    return true;
-  } else {
-    return false;
-  }
+	if (datum->encoded()) {
+		cv::Mat cv_img = DecodeDatumToCVMatNative((*datum));
+		CVMatToDatum(cv_img, datum);
+		return true;
+	} else {
+		return false;
+	}
 }
 bool DecodeDatum(Datum* datum, bool is_color) {
-  if (datum->encoded()) {
-    cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color);
-    CVMatToDatum(cv_img, datum);
-    return true;
-  } else {
-    return false;
-  }
+	if (datum->encoded()) {
+		cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color);
+		CVMatToDatum(cv_img, datum);
+		return true;
+	} else {
+		return false;
+	}
 }
 
 void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) {
-  CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";
-  datum->set_channels(cv_img.channels());
-  datum->set_height(cv_img.rows);
-  datum->set_width(cv_img.cols);
-  datum->clear_data();
-  datum->clear_float_data();
-  datum->set_encoded(false);
-  int datum_channels = datum->channels();
-  int datum_height = datum->height();
-  int datum_width = datum->width();
-  int datum_size = datum_channels * datum_height * datum_width;
-  std::string buffer(datum_size, ' ');
-  for (int h = 0; h < datum_height; ++h) {
-    const uchar* ptr = cv_img.ptr<uchar>(h);
-    int img_index = 0;
-    for (int w = 0; w < datum_width; ++w) {
-      for (int c = 0; c < datum_channels; ++c) {
-        int datum_index = (c * datum_height + h) * datum_width + w;
-        buffer[datum_index] = static_cast<char>(ptr[img_index++]);
-      }
-    }
-  }
-  datum->set_data(buffer);
+	CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";
+	datum->set_channels(cv_img.channels());
+	datum->set_height(cv_img.rows);
+	datum->set_width(cv_img.cols);
+	datum->clear_data();
+	datum->clear_float_data();
+	datum->set_encoded(false);
+	int datum_channels = datum->channels();
+	int datum_height = datum->height();
+	int datum_width = datum->width();
+	int datum_size = datum_channels * datum_height * datum_width;
+	std::string buffer(datum_size, ' ');
+	for (int h = 0; h < datum_height; ++h) {
+		const uchar* ptr = cv_img.ptr < uchar > (h);
+		int img_index = 0;
+		for (int w = 0; w < datum_width; ++w) {
+			for (int c = 0; c < datum_channels; ++c) {
+				int datum_index = (c * datum_height + h) * datum_width + w;
+				buffer[datum_index] = static_cast<char>(ptr[img_index++]);
+			}
+		}
+	}
+	datum->set_data(buffer);
 }
 
 // Verifies format of data stored in HDF5 file and reshapes blob accordingly.
-template <typename Dtype>
+template<typename Dtype>
 void hdf5_load_nd_dataset_helper(
-    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob) {
-  // Verify that the dataset exists.
-  CHECK(H5LTfind_dataset(file_id, dataset_name_))
-      << "Failed to find HDF5 dataset " << dataset_name_;
-  // Verify that the number of dimensions is in the accepted range.
-  herr_t status;
-  int ndims;
-  status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims);
-  CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_;
-  CHECK_GE(ndims, min_dim);
-  CHECK_LE(ndims, max_dim);
+	hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+	Blob<Dtype>* blob) {
+	// Verify that the dataset exists.
+	CHECK(H5LTfind_dataset(file_id, dataset_name_))
+		<< "Failed to find HDF5 dataset " << dataset_name_;
+	// Verify that the number of dimensions is in the accepted range.
+	herr_t status;
+	int ndims;
+	status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims);
+	CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_;
+	CHECK_GE(ndims, min_dim);
+	CHECK_LE(ndims, max_dim);
 
-  // Verify that the data format is what we expect: float or double.
-  std::vector<hsize_t> dims(ndims);
-  H5T_class_t class_;
-  status = H5LTget_dataset_info(
-      file_id, dataset_name_, dims.data(), &class_, NULL);
-  CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
-  CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data";
+	// Verify that the data format is what we expect: float or double.
+	std::vector < hsize_t > dims(ndims);
+	H5T_class_t class_;
+	status = H5LTget_dataset_info(
+		file_id, dataset_name_, dims.data(), &class_, NULL);
+	CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
+	CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data";
 
-  vector<int> blob_dims(dims.size());
-  for (int i = 0; i < dims.size(); ++i) {
-    blob_dims[i] = dims[i];
-  }
-  blob->Reshape(blob_dims);
+	vector<int> blob_dims(dims.size());
+	for (int i = 0; i < dims.size(); ++i) {
+		blob_dims[i] = dims[i];
+	}
+	blob->Reshape(blob_dims);
 }
 
-template <>
+template<>
 void hdf5_load_nd_dataset<float>(hid_t file_id, const char* dataset_name_,
-        int min_dim, int max_dim, Blob<float>* blob) {
-  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
-  herr_t status = H5LTread_dataset_float(
-    file_id, dataset_name_, blob->mutable_cpu_data());
-  CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_;
+	int min_dim, int max_dim, Blob<float>* blob) {
+	hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
+	herr_t status = H5LTread_dataset_float(
+		file_id, dataset_name_, blob->mutable_cpu_data());
+	CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_;
 }
 
-template <>
+template<>
 void hdf5_load_nd_dataset<double>(hid_t file_id, const char* dataset_name_,
-        int min_dim, int max_dim, Blob<double>* blob) {
-  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
-  herr_t status = H5LTread_dataset_double(
-    file_id, dataset_name_, blob->mutable_cpu_data());
-  CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_;
+	int min_dim, int max_dim, Blob<double>* blob) {
+	hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
+	herr_t status = H5LTread_dataset_double(
+		file_id, dataset_name_, blob->mutable_cpu_data());
+	CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_;
 }
 
-template <>
+template<>
 void hdf5_save_nd_dataset<float>(
-    const hid_t file_id, const string& dataset_name, const Blob<float>& blob) {
-  hsize_t dims[HDF5_NUM_DIMS];
-  dims[0] = blob.num();
-  dims[1] = blob.channels();
-  dims[2] = blob.height();
-  dims[3] = blob.width();
-  herr_t status = H5LTmake_dataset_float(
-      file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
-  CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
+	const hid_t file_id, const string& dataset_name, const Blob<float>& blob) {
+	hsize_t dims[HDF5_NUM_DIMS];
+	dims[0] = blob.num();
+	dims[1] = blob.channels();
+	dims[2] = blob.height();
+	dims[3] = blob.width();
+	herr_t status = H5LTmake_dataset_float(
+		file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
+	CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
 }
 
-template <>
+template<>
 void hdf5_save_nd_dataset<double>(
-    const hid_t file_id, const string& dataset_name, const Blob<double>& blob) {
-  hsize_t dims[HDF5_NUM_DIMS];
-  dims[0] = blob.num();
-  dims[1] = blob.channels();
-  dims[2] = blob.height();
-  dims[3] = blob.width();
-  herr_t status = H5LTmake_dataset_double(
-      file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
-  CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
+	const hid_t file_id, const string& dataset_name, const Blob<double>& blob) {
+	hsize_t dims[HDF5_NUM_DIMS];
+	dims[0] = blob.num();
+	dims[1] = blob.channels();
+	dims[2] = blob.height();
+	dims[3] = blob.width();
+	herr_t status = H5LTmake_dataset_double(
+		file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
+	CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 80843191..61162be6 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -42,426 +42,493 @@ namespace caffe {
 
 template<>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C) {
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
-      ldb, beta, C, N);
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const float alpha, const float* A, const float* B, const float beta,
+	float* C) {
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
+		ldb, beta, C, N);
 }
 
 template<>
 void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C) {
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
-      ldb, beta, C, N);
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const double alpha, const double* A, const double* B, const double beta,
+	double* C) {
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
+		ldb, beta, C, N);
 }
 
-template <>
+template<>
 void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C) {
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    int lda = (TransA == CblasNoTrans) ? K : M;
-    int ldb = (TransB == CblasNoTrans) ? N : K;
-    int ldc = N;
-    //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
-    CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
-}
-
-template <>
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const float alpha, const float* A, const float* B, const float beta,
+	float* C) {
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	clblasTranspose transB =
+		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	int ldc = N;
+	//AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+	CLBLAS_CHECK(
+		clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+			(cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, 0,
+			ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+}
+
+template<>
 void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C) {
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    int lda = (TransA == CblasNoTrans) ? K : M;
-    int ldb = (TransB == CblasNoTrans) ? N : K;
-    int ldc = N;
-    CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
-}
-
-template <>
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const double alpha, const double* A, const double* B, const double beta,
+	double* C) {
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	clblasTranspose transB =
+		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	int ldc = N;
+	CLBLAS_CHECK(
+		clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+			(cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, 0,
+			ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+}
+
+template<>
 cl_event caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) {
-    cl_event event;
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    int lda = (TransA == CblasNoTrans) ? K : M;
-    int ldb = (TransB == CblasNoTrans) ? N : K;
-    int ldc = N;
-    CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event) );
-    return event;
-}
-
-template <>
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const float alpha, const float* A, const int offA, const float* B,
+	const int offB, const float beta, float* C, const int offC) {
+	cl_event event;
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	clblasTranspose transB =
+		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	int ldc = N;
+	CLBLAS_CHECK(
+		clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+			(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C,
+			offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event));
+	return event;
+}
+
+template<>
 cl_event caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) {
-    cl_event event;
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    int lda = (TransA == CblasNoTrans) ? K : M;
-    int ldb = (TransB == CblasNoTrans) ? N : K;
-    int ldc = N;
-    CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event) );
-    return event;
-}
-
-
-template <>
-cl_event caffe_gpu_gemm<float>(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const float alpha, const float* A,const int offA, const float* B, const int offB, const float beta, float* C, const int offC) {
-    cl_event event;
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    int lda = (TransA == CblasNoTrans) ? K : M;
-    int ldb = (TransB == CblasNoTrans) ? N : K;
-    int ldc = N;
-    CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) );
-    return event;
- }
-
-template <>
-cl_event caffe_gpu_gemm<double>(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const double alpha, const double* A,const int offA, const double* B, const int offB, const double beta, double* C, const int offC) {
-    cl_event event;
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    int lda = (TransA == CblasNoTrans) ? K : M;
-    int ldb = (TransB == CblasNoTrans) ? N : K;
-    int ldc = N;
-    CLBLAS_CHECK( clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, offB, ldb, (cl_mem)A, offA, lda, (cl_float)beta, (cl_mem)C, offC, ldc, 1, queue, 0, NULL, &event) );
-    return event;
-}
-
-template <>
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const double alpha, const double* A, const int offA, const double* B,
+	const int offB, const double beta, double* C, const int offC) {
+	cl_event event;
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	clblasTranspose transB =
+		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	int ldc = N;
+	CLBLAS_CHECK(
+		clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+			(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C,
+			offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event));
+	return event;
+}
+
+template<>
+cl_event caffe_gpu_gemm<float>(cl_command_queue *queue,
+	const CBLAS_TRANSPOSE TransA,
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const float alpha, const float* A, const int offA, const float* B,
+	const int offB, const float beta, float* C, const int offC) {
+	cl_event event;
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	clblasTranspose transB =
+		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	int ldc = N;
+	CLBLAS_CHECK(
+		clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+			(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C,
+			offC, ldc, 1, queue, 0, NULL, &event));
+	return event;
+}
+
+template<>
+cl_event caffe_gpu_gemm<double>(cl_command_queue *queue,
+	const CBLAS_TRANSPOSE TransA,
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const double alpha, const double* A, const int offA, const double* B,
+	const int offB, const double beta, double* C, const int offC) {
+	cl_event event;
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	clblasTranspose transB =
+		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	int ldc = N;
+	CLBLAS_CHECK(
+		clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+			(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C,
+			offC, ldc, 1, queue, 0, NULL, &event));
+	return event;
+}
+
+template<>
 void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const float alpha, const float* A, const float* x,
-    const float beta, float* y) {
-  cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+	const int N, const float alpha, const float* A, const float* x,
+	const float beta, float* y) {
+	cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
-template <>
+template<>
 void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const double alpha, const double* A, const double* x,
-    const double beta, double* y) {
-  cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+	const int N, const double alpha, const double* A, const double* x,
+	const double beta, double* y) {
+	cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
-template <>
+template<>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const float alpha, const float* A, size_t offA, int lda, 
-    const float* x, size_t offx, const float beta, int incx, 
-    float* y, size_t offy, int incy) {
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA,
-                                  M, N, (cl_float)alpha, (cl_mem)A, offA, lda,
-                                  (cl_mem)x, offx, incx, (cl_float)beta, 
-                                  (cl_mem)y, offy, incy,
-                                  1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
-}
-
-template <>
+	const int N, const float alpha, const float* A, size_t offA, int lda,
+	const float* x, size_t offx, const float beta, int incx,
+	float* y, size_t offy, int incy) {
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA,
+		M, N, (cl_float) alpha, (cl_mem) A, offA, lda,
+		(cl_mem) x, offx, incx, (cl_float) beta,
+		(cl_mem) y, offy, incy,
+		1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+}
+
+template<>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const double alpha, const double* A, size_t offA, int lda,
-    const double* x, size_t offx, const double beta, int incx,
-    double* y, size_t offy, int incy) {
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    CLBLAS_CHECK( clblasDgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, offA, lda, (cl_mem)x, offx, incx, (cl_double)beta, (cl_mem)y, offy, incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+	const int N, const double alpha, const double* A, size_t offA, int lda,
+	const double* x, size_t offx, const double beta, int incx,
+	double* y, size_t offy, int incy) {
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	CLBLAS_CHECK(
+		clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A,
+			offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy,
+			incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 
 }
 
-
-template <>
+template<>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const float alpha, const float* A, const float* x,
-    const float beta, float* y) {
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    CLBLAS_CHECK( clblasSgemv(amdDevice.row, transA,
-                                  M, N, (cl_float)alpha, (cl_mem)A, 0, N,
-                                  (cl_mem)x, 0, 1, (cl_float)beta,
-                                  (cl_mem)y, 0, 1,
-                                  1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+	const int N, const float alpha, const float* A, const float* x,
+	const float beta, float* y) {
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA,
+		M, N, (cl_float) alpha, (cl_mem) A, 0, N,
+		(cl_mem) x, 0, 1, (cl_float) beta,
+		(cl_mem) y, 0, 1,
+		1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template <>
+template<>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const double alpha, const double* A, const double* x,
-    const double beta, double* y) {
-    clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans;
-    CLBLAS_CHECK( clblasDgemv(amdDevice.row, transA, M, N, (cl_double)alpha, (cl_mem)A, 0, N, (cl_mem)x, 0, 1, (cl_double)beta, (cl_mem)y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+	const int N, const double alpha, const double* A, const double* x,
+	const double beta, double* y) {
+	clblasTranspose transA =
+		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+	CLBLAS_CHECK(
+		clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0,
+			N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1,
+			&(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template <>
+template<>
 void caffe_axpy<float>(const int N, const float alpha, const float* X,
-    float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); }
+	float* Y) {
+	cblas_saxpy(N, alpha, X, 1, Y, 1);
+}
 
-template <>
+template<>
 void caffe_axpy<double>(const int N, const double alpha, const double* X,
-    double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); }
+	double* Y) {
+	cblas_daxpy(N, alpha, X, 1, Y, 1);
+}
 
-template <>
+template<>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
-    float* Y) {
-    CLBLAS_CHECK( clblasSaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) );
+	float* Y) {
+	CLBLAS_CHECK(
+		clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+			&(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template <>
+template<>
 void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
-    double* Y) {
-    CLBLAS_CHECK( clblasDaxpy(N, alpha, (cl_mem)X, 0, 1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue),0, NULL, NULL) );
+	double* Y) {
+	CLBLAS_CHECK(
+		clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+			&(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template<>
 void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y)
-{
+	{
 }
 
 template<>
 void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y)
-{
+	{
 }
 
 template<>
 void caffe_gpu_abs<float>(const int n, const float* x, float* y)
-{
-    caffe_gpu_abs_ocl(n, x, y);
+	{
+	caffe_gpu_abs_ocl(n, x, y);
 }
 
 template<>
 void caffe_gpu_abs<double>(const int n, const double* x, double* y)
-{
-    caffe_gpu_abs_ocl(n, x, y);
+	{
+	caffe_gpu_abs_ocl(n, x, y);
 }
 
-template <>
+template<>
 void caffe_set(const int N, const float alpha, float* Y) {
-  if (alpha == 0) {
-    memset(Y, 0, sizeof(float) * N);
-    return;
-  }
-  for (int i = 0; i < N; ++i) {
-    Y[i] = alpha;
-  }
+	if (alpha == 0) {
+		memset(Y, 0, sizeof(float) * N);
+		return;
+	}
+	for (int i = 0; i < N; ++i) {
+		Y[i] = alpha;
+	}
 }
 
-template <>
+template<>
 void caffe_set(const int N, const double alpha, double* Y) {
-  if (alpha == 0) {
-    memset(Y, 0, sizeof(double) * N);
-    return;
-  }
-  for (int i = 0; i < N; ++i) {
-    Y[i] = alpha;
-  }
+	if (alpha == 0) {
+		memset(Y, 0, sizeof(double) * N);
+		return;
+	}
+	for (int i = 0; i < N; ++i) {
+		Y[i] = alpha;
+	}
 }
 
-template <>
+template<>
 void caffe_add_scalar(const int N, const float alpha, float* Y) {
-  for (int i = 0; i < N; ++i) {
-    Y[i] += alpha;
-  }
+	for (int i = 0; i < N; ++i) {
+		Y[i] += alpha;
+	}
 }
 
-template <>
+template<>
 void caffe_add_scalar(const int N, const double alpha, double* Y) {
-  for (int i = 0; i < N; ++i) {
-    Y[i] += alpha;
-  }
+	for (int i = 0; i < N; ++i) {
+		Y[i] += alpha;
+	}
 }
 
-template <>
+template<>
 void caffe_copy<float>(const int N, const float* X, float* Y) {
-  cblas_scopy(N, X, 1, Y, 1);
+	cblas_scopy(N, X, 1, Y, 1);
 }
 
-template <>
+template<>
 void caffe_copy<double>(const int N, const double* X, double* Y) {
-  cblas_dcopy(N, X, 1, Y, 1);
+	cblas_dcopy(N, X, 1, Y, 1);
 }
 
 //template <typename Dtype>
 void caffe_gpu_memcpy(const size_t N, const void *X, void *Y)
-{
-  clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)X, CL_TRUE, 0, N, Y,0, NULL, NULL);  
+	{
+	clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0,
+		NULL, NULL);
 // OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
 }
 /*
-template void caffe_gpu_memcpy<long>(const size_t N, const long* X, long* Y);
-template void caffe_gpu_memcpy<unsigned long>(const size_t N, const unsigned long* X, unsigned long* Y);
-template void caffe_gpu_memcpy<int>(const size_t N, const int* X, int* Y);
-template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
-*/
-template<> 
+ template void caffe_gpu_memcpy<long>(const size_t N, const long* X, long* Y);
+ template void caffe_gpu_memcpy<unsigned long>(const size_t N, const unsigned long* X, unsigned long* Y);
+ template void caffe_gpu_memcpy<int>(const size_t N, const int* X, int* Y);
+ template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
+ */
+template<>
 void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y)
-{  OCL_CHECK (clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
+	{
+	OCL_CHECK(
+		clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, N,
+			0, NULL, NULL));
 }
 
-template<> 
+template<>
 void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y)
-{  OCL_CHECK (clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
+	{
+	OCL_CHECK(
+		clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, N,
+			0, NULL, NULL));
 }
 
-template <>
+template<>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
-  if(X != Y){
-      CLBLAS_CHECK( clblasScopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
-  }
+	if (X != Y) {
+		CLBLAS_CHECK(
+			clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+				&(amdDevice.CommandQueue), 0, NULL, NULL));
+	}
 }
 
-template <>
+template<>
 void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
-  if(X != Y){
-      CLBLAS_CHECK( clblasDcopy( N, (cl_mem)X, 0,1, (cl_mem)Y, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
-  }
+	if (X != Y) {
+		CLBLAS_CHECK(
+			clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+				&(amdDevice.CommandQueue), 0, NULL, NULL));
+	}
 }
 
-template <>
+template<>
 void caffe_scal<float>(const int N, const float alpha, float *X) {
-  cblas_sscal(N, alpha, X, 1);
+	cblas_sscal(N, alpha, X, 1);
 }
 
-template <>
+template<>
 void caffe_scal<double>(const int N, const double alpha, double *X) {
-  cblas_dscal(N, alpha, X, 1);
+	cblas_dscal(N, alpha, X, 1);
 }
 
-template <>
+template<>
 void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
-   CLBLAS_CHECK(clblasSscal(N, alpha, (cl_mem)X, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+	CLBLAS_CHECK(
+		clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+			NULL, NULL));
 }
 
-template <>
+template<>
 void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
-  CLBLAS_CHECK(clblasDscal(N, alpha, (cl_mem)X, 0, 1, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+	CLBLAS_CHECK(
+		clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+			NULL, NULL));
 }
 
-template <>
+template<>
 void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-  caffe_gpu_scal<float>(N, beta, Y);
-  caffe_gpu_axpy<float>(N, alpha, X, Y);
+	const float beta, float* Y) {
+	caffe_gpu_scal<float>(N, beta, Y);
+	caffe_gpu_axpy<float>(N, alpha, X, Y);
 }
 
-template <>
+template<>
 void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-  caffe_gpu_scal<double>(N, beta, Y);
-  caffe_gpu_axpy<double>(N, alpha, X, Y);
+	const double beta, double* Y) {
+	caffe_gpu_scal<double>(N, beta, Y);
+	caffe_gpu_axpy<double>(N, alpha, X, Y);
 }
 
-template <>
+template<>
 void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
-                            const float beta, float* Y) {
-  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+	const float beta, float* Y) {
+	cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
-template <>
+template<>
 void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
-                             const double beta, double* Y) {
-  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+	const double beta, double* Y) {
+	cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
-template <>
+template<>
 void caffe_add<float>(const int n, const float* a, const float* b,
-    float* y) {
-  vsAdd(n, a, b, y);
+	float* y) {
+	vsAdd(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_add<double>(const int n, const double* a, const double* b,
-    double* y) {
-  vdAdd(n, a, b, y);
+	double* y) {
+	vdAdd(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_sub<float>(const int n, const float* a, const float* b,
-    float* y) {
-  vsSub(n, a, b, y);
+	float* y) {
+	vsSub(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_sub<double>(const int n, const double* a, const double* b,
-    double* y) {
-  vdSub(n, a, b, y);
+	double* y) {
+	vdSub(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_mul<float>(const int n, const float* a, const float* b,
-    float* y) {
-  vsMul(n, a, b, y);
+	float* y) {
+	vsMul(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_mul<double>(const int n, const double* a, const double* b,
-    double* y) {
-  vdMul(n, a, b, y);
+	double* y) {
+	vdMul(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_div<float>(const int n, const float* a, const float* b,
-    float* y) {
-  vsDiv(n, a, b, y);
+	float* y) {
+	vsDiv(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_div<double>(const int n, const double* a, const double* b,
-    double* y) {
-  vdDiv(n, a, b, y);
+	double* y) {
+	vdDiv(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_powx<float>(const int n, const float* a, const float b,
-    float* y) {
-  vsPowx(n, a, b, y);
+	float* y) {
+	vsPowx(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_powx<double>(const int n, const double* a, const double b,
-    double* y) {
-  vdPowx(n, a, b, y);
+	double* y) {
+	vdPowx(n, a, b, y);
 }
 
-template <>
+template<>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
-  vsSqr(n, a, y);
+	vsSqr(n, a, y);
 }
 
-template <>
+template<>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
-  vdSqr(n, a, y);
+	vdSqr(n, a, y);
 }
 
-template <>
+template<>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-  vsExp(n, a, y);
+	vsExp(n, a, y);
 }
 
-template <>
+template<>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-  vdExp(n, a, y);
+	vdExp(n, a, y);
 }
 
 unsigned int caffe_rng_rand() {
-  return (*caffe_rng())();
+	return (*caffe_rng())();
 }
 
-template <typename Dtype>
+template<typename Dtype>
 Dtype caffe_nextafter(const Dtype b) {
-  return boost::math::nextafter<Dtype>(
-      b, std::numeric_limits<Dtype>::max());
+	return boost::math::nextafter < Dtype > (
+		b, std::numeric_limits < Dtype > ::max());
 }
 
 template
@@ -470,65 +537,66 @@ float caffe_nextafter(const float b);
 template
 double caffe_nextafter(const double b);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_LE(a, b);
-  boost::uniform_real<Dtype> random_distribution(a, caffe_nextafter<Dtype>(b));
-  boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> >
-      variate_generator(caffe_rng(), random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = variate_generator();
-  }
+	CHECK_GE(n, 0);
+	CHECK(r);
+	CHECK_LE(a, b);
+	boost::uniform_real < Dtype
+		> random_distribution(a, caffe_nextafter<Dtype>(b));
+	boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> >
+	variate_generator(caffe_rng(), random_distribution);
+	for (int i = 0; i < n; ++i) {
+		r[i] = variate_generator();
+	}
 
-  //LOG(INFO) << "caffe_rng_uniform";
+	//LOG(INFO) << "caffe_rng_uniform";
 }
 
 template
 void caffe_rng_uniform<float>(const int n, const float a, const float b,
-                              float* r);
+	float* r);
 
 template
 void caffe_rng_uniform<double>(const int n, const double a, const double b,
-                               double* r);
+	double* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_rng_gaussian(const int n, const Dtype a,
-                        const Dtype sigma, Dtype* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_GT(sigma, 0);
-  boost::normal_distribution<Dtype> random_distribution(a, sigma);
-  boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> >
-      variate_generator(caffe_rng(), random_distribution);
-      //variate_generator(37, random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = variate_generator();
-  }
-  //LOG(INFO) << "caffe_rng_guassian";
+	const Dtype sigma, Dtype* r) {
+	CHECK_GE(n, 0);
+	CHECK(r);
+	CHECK_GT(sigma, 0);
+	boost::normal_distribution < Dtype > random_distribution(a, sigma);
+	boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> >
+	variate_generator(caffe_rng(), random_distribution);
+	//variate_generator(37, random_distribution);
+	for (int i = 0; i < n; ++i) {
+		r[i] = variate_generator();
+	}
+	//LOG(INFO) << "caffe_rng_guassian";
 }
 
 template
 void caffe_rng_gaussian<float>(const int n, const float mu,
-                               const float sigma, float* r);
+	const float sigma, float* r);
 
 template
 void caffe_rng_gaussian<double>(const int n, const double mu,
-                                const double sigma, double* r);
+	const double sigma, double* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_GE(p, 0);
-  CHECK_LE(p, 1);
-  boost::bernoulli_distribution<Dtype> random_distribution(p);
-  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
-      variate_generator(caffe_rng(), random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = variate_generator();
-  }
+	CHECK_GE(n, 0);
+	CHECK(r);
+	CHECK_GE(p, 0);
+	CHECK_LE(p, 1);
+	boost::bernoulli_distribution < Dtype > random_distribution(p);
+	boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
+	variate_generator(caffe_rng(), random_distribution);
+	for (int i = 0; i < n; ++i) {
+		r[i] = variate_generator();
+	}
 }
 
 template
@@ -537,18 +605,18 @@ void caffe_rng_bernoulli<double>(const int n, const double p, int* r);
 template
 void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_GE(p, 0);
-  CHECK_LE(p, 1);
-  boost::bernoulli_distribution<Dtype> random_distribution(p);
-  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
-      variate_generator(caffe_rng(), random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = static_cast<unsigned int>(variate_generator());
-  }
+	CHECK_GE(n, 0);
+	CHECK(r);
+	CHECK_GE(p, 0);
+	CHECK_LE(p, 1);
+	boost::bernoulli_distribution < Dtype > random_distribution(p);
+	boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
+	variate_generator(caffe_rng(), random_distribution);
+	for (int i = 0; i < n; ++i) {
+		r[i] = static_cast<unsigned int>(variate_generator());
+	}
 }
 
 template
@@ -557,365 +625,375 @@ void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
 template
 void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
 //
-template <>
+template<>
 float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
-  return cblas_sdot(n, x, 1, y, 1);
+	return cblas_sdot(n, x, 1, y, 1);
 }
 
-template <>
+template<>
 double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
-  return cblas_ddot(n, x, 1, y, 1);
+	return cblas_ddot(n, x, 1, y, 1);
 }
 
-template <>
+template<>
 void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
-    float* out) {
-    cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(float)), NULL, NULL);
-    cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(float)), NULL, NULL);
-    clblasSdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
-    clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), out,0, NULL, NULL);
-    clReleaseMemObject(scratchBuff);
-    clReleaseMemObject(d_out);
+	float* out) {
+	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		(n * sizeof(float)), NULL, NULL);
+	cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		(1 * sizeof(float)), NULL, NULL);
+	clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1,
+		&(amdDevice.CommandQueue), 0, NULL, NULL);
+	clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float),
+		out, 0, NULL, NULL);
+	clReleaseMemObject(scratchBuff);
+	clReleaseMemObject(d_out);
 }
 
-template <>
+template<>
 void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
-    double * out) {
-  //need to pass in scratchBuff
-  //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
-    cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(double)), NULL, NULL);
-    cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(double)), NULL, NULL);
-    clblasDdot(n,d_out,0,(cl_mem)x,0,1,(cl_mem)y, 0, 1, scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
-    clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), out,0, NULL, NULL);
-    clReleaseMemObject(scratchBuff);
-    clReleaseMemObject(d_out);
-}
-
-template <>
+	double * out) {
+	//need to pass in scratchBuff
+	//AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		(n * sizeof(double)), NULL, NULL);
+	cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		(1 * sizeof(double)), NULL, NULL);
+	clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1,
+		&(amdDevice.CommandQueue), 0, NULL, NULL);
+	clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double),
+		out, 0, NULL, NULL);
+	clReleaseMemObject(scratchBuff);
+	clReleaseMemObject(d_out);
+}
+
+template<>
 int caffe_cpu_hamming_distance<float>(const int n, const float* x,
-                                  const float* y) {
-  int dist = 0;
-  for (int i = 0; i < n; ++i) {
-    dist += __builtin_popcount(static_cast<uint32_t>(x[i]) ^
-                               static_cast<uint32_t>(y[i]));
-  }
-  return dist;
+	const float* y) {
+	int dist = 0;
+	for (int i = 0; i < n; ++i) {
+		dist += __builtin_popcount(static_cast<uint32_t>(x[i]) ^
+			static_cast<uint32_t>(y[i]));
+	}
+	return dist;
 }
 
-template <>
+template<>
 int caffe_cpu_hamming_distance<double>(const int n, const double* x,
-                                   const double* y) {
-  int dist = 0;
-  for (int i = 0; i < n; ++i) {
-    dist += __builtin_popcountl(static_cast<uint64_t>(x[i]) ^
-                                static_cast<uint64_t>(y[i]));
-  }
-  return dist;
+	const double* y) {
+	int dist = 0;
+	for (int i = 0; i < n; ++i) {
+		dist += __builtin_popcountl(static_cast<uint64_t>(x[i]) ^
+			static_cast<uint64_t>(y[i]));
+	}
+	return dist;
 }
 
-template <>
+template<>
 float caffe_cpu_asum<float>(const int n, const float* x) {
-  return cblas_sasum(n, x, 1);
+	return cblas_sasum(n, x, 1);
 }
 
-template <>
+template<>
 double caffe_cpu_asum<double>(const int n, const double* x) {
-  return cblas_dasum(n, x, 1);
+	return cblas_dasum(n, x, 1);
 }
 
-template <>
+template<>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
-    cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(cl_float)), NULL, NULL);
-    cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(cl_float)), NULL, NULL);
-    clblasSasum(n,d_y,0,(cl_mem)x,0,1,scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
-    clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,0, NULL, NULL);
-    clReleaseMemObject(scratchBuff);
-    clReleaseMemObject(d_y);
+	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		(n * sizeof(cl_float)), NULL, NULL);
+	cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		(1 * sizeof(cl_float)), NULL, NULL);
+	clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1,
+		&(amdDevice.CommandQueue), 0, NULL, NULL);
+	clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,
+		0, NULL, NULL);
+	clReleaseMemObject(scratchBuff);
+	clReleaseMemObject(d_y);
 }
 
-template <>
+template<>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
-    cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (n*sizeof(cl_double)), NULL, NULL);
-    cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, (1*sizeof(cl_double)), NULL, NULL);
-    clblasDasum(n,d_y,0,(cl_mem)x,0,1,scratchBuff,1,&(amdDevice.CommandQueue),0,NULL,NULL);
-    clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), y,0, NULL, NULL);
-    clReleaseMemObject(scratchBuff);
-    clReleaseMemObject(d_y);
+	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		(n * sizeof(cl_double)), NULL, NULL);
+	cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+		(1 * sizeof(cl_double)), NULL, NULL);
+	clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1,
+		&(amdDevice.CommandQueue), 0, NULL, NULL);
+	clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double),
+		y, 0, NULL, NULL);
+	clReleaseMemObject(scratchBuff);
+	clReleaseMemObject(d_y);
 }
 
 //DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
-                                    //  - (x[index] < Dtype(0)));
+//  - (x[index] < Dtype(0)));
 //DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
 
-INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sign);
-INSTANTIATE_CAFFE_CPU_UNARY_FUNC(sgnbit);
-INSTANTIATE_CAFFE_CPU_UNARY_FUNC(fabs);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs);
 
-template <>
+template<>
 void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
-                            float* y) {
-  cblas_scopy(n, x, 1, y, 1);
-  cblas_sscal(n, alpha, y, 1);
+	float* y) {
+	cblas_scopy(n, x, 1, y, 1);
+	cblas_sscal(n, alpha, y, 1);
 }
 
-template <>
+template<>
 void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
-                             double* y) {
-  cblas_dcopy(n, x, 1, y, 1);
-  cblas_dscal(n, alpha, y, 1);
+	double* y) {
+	cblas_dcopy(n, x, 1, y, 1);
+	cblas_dscal(n, alpha, y, 1);
 }
 
-template <>
+template<>
 void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
-                            float* y) {
-  caffe_gpu_copy(n, x, y);
-  caffe_gpu_scal(n, alpha, y);
+	float* y) {
+	caffe_gpu_copy(n, x, y);
+	caffe_gpu_scal(n, alpha, y);
 }
 
-template <>
+template<>
 void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
-                             double* y) {
-  caffe_gpu_copy(n, x, y);
-  caffe_gpu_scal(n, alpha, y);
+	double* y) {
+	caffe_gpu_copy(n, x, y);
+	caffe_gpu_scal(n, alpha, y);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void set_kernel(const int n, const Dtype alpha, Dtype* y) {
 }
 
-template <>
+template<>
 void caffe_gpu_set<float>(const int N, const float alpha, float* Y) {
-  ocl_memset(Y, alpha, N);
+	ocl_memset(Y, alpha, N);
 }
 
-template <>
+template<>
 void caffe_gpu_set<double>(const int N, const double alpha, double* Y) {
-  ocl_memset(Y, alpha, N);
+	ocl_memset(Y, alpha, N);
 }
 
-template <>
+template<>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
-  kernel_add_scalar(N, alpha, Y);
+	kernel_add_scalar(N, alpha, Y);
 }
 
-template <>
+template<>
 void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
-  kernel_add_scalar(N, alpha, Y);
+	kernel_add_scalar(N, alpha, Y);
 }
 
-template <>
+template<>
 void caffe_gpu_exp<float>(const int N, const float* a, float* y) {
-  kernel_exp(N, a, y);
+	kernel_exp(N, a, y);
 }
 
-template <>
+template<>
 void caffe_gpu_exp<double>(const int N, const double* a, double* y) {
-  kernel_exp(N, a, y);
+	kernel_exp(N, a, y);
 }
 
 template<>
-void caffe_gpu_sign<float>(const int N, const float *X, float *Y){
-   caffe_gpu_sign_ocl(N, X, Y);
+void caffe_gpu_sign<float>(const int N, const float *X, float *Y) {
+	caffe_gpu_sign_ocl(N, X, Y);
 }
 
 template<>
-void caffe_gpu_sign<double>(const int N, const double *X, double *Y){
-   caffe_gpu_sign_ocl(N, X, Y);
+void caffe_gpu_sign<double>(const int N, const double *X, double *Y) {
+	caffe_gpu_sign_ocl(N, X, Y);
 }
 
-template <>
+template<>
 void caffe_gpu_sub<float>(const int N, const float* a, const float* b,
-    float* y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_sub(N, a, b, y);
+	float* y) {
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_sub(N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_sub<double>(const int N, const double* a, const double* b,
-    double* y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_sub(N, a, b, y);
+	double* y) {
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_sub(N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_mul<float>(const int N, const float* a,
-    const float* b, float* y) {
-  kernel_mul(N, a, b, y);
+	const float* b, float* y) {
+	kernel_mul(N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_mul<double>(const int N, const double* a,
-    const double* b, double* y) {
-  kernel_mul(N, a, b, y);
+	const double* b, double* y) {
+	kernel_mul(N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_div<float>(const int N, const float* a,
-    const float* b, float* y) {
-  kernel_div(N, a, b, y);
+	const float* b, float* y) {
+	kernel_div(N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_div<double>(const int N, const double* a,
-    const double* b, double* y) {
-  kernel_div(N, a, b, y);
+	const double* b, double* y) {
+	kernel_div(N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_powx<float>(const int N, const float* a,
-    const float alpha, float* y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_powx(N, a, alpha, y);
+	const float alpha, float* y) {
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_powx(N, a, alpha, y);
 }
 
-template <>
+template<>
 void caffe_gpu_powx<double>(const int N, const double* a,
-    const double alpha, double* y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_powx(N, a, alpha, y);
+	const double alpha, double* y) {
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_powx(N, a, alpha, y);
 }
 
 void popc_kernel(const int n, const float* a,
-    const float* b, uint8_t* y) {
+	const float* b, uint8_t* y) {
 }
 
 void popcll_kernel(const int n, const double* a,
-    const double* b, uint8_t* y) {
+	const double* b, uint8_t* y) {
 }
 
-template <>
+template<>
 uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
-                                  const float* y) {
+	const float* y) {
 	return 0;
 }
 
-template <>
+template<>
 uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
-                                   const double* y) {
+	const double* y) {
 	return 0;
 }
 
 void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
 }
 
-template <>
+template<>
 void caffe_gpu_rng_uniform<float>(const int n, const float a, const float b,
-                                  float* r) {
+	float* r) {
 }
-template <>
+template<>
 void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
-                                   double* r) {
+	double* r) {
 }
 
-template <>
+template<>
 void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma,
-                            float* r) {
+	float* r) {
 }
 
-template <>
+template<>
 void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
-                            double* r) {
+	double* r) {
 }
 
-template <>
+template<>
 void caffe_gpu_log<float>(const int N, const float* a, float* y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_log(N, a, y);
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_log(N, a, y);
 }
 
-template <>
+template<>
 void caffe_gpu_log<double>(const int N, const double* a, double* y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_log(N, a, y);
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_log(N, a, y);
 }
 
-
-
-
-
-
-template <>
+template<>
 void caffe_log<float>(const int n, const float* a, float* y) {
-  vsLn(n, a, y);
+	vsLn(n, a, y);
 }
 
-template <>
+template<>
 void caffe_log<double>(const int n, const double* a, double* y) {
-  vdLn(n, a, y);
+	vdLn(n, a, y);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
-  if (X != Y) {
-    if (Caffe::mode() == Caffe::GPU) {
+	if (X != Y) {
+		if (Caffe::mode() == Caffe::GPU) {
 #ifndef CPU_ONLY
-      // NOLINT_NEXT_LINE(caffe/alt_fn)
-      //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
+			// NOLINT_NEXT_LINE(caffe/alt_fn)
+			//CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
 #else
-      NO_GPU;
+			NO_GPU;
 #endif
-    } else {
-      memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
-    }
-  }
+		} else {
+			memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+		}
+	}
 }
 
 template void caffe_copy<int>(const int N, const int* X, int* Y);
 template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
-    unsigned int* Y);
+	unsigned int* Y);
 template void caffe_copy<float>(const int N, const float* X, float* Y);
 template void caffe_copy<double>(const int N, const double* X, double* Y);
 
-template <>
+template<>
 void caffe_abs<float>(const int n, const float* a, float* y) {
-    vsAbs(n, a, y);
+	vsAbs(n, a, y);
 }
 
-template <>
+template<>
 void caffe_abs<double>(const int n, const double* a, double* y) {
-    vdAbs(n, a, y);
+	vdAbs(n, a, y);
 }
 
-template <>
+template<>
 void caffe_gpu_add<float>(const int N, const float* a, const float* b,
-    float* y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_add(N, a, b, y);
+	float* y) {
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_add(N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_add<double>(const int N, const double* a, const double* b,
-    double* y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  kernel_add(N, a, b, y);
+	double* y) {
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	kernel_add(N, a, b, y);
 }
 
-template <>
+template<>
 float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
-    const float* y, const int incy) {
-  return cblas_sdot(n, x, incx, y, incy);
+	const float* y, const int incy) {
+	return cblas_sdot(n, x, incx, y, incy);
 }
 
-template <>
+template<>
 double caffe_cpu_strided_dot<double>(const int n, const double* x,
-    const int incx, const double* y, const int incy) {
-  return cblas_ddot(n, x, incx, y, incy);
+	const int incx, const double* y, const int incy) {
+	return cblas_ddot(n, x, incx, y, incy);
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
-  if (alpha == 0) {
-    memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
-    return;
-  }
-  for (int i = 0; i < N; ++i) {
-    Y[i] = alpha;
-  }
+	if (alpha == 0) {
+		memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+		return;
+	}
+	for (int i = 0; i < N; ++i) {
+		Y[i] = alpha;
+	}
 }
 
 template void caffe_set<int>(const int N, const int alpha, int* Y);
 template void caffe_set<float>(const int N, const float alpha, float* Y);
 template void caffe_set<double>(const int N, const double alpha, double* Y);
 
-
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index 2631a074..1bf783e4 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -12,152 +12,152 @@
 
 namespace caffe {
 
-template <>
+template<>
 void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C) {
-  // Note that cublas follows fortran order.
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
-      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
-}
-
-template <>
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const float alpha, const float* A, const float* B, const float beta,
+	float* C) {
+	// Note that cublas follows fortran order.
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	cublasOperation_t cuTransA =
+		(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+	cublasOperation_t cuTransB =
+		(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+	CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
+		N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+}
+
+template<>
 void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C) {
-  // Note that cublas follows fortran order.
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
-      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
-}
-
-template <>
+	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+	const double alpha, const double* A, const double* B, const double beta,
+	double* C) {
+	// Note that cublas follows fortran order.
+	int lda = (TransA == CblasNoTrans) ? K : M;
+	int ldb = (TransB == CblasNoTrans) ? N : K;
+	cublasOperation_t cuTransA =
+		(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+	cublasOperation_t cuTransB =
+		(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+	CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
+		N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+}
+
+template<>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const float alpha, const float* A, const float* x,
-    const float beta, float* y) {
-  cublasOperation_t cuTransA =
-      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
-  CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
-      A, N, x, 1, &beta, y, 1));
+	const int N, const float alpha, const float* A, const float* x,
+	const float beta, float* y) {
+	cublasOperation_t cuTransA =
+		(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
+	CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
+		A, N, x, 1, &beta, y, 1));
 }
 
-template <>
+template<>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const double alpha, const double* A, const double* x,
-    const double beta, double* y) {
-  cublasOperation_t cuTransA =
-      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
-  CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
-      A, N, x, 1, &beta, y, 1));
+	const int N, const double alpha, const double* A, const double* x,
+	const double beta, double* y) {
+	cublasOperation_t cuTransA =
+		(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
+	CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
+		A, N, x, 1, &beta, y, 1));
 }
 
-template <>
+template<>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
-    float* Y) {
-  CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
+	float* Y) {
+	CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
-template <>
+template<>
 void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
-    double* Y) {
-  CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
+	double* Y) {
+	CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
 void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) {
-  if (X != Y) {
-    CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault));  // NOLINT(caffe/alt_fn)
-  }
+	if (X != Y) {
+		CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault));  // NOLINT(caffe/alt_fn)
+	}
 }
 
-template <>
+template<>
 void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
-  CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1));
+	CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1));
 }
 
-template <>
+template<>
 void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
-  CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1));
+	CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1));
 }
 
-template <>
+template<>
 void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-  caffe_gpu_scal<float>(N, beta, Y);
-  caffe_gpu_axpy<float>(N, alpha, X, Y);
+	const float beta, float* Y) {
+	caffe_gpu_scal<float>(N, beta, Y);
+	caffe_gpu_axpy<float>(N, alpha, X, Y);
 }
 
-template <>
+template<>
 void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-  caffe_gpu_scal<double>(N, beta, Y);
-  caffe_gpu_axpy<double>(N, alpha, X, Y);
+	const double beta, double* Y) {
+	caffe_gpu_scal<double>(N, beta, Y);
+	caffe_gpu_axpy<double>(N, alpha, X, Y);
 }
 
-template <>
+template<>
 void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
-    float* out) {
-  CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
+	float* out) {
+	CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
 }
 
-template <>
+template<>
 void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
-    double * out) {
-  CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
+	double * out) {
+	CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
 }
 
-template <>
+template<>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
-  CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y));
+	CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y));
 }
 
-template <>
+template<>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
-  CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
+	CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
 }
 
-template <>
+template<>
 void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
-                            float* y) {
-  CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1));
-  CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1));
+	float* y) {
+	CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1));
+	CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1));
 }
 
-template <>
+template<>
 void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
-                             double* y) {
-  CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1));
-  CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1));
+	double* y) {
+	CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1));
+	CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1));
 }
 
 template <typename Dtype>
 __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = alpha;
-  }
+	CUDA_KERNEL_LOOP(index, n) {
+		y[index] = alpha;
+	}
 }
 
-template <typename Dtype>
+template<typename Dtype>
 void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) {
-  if (alpha == 0) {
-    CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N));  // NOLINT(caffe/alt_fn)
-    return;
-  }
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  set_kernel<Dtype><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, alpha, Y);
+	if (alpha == 0) {
+		CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N));  // NOLINT(caffe/alt_fn)
+		return;
+	}
+	// NOLINT_NEXT_LINE(whitespace/operators)
+set_kernel<Dtype><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+	N, alpha, Y);
 }
 
 template void caffe_gpu_set<int>(const int N, const int alpha, int* Y);
@@ -166,300 +166,301 @@ template void caffe_gpu_set<double>(const int N, const double alpha, double* Y);
 
 template <typename Dtype>
 __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] += alpha;
-  }
+CUDA_KERNEL_LOOP(index, n) {
+	y[index] += alpha;
+}
 }
 
-template <>
+template<>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  add_scalar_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, alpha, Y);
+// NOLINT_NEXT_LINE(whitespace/operators)
+add_scalar_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, alpha, Y);
 }
 
-template <>
+template<>
 void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  add_scalar_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, alpha, Y);
+add_scalar_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, alpha, Y);
 }
 
 template <typename Dtype>
 __global__ void add_kernel(const int n, const Dtype* a,
-    const Dtype* b, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = a[index] + b[index];
-  }
+const Dtype* b, Dtype* y) {
+CUDA_KERNEL_LOOP(index, n) {
+y[index] = a[index] + b[index];
+}
 }
 
-template <>
+template<>
 void caffe_gpu_add<float>(const int N, const float* a, const float* b,
-    float* y) {
+float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  add_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, b, y);
+add_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_add<double>(const int N, const double* a, const double* b,
-    double* y) {
+double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  add_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, b, y);
+add_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, b, y);
 }
 
 template <typename Dtype>
 __global__ void sub_kernel(const int n, const Dtype* a,
-    const Dtype* b, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = a[index] - b[index];
-  }
+const Dtype* b, Dtype* y) {
+CUDA_KERNEL_LOOP(index, n) {
+y[index] = a[index] - b[index];
+}
 }
 
-template <>
+template<>
 void caffe_gpu_sub<float>(const int N, const float* a, const float* b,
-    float* y) {
+float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  sub_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, b, y);
+sub_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_sub<double>(const int N, const double* a, const double* b,
-    double* y) {
+double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  sub_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, b, y);
+sub_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, b, y);
 }
 
 template <typename Dtype>
 __global__ void mul_kernel(const int n, const Dtype* a,
-    const Dtype* b, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = a[index] * b[index];
-  }
+const Dtype* b, Dtype* y) {
+CUDA_KERNEL_LOOP(index, n) {
+y[index] = a[index] * b[index];
+}
 }
 
-template <>
+template<>
 void caffe_gpu_mul<float>(const int N, const float* a,
-    const float* b, float* y) {
+const float* b, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  mul_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, b, y);
+mul_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_mul<double>(const int N, const double* a,
-    const double* b, double* y) {
+const double* b, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  mul_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, b, y);
+mul_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, b, y);
 }
 
 template <typename Dtype>
 __global__ void div_kernel(const int n, const Dtype* a,
-    const Dtype* b, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = a[index] / b[index];
-  }
+const Dtype* b, Dtype* y) {
+CUDA_KERNEL_LOOP(index, n) {
+y[index] = a[index] / b[index];
+}
 }
 
-template <>
+template<>
 void caffe_gpu_div<float>(const int N, const float* a,
-    const float* b, float* y) {
+const float* b, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  div_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, b, y);
+div_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, b, y);
 }
 
-template <>
+template<>
 void caffe_gpu_div<double>(const int N, const double* a,
-    const double* b, double* y) {
+const double* b, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  div_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, b, y);
+div_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, b, y);
 }
 
 template <typename Dtype>
 __global__ void abs_kernel(const int n, const Dtype* a, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = abs(a[index]);
-  }
+CUDA_KERNEL_LOOP(index, n) {
+y[index] = abs(a[index]);
+}
 }
 
-template <>
+template<>
 void caffe_gpu_abs<float>(const int N, const float* a, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  abs_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, y);
+abs_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, y);
 }
 
-template <>
+template<>
 void caffe_gpu_abs<double>(const int N, const double* a, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  abs_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, y);
+abs_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, y);
 }
 
-
 template <typename Dtype>
 __global__ void exp_kernel(const int n, const Dtype* a, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = exp(a[index]);
-  }
+CUDA_KERNEL_LOOP(index, n) {
+y[index] = exp(a[index]);
+}
 }
 
-template <>
+template<>
 void caffe_gpu_exp<float>(const int N, const float* a, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  exp_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, y);
+exp_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, y);
 }
 
-template <>
+template<>
 void caffe_gpu_exp<double>(const int N, const double* a, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  exp_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, y);
+exp_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, y);
 }
 
 template <typename Dtype>
 __global__ void log_kernel(const int n, const Dtype* a, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = log(a[index]);
-  }
+CUDA_KERNEL_LOOP(index, n) {
+y[index] = log(a[index]);
+}
 }
 
-template <>
+template<>
 void caffe_gpu_log<float>(const int N, const float* a, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  log_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, y);
+log_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, y);
 }
 
-template <>
+template<>
 void caffe_gpu_log<double>(const int N, const double* a, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  log_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, y);
+log_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, y);
 }
 
 template <typename Dtype>
 __global__ void powx_kernel(const int n, const Dtype* a,
-    const Dtype alpha, Dtype* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = pow(a[index], alpha);
-  }
+const Dtype alpha, Dtype* y) {
+CUDA_KERNEL_LOOP(index, n) {
+y[index] = pow(a[index], alpha);
+}
 }
 
-template <>
+template<>
 void caffe_gpu_powx<float>(const int N, const float* a,
-    const float alpha, float* y) {
+const float alpha, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  powx_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, alpha, y);
+powx_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, alpha, y);
 }
 
-template <>
+template<>
 void caffe_gpu_powx<double>(const int N, const double* a,
-    const double alpha, double* y) {
+const double alpha, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
-  powx_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-      N, a, alpha, y);
+powx_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+N, a, alpha, y);
 }
 
 DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
-                                      - (x[index] < Dtype(0)));
+- (x[index] < Dtype(0)));
 DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
 
 __global__ void popc_kernel(const int n, const float* a,
-    const float* b, uint8_t* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = __popc(static_cast<uint32_t>(a[index]) ^
-                      static_cast<uint32_t>(b[index]));
-  }
+const float* b, uint8_t* y) {
+CUDA_KERNEL_LOOP(index, n)
+{
+y[index] = __popc(static_cast<uint32_t>(a[index]) ^
+static_cast<uint32_t>(b[index]));
+}
 }
 
 __global__ void popcll_kernel(const int n, const double* a,
-    const double* b, uint8_t* y) {
-  CUDA_KERNEL_LOOP(index, n) {
-    y[index] = __popcll(static_cast<uint64_t>(a[index]) ^
-                      static_cast<uint64_t>(b[index]));
-  }
+const double* b, uint8_t* y) {
+CUDA_KERNEL_LOOP(index, n)
+{
+y[index] = __popcll(static_cast<uint64_t>(a[index]) ^
+static_cast<uint64_t>(b[index]));
+}
 }
 
-template <>
+template<>
 uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
-                                  const float* y) {
+const float* y) {
   // TODO: Fix caffe_gpu_hamming_distance (see failing unit test
-  // TestHammingDistanceGPU in test_math_functions.cpp).
-  NOT_IMPLEMENTED;
-  thrust::device_vector<uint8_t> popcounts(n);
+	// TestHammingDistanceGPU in test_math_functions.cpp).
+NOT_IMPLEMENTED;
+thrust::device_vector < uint8_t > popcounts(n);
   // NOLINT_NEXT_LINE(whitespace/operators)
-  popc_kernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
-      n, x, y, thrust::raw_pointer_cast(popcounts.data()));
-  return thrust::reduce(popcounts.begin(), popcounts.end(),
-                        (uint32_t) 0, thrust::plus<uint32_t>());
+popc_kernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
+n, x, y, thrust::raw_pointer_cast(popcounts.data()));
+return thrust::reduce(popcounts.begin(), popcounts.end(),
+(uint32_t) 0, thrust::plus<uint32_t>());
 }
 
-template <>
+template<>
 uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
-                                   const double* y) {
+const double* y) {
   // TODO: Fix caffe_gpu_hamming_distance (see failing unit test
-  // TestHammingDistanceGPU in test_math_functions.cpp).
-  NOT_IMPLEMENTED;
-  thrust::device_vector<uint8_t> popcounts(n);
+	// TestHammingDistanceGPU in test_math_functions.cpp).
+NOT_IMPLEMENTED;
+thrust::device_vector < uint8_t > popcounts(n);
   // NOLINT_NEXT_LINE(whitespace/operators)
-  popcll_kernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
-      n, x, y, thrust::raw_pointer_cast(popcounts.data()));
-  return thrust::reduce(popcounts.begin(), popcounts.end(),
-                        /* NOLINT_NEXT_LINE(build/include_what_you_use) */
-                        (uint32_t) 0, thrust::plus<uint32_t>());
+popcll_kernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
+n, x, y, thrust::raw_pointer_cast(popcounts.data()));
+return thrust::reduce(popcounts.begin(), popcounts.end(),
+/* NOLINT_NEXT_LINE(build/include_what_you_use) */
+(uint32_t) 0, thrust::plus<uint32_t>());
 }
 
 void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
-  CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n));
+CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n));
 }
 
-template <>
+template<>
 void caffe_gpu_rng_uniform<float>(const int n, const float a, const float b,
-                                  float* r) {
-  CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n));
-  const float range = b - a;
-  if (range != static_cast<float>(1)) {
-    caffe_gpu_scal(n, range, r);
-  }
-  if (a != static_cast<float>(0)) {
-    caffe_gpu_add_scalar(n, a, r);
-  }
-}
-
-template <>
+float* r) {
+CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n));
+const float range = b - a;
+if (range != static_cast<float>(1)) {
+caffe_gpu_scal(n, range, r);
+}
+if (a != static_cast<float>(0)) {
+caffe_gpu_add_scalar(n, a, r);
+}
+}
+
+template<>
 void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
-                                   double* r) {
-  CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n));
-  const double range = b - a;
-  if (range != static_cast<double>(1)) {
-    caffe_gpu_scal(n, range, r);
-  }
-  if (a != static_cast<double>(0)) {
-    caffe_gpu_add_scalar(n, a, r);
-  }
-}
-
-template <>
+double* r) {
+CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n));
+const double range = b - a;
+if (range != static_cast<double>(1)) {
+caffe_gpu_scal(n, range, r);
+}
+if (a != static_cast<double>(0)) {
+caffe_gpu_add_scalar(n, a, r);
+}
+}
+
+template<>
 void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma,
-                            float* r) {
-  CURAND_CHECK(
-      curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma));
+float* r) {
+CURAND_CHECK(
+curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma));
 }
 
-template <>
+template<>
 void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
-                            double* r) {
-  CURAND_CHECK(
-      curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma));
+double* r) {
+CURAND_CHECK(
+curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma));
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 7f9631e2..6b8c5fee 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -32,51 +32,60 @@
 #include "caffe/common.hpp"
 #include "caffe/util/ocl_util.hpp"
 namespace caffe {
-template <typename dtype> extern std::string get_dtype_suffix();
+template<typename dtype> extern std::string get_dtype_suffix();
 
-template <typename Dtype>
-void ocl_memset(Dtype* buffer, const Dtype value, const int count){
-    std::string kernel_name = std::string("oclmem") + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int err=0;
-    err=clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer);
-    err|=clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&value);
-    err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count);
-    OCL_CHECK(err);
- 
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+template<typename Dtype>
+void ocl_memset(Dtype* buffer, const Dtype value, const int count) {
+	std::string kernel_name = std::string("oclmem") + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int err = 0;
+	err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
+	err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value);
+	err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count);
+	OCL_CHECK(err);
+
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 
 template void ocl_memset<int>(int* buffer, const int value, const int count);
-template void ocl_memset<float>(float* buffer, const float value, const int count);
-template void ocl_memset<double>(double* buffer, const double value, const int count);
-
+template void ocl_memset<float>(float* buffer, const float value,
+	const int count);
+template void ocl_memset<double>(double* buffer, const double value,
+	const int count);
 
-void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value, const int count){
-    cl_int err;
-    err =clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*)&buffer);
-    err|=clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&value);
-    err|=clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&count);
-    OCL_CHECK(err);
+void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
+	const int count) {
+	cl_int err;
+	err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
+	err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value);
+	err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count);
+	OCL_CHECK(err);
 
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 
-void eventCallback(cl_event event, cl_int event_status, void* user_data){
-    cl_ulong ev_start_time = (cl_ulong)0;
-    cl_ulong ev_end_time = (cl_ulong)0;
-    double run_time;
-    OCL_CHECK( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL) );
-    OCL_CHECK( clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL) );
-    run_time = (double)(ev_end_time - ev_start_time);
-    printf("The kernel's running time is %f s\n", run_time * 1.0e-9);
+void eventCallback(cl_event event, cl_int event_status, void* user_data) {
+	cl_ulong ev_start_time = (cl_ulong) 0;
+	cl_ulong ev_end_time = (cl_ulong) 0;
+	double run_time;
+	OCL_CHECK(
+		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
+			sizeof(cl_ulong), &ev_start_time, NULL));
+	OCL_CHECK(
+		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
+			&ev_end_time, NULL));
+	run_time = (double) (ev_end_time - ev_start_time);
+	printf("The kernel's running time is %f s\n", run_time * 1.0e-9);
 }
 
-
 }  // namespace caffe
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index c8f28426..f7cf9c07 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -34,1433 +34,1840 @@
 #include "caffe/util/ocl_wrapper.hpp"
 namespace caffe {
 typedef unsigned int uint32_t;
-struct array4x32 {  uint32_t v[4]; };
-template <typename Dtype>
-void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold)
-{
-        std::string kernel_name = "RNGBernoulli" + get_dtype_suffix<Dtype>();
-        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
-
-        static unsigned c = 0;
-        unsigned nrounds = 20;
-        array4x32  rndctr4;
-        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
-        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
-
-        cl_int ret;
-        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
-        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype),   (void*)&inf);
-        ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype),   (void*)&sup);
-        ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype),   (void*)&threshold);
-        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&nrounds);
-        ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint),    (void*)&size);
-        OCL_CHECK(ret);
-
-        size_t globalws[1] = {size};
-        size_t localws[1] = {256};
-        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
-}
-template void caffe_gpu_bernoulli<float>(int* a, const unsigned int n, float inf, float sup, float threshold);
-template void caffe_gpu_bernoulli<double>(int* a, const unsigned int n, double inf, double sup, double threshold);
-
-
-template <typename Dtype>
-void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){
-    std::string kernel_name = "transform" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret= clSetKernelArg(Kernel,0,sizeof(cl_mem),(void*)&src);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&dst);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&top_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&N_);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&M_);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&packing_num);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size2[]={(size_t)(M_ * packing_num)};
-    size_t uiLocal_Work_Size2[]={256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL) );
-}
-
-template void transform_gpu<float>(float* src, float* dst, const int top_offset, const int N_, const int M_, const int packing_num);
-template void transform_gpu<double>(double* src, double* dst, const int top_offset, const int N_, const int M_, const int packing_num);
-
-template <typename Dtype>
-void get_max_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* bottom_data, Dtype* scale_data){
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&scale_data) );
- 
-    size_t Global_Work_Size[1] = {(size_t)num};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void get_max_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* bottom_data, float* scale_data);
-template void get_max_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* bottom_data, double* scale_data);
-
-
-template <typename Dtype>
-void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out){
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
-
-    size_t Global_Work_Size[1] = {(size_t)num};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void exp_gpu<float>(cl_kernel Kernel, const int num, const float* data, float* out);
-template void exp_gpu<double>(cl_kernel Kernel, const int num, const double* data, double* out);
-
-template <typename Dtype>
-void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* scale, Dtype* data){
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&scale) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
-
-    size_t Global_Work_Size[1] = {(size_t) (num * dim)};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void softmax_div_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* scale, float* data);
-template void softmax_div_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* scale, double* data);
-
-template <typename Dtype>
-Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, const Dtype* prob_data, const Dtype* label, cl_mem d_loss){
-
-    OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem),     (void*)&prob_data));
-    OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem),  (void*)&d_loss));
-    OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem),   (void*)&label));
-    OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int),   (void*)&num));
-    OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int),   (void*)&dim));
-    OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype),    NULL));
-
-    size_t globalws[1] = {256};
-    size_t localws[1] = {256};
-    OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, localws, 0, NULL, NULL) );
-    void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL);
-    Dtype loss = *(Dtype*)h_loss;
-    clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, NULL);
-    
-    return loss;
-}
-
-template float softmax_gpu<float>(cl_kernel Kernel, const int num, const int dim, const float* prob_data, const float* label, cl_mem d_loss);
-template double softmax_gpu<double>(cl_kernel Kernel, const int num, const int dim, const double* prob_data, const double* label, cl_mem d_loss);
-
-template <typename Dtype>
+struct array4x32 {
+		uint32_t v[4];
+};
+template<typename Dtype>
+void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
+	Dtype threshold)
+	{
+	std::string kernel_name = "RNGBernoulli" + get_dtype_suffix<Dtype>();
+	cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+	static unsigned c = 0;
+	unsigned nrounds = 20;
+	array4x32 rndctr4;
+	rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+	cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+
+	cl_int ret;
+	ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a);
+	ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4);
+	ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf);
+	ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup);
+	ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*) &threshold);
+	ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &nrounds);
+	ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*) &size);
+	OCL_CHECK(ret);
+
+	size_t globalws[1] = { size };
+	size_t localws[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws,
+			localws, 0, NULL, NULL));
+}
+template void caffe_gpu_bernoulli<float>(int* a, const unsigned int n,
+	float inf, float sup, float threshold);
+template void caffe_gpu_bernoulli<double>(int* a, const unsigned int n,
+	double inf, double sup, double threshold);
+
+template<typename Dtype>
+void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_,
+	const int M_, const int packing_num) {
+	std::string kernel_name = "transform" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &src);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &dst);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &top_offset);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &N_);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &M_);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &packing_num);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) };
+	size_t uiLocal_Work_Size2[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
+}
+
+template void transform_gpu<float>(float* src, float* dst, const int top_offset,
+	const int N_, const int M_, const int packing_num);
+template void transform_gpu<double>(double* src, double* dst,
+	const int top_offset, const int N_, const int M_, const int packing_num);
+
+template<typename Dtype>
+void get_max_gpu(cl_kernel Kernel, const int num, const int dim,
+	const Dtype* bottom_data, Dtype* scale_data) {
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &scale_data));
+
+	size_t Global_Work_Size[1] = { (size_t) num };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void get_max_gpu<float>(cl_kernel Kernel, const int num, const int dim,
+	const float* bottom_data, float* scale_data);
+template void get_max_gpu<double>(cl_kernel Kernel, const int num,
+	const int dim, const double* bottom_data, double* scale_data);
+
+template<typename Dtype>
+void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) {
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
+
+	size_t Global_Work_Size[1] = { (size_t) num };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void exp_gpu<float>(cl_kernel Kernel, const int num, const float* data,
+	float* out);
+template void exp_gpu<double>(cl_kernel Kernel, const int num,
+	const double* data, double* out);
+
+template<typename Dtype>
+void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim,
+	const Dtype* scale, Dtype* data) {
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
+
+	size_t Global_Work_Size[1] = { (size_t)(num * dim) };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void softmax_div_gpu<float>(cl_kernel Kernel, const int num,
+	const int dim, const float* scale, float* data);
+template void softmax_div_gpu<double>(cl_kernel Kernel, const int num,
+	const int dim, const double* scale, double* data);
+
+template<typename Dtype>
+Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim,
+	const Dtype* prob_data, const Dtype* label, cl_mem d_loss) {
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL));
+
+	size_t globalws[1] = { 256 };
+	size_t localws[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws,
+			localws, 0, NULL, NULL));
+	void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE,
+		CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL);
+	Dtype loss = *(Dtype*) h_loss;
+	clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL,
+		NULL);
+
+	return loss;
+}
+
+template float softmax_gpu<float>(cl_kernel Kernel, const int num,
+	const int dim, const float* prob_data, const float* label, cl_mem d_loss);
+template double softmax_gpu<double>(cl_kernel Kernel, const int num,
+	const int dim, const double* prob_data, const double* label, cl_mem d_loss);
+
+template<typename Dtype>
 void kernel_channel_max(const int num, const int channels,
-    const int spatial_dim, const Dtype* data, Dtype* out)
-{
-    std::string kernel_name = "kernel_channel_max" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&out) );
-
-    size_t Global_Work_Size[1] = {(size_t) (num*spatial_dim)};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void kernel_channel_max<float>( const int num, const int channels,
-    const int spatial_dim, const float* data, float* out);
-template void kernel_channel_max<double>( const int num, const int channels,
-    const int spatial_dim, const double* data, double* out);
-
-template <typename Dtype>
-void kernel_channel_subtract( const int count,
-    const int num, const int channels,
-    const int spatial_dim, const Dtype* channel_max, Dtype* data)
-{
-    std::string kernel_name = "kernel_channel_subtract" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&spatial_dim) );
-    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_max) );
-    OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) );
-
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void kernel_channel_subtract<float>( const int count,
-    const int num, const int channels,
-    const int spatial_dim, const float* channel_max, float* data);
-template void kernel_channel_subtract<double>( const int count,
-    const int num, const int channels,
-    const int spatial_dim, const double* channel_max, double* data);
-
-template <typename Dtype>
+	const int spatial_dim, const Dtype* data, Dtype* out)
+	{
+	std::string kernel_name = "kernel_channel_max" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
+	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &out));
+
+	size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void kernel_channel_max<float>(const int num, const int channels,
+	const int spatial_dim, const float* data, float* out);
+template void kernel_channel_max<double>(const int num, const int channels,
+	const int spatial_dim, const double* data, double* out);
+
+template<typename Dtype>
+void kernel_channel_subtract(const int count,
+	const int num, const int channels,
+	const int spatial_dim, const Dtype* channel_max, Dtype* data)
+	{
+	std::string kernel_name = "kernel_channel_subtract"
+		+ get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_max));
+	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data));
+
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void kernel_channel_subtract<float>(const int count,
+	const int num, const int channels,
+	const int spatial_dim, const float* channel_max, float* data);
+template void kernel_channel_subtract<double>(const int count,
+	const int num, const int channels,
+	const int spatial_dim, const double* channel_max, double* data);
+
+template<typename Dtype>
 void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-{
-    std::string kernel_name = "kernel_mul" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	{
+	std::string kernel_name = "kernel_mul" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void kernel_mul<float>(const int count, const float* a, const float* b, float* out);
-template void kernel_mul<double>(const int count, const double* a, const double* b, double* out);
+template void kernel_mul<float>(const int count, const float* a, const float* b,
+	float* out);
+template void kernel_mul<double>(const int count, const double* a,
+	const double* b, double* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_add_scalar(const int count, const Dtype data, Dtype* out)
-{
-    std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
-
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void kernel_add_scalar<float>(const int count, const float data, float* out);
-template void kernel_add_scalar<double>(const int count, const double data, double* out);
-
-
-template <typename Dtype>
-void kernel_powx(const int count, const Dtype* data, const Dtype alpha, Dtype* out)
-{
-    std::string kernel_name = "kernel_powx" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
-
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void kernel_powx<float>(const int count, const float* data, const float alpha, float* out);
-template void kernel_powx<double>(const int count, const double* data, const double alpha, double* out);
-
-template <typename Dtype>
+	{
+	std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &data));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
+
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void kernel_add_scalar<float>(const int count, const float data,
+	float* out);
+template void kernel_add_scalar<double>(const int count, const double data,
+	double* out);
+
+template<typename Dtype>
+void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
+	Dtype* out)
+	{
+	std::string kernel_name = "kernel_powx" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
+
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void kernel_powx<float>(const int count, const float* data,
+	const float alpha, float* out);
+template void kernel_powx<double>(const int count, const double* data,
+	const double alpha, double* out);
+
+template<typename Dtype>
 void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-{
-    std::string kernel_name = "kernel_div" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	{
+	std::string kernel_name = "kernel_div" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void kernel_div<float>(const int count, const float* a, const float* b, float* out);
-template void kernel_div<double>(const int count, const double* a, const double* b, double* out);
+template void kernel_div<float>(const int count, const float* a, const float* b,
+	float* out);
+template void kernel_div<double>(const int count, const double* a,
+	const double* b, double* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-{
-    std::string kernel_name = "kernel_add" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	{
+	std::string kernel_name = "kernel_add" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void kernel_add<float>(const int count, const float* a, const float* b, float* out);
-template void kernel_add<double>(const int count, const double* a, const double* b, double* out);
+template void kernel_add<float>(const int count, const float* a, const float* b,
+	float* out);
+template void kernel_add<double>(const int count, const double* a,
+	const double* b, double* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-{
-    std::string kernel_name = "kernel_sub" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	{
+	std::string kernel_name = "kernel_sub" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&out) );
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void kernel_sub<float>(const int count, const float* a, const float* b, float* out);
-template void kernel_sub<double>(const int count, const double* a, const double* b, double* out);
+template void kernel_sub<float>(const int count, const float* a, const float* b,
+	float* out);
+template void kernel_sub<double>(const int count, const double* a,
+	const double* b, double* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_log(const int count, const Dtype* data, Dtype* out)
-{
-    std::string kernel_name = "kernel_log" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	{
+	std::string kernel_name = "kernel_log" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
 
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_log<float>(const int count, const float* data, float* out);
-template void kernel_log<double>(const int count, const double* data, double* out);
-
+template void kernel_log<double>(const int count, const double* data,
+	double* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_exp(const int count, const Dtype* data, Dtype* out)
-{
-    std::string kernel_name = "kernel_exp" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	{
+	std::string kernel_name = "kernel_exp" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&out) );
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
 
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_exp<float>(const int count, const float* data, float* out);
-template void kernel_exp<double>(const int count, const double* data, double* out);
+template void kernel_exp<double>(const int count, const double* data,
+	double* out);
 
-template <typename Dtype>
+template<typename Dtype>
 void kernel_channel_sum(const int num, const int channels,
-    const int spatial_dim, const Dtype* data, Dtype* channel_sum)
-{
-    std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) );
-
-    size_t Global_Work_Size[1] = {(size_t)(num*channels)};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void kernel_channel_sum<float>(const int num, const int channels, const int spatial_dim, const float* data, float* channel_sum);
-template void kernel_channel_sum<double>(const int num, const int channels, const int spatial_dim, const double* data, double* channel_sum);
-
-template <typename Dtype>
+	const int spatial_dim, const Dtype* data, Dtype* channel_sum)
+	{
+	std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
+	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum));
+
+	size_t Global_Work_Size[1] = { (size_t)(num * channels) };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void kernel_channel_sum<float>(const int num, const int channels,
+	const int spatial_dim, const float* data, float* channel_sum);
+template void kernel_channel_sum<double>(const int num, const int channels,
+	const int spatial_dim, const double* data, double* channel_sum);
+
+template<typename Dtype>
 void kernel_channel_div(const int count, const int num, const int channels,
-    const int spatial_dim, const Dtype* channel_sum, Dtype* data)
-{
-    std::string kernel_name = "kernel_channel_div" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&channels) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&spatial_dim) );
-    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&channel_sum) );
-    OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&data) );
-
-    size_t Global_Work_Size[1] = {(size_t)count};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template  void kernel_channel_div<float>(const int count, const int num, const int channels,
-    const int spatial_dim, const float* channel_sum, float* data);
-template  void kernel_channel_div<double>(const int count, const int num, const int channels,
-    const int spatial_dim, const double* channel_sum, double* data);
-
-template <typename Dtype>
+	const int spatial_dim, const Dtype* channel_sum, Dtype* data)
+	{
+	std::string kernel_name = "kernel_channel_div" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum));
+	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data));
+
+	size_t Global_Work_Size[1] = { (size_t) count };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void kernel_channel_div<float>(const int count, const int num,
+	const int channels,
+	const int spatial_dim, const float* channel_sum, float* data);
+template void kernel_channel_div<double>(const int count, const int num,
+	const int channels,
+	const int spatial_dim, const double* channel_sum, double* data);
+
+template<typename Dtype>
 void kernel_channel_dot(const int num, const int channels,
-    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
-    Dtype* channel_dot)
-{
-    std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&spatial_dim) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&data_1) );
-    OCL_CHECK( clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&data_2) );
-    OCL_CHECK( clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&channel_dot) );
-      
-    size_t Global_Work_Size[1] = {(size_t)(num*spatial_dim)};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
+	const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+	Dtype* channel_dot)
+	{
+	std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data_1));
+	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &data_2));
+	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &channel_dot));
+
+	size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_dot<float>(const int num, const int channels,
-    const int spatial_dim, const float* data_1, const float* data_2, float* channel_dot);
+	const int spatial_dim, const float* data_1, const float* data_2,
+	float* channel_dot);
 template void kernel_channel_dot<double>(const int num, const int channels,
-    const int spatial_dim, const double* data_1, const double* data_2, double* channel_dot);
-
+	const int spatial_dim, const double* data_1, const double* data_2,
+	double* channel_dot);
 
-template <typename Dtype>
+template<typename Dtype>
 void SoftmaxLossForwardGPU(const int nthreads,
-          const Dtype* prob_data, const Dtype* label, Dtype* loss,
-          const int num, const int dim, const int spatial_dim,
-          const bool has_ignore_label_, const int ignore_label_,
-          Dtype* counts)
-{
-    std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int),  (void*)&nthreads));
-    OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem),  (void*)&prob_data));
-    OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem),  (void*)&label));
-    OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem),  (void*)&loss));
-    OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int),  (void*)&num));
-    OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int),  (void*)&dim));
-    OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int),  (void*)&spatial_dim));
-    OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_bool),  (void*)&has_ignore_label_));
-    OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int),  (void*)&ignore_label_));
-    OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem),  (void*)&counts));
-    
-   size_t Global_Work_Size[1] = {(size_t)nthreads};
-   size_t Local_Work_Size[1] = {256};
-   OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void SoftmaxLossForwardGPU<float>(const int nthreads, const float* prob_data, const float* label, float* loss,
-          const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,float* counts);
-template void SoftmaxLossForwardGPU<double>(const int nthreads, const double* prob_data, const double* label, double* loss,
-          const int num, const int dim, const int spatial_dim,const bool has_ignore_label_, const int ignore_label_,double* counts);
-
-template <typename Dtype>
+	const Dtype* prob_data, const Dtype* label, Dtype* loss,
+	const int num, const int dim, const int spatial_dim,
+	const bool has_ignore_label_, const int ignore_label_,
+	Dtype* counts)
+	{
+	std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &loss));
+	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
+	OCL_CHECK(
+		clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_));
+	OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
+	OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
+
+	size_t Global_Work_Size[1] = { (size_t) nthreads };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void SoftmaxLossForwardGPU<float>(const int nthreads,
+	const float* prob_data, const float* label, float* loss,
+	const int num, const int dim, const int spatial_dim,
+	const bool has_ignore_label_, const int ignore_label_, float* counts);
+template void SoftmaxLossForwardGPU<double>(const int nthreads,
+	const double* prob_data, const double* label, double* loss,
+	const int num, const int dim, const int spatial_dim,
+	const bool has_ignore_label_, const int ignore_label_, double* counts);
+
+template<typename Dtype>
 void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
-          const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
-          const int spatial_dim, const bool has_ignore_label_,
-          const int ignore_label_, Dtype* counts)
-{
-    std::string kernel_name = "SoftmaxLossBackwardGPU" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int),  (void*)&nthreads));
-    OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem),  (void*)&top));
-    OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem),  (void*)&label));
-    OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem),  (void*)&bottom_diff));
-    OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int),  (void*)&num));
-    OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int),  (void*)&dim));
-    OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int),  (void*)&spatial_dim));
-    OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_bool),  (void*)&has_ignore_label_));
-    OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int),  (void*)&ignore_label_));
-    OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem),  (void*)&counts));
-
-   size_t Global_Work_Size[1] = {(size_t)nthreads};
-   size_t Local_Work_Size[1] = {256};
-   OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void SoftmaxLossBackwardGPU<float>(const int nthreads, const float* top, const float* label, float* bottom_diff, 
-                       const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, float* counts);
-template void SoftmaxLossBackwardGPU<double>(const int nthreads, const double* top, const double* label, double* bottom_diff, 
-                       const int num, const int dim, const int spatial_dim, const bool has_ignore_label_, const int ignore_label_, double* counts);
-
-template <typename Dtype>
-void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data){
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) );
-
-    size_t Global_Work_Size[1] = {(size_t)num};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void scal_gpu<float>(cl_kernel Kernel, const int num, const float alpha, float* data);
-template void scal_gpu<double>(cl_kernel Kernel, const int num, const double alpha, double* data);
-
-template <typename Dtype>
-void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, const Dtype* label){
-    OCL_CHECK( clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&num) );
-    OCL_CHECK( clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&dim) );
-    OCL_CHECK( clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&data) );
-    OCL_CHECK( clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&label) );
-
-    size_t Global_Work_Size[1] = {(size_t)num};
-    size_t Local_Work_Size[1] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL) );
-}
-
-template void diff_gpu<float>(cl_kernel Kernel, const int num, const int dim, float* data, const float* label);
-template void diff_gpu<double>(cl_kernel Kernel, const int num, const int dim, double* data, const double* label);
-
-template <typename Dtype>
-void max_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* top_data){
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_);
-    ret |= clSetKernelArg(Kernel,10, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template  void max_pool_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* top_data);
-template  void max_pool_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* top_data);
-
-template <typename Dtype>
-void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, Dtype* top_mask){
-     std::string kernel_name = "MaxPoolForward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-   
-     cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_);
-    ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_);
-    ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_);
-    ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*)&pad_h_);
-    ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*)&pad_w_);
-    ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data);
-    ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*)&mask);
-    ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*)&top_mask);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void MaxPoolForward<float>(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data, int* mask, float* top_mask);
-template void MaxPoolForward<double>(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data, int* mask, double* top_mask);
-
-template <typename Dtype>
-void StoPoolForwardTrain(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* idx_data, Dtype* top_data)
-{
-    std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_);
-    ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_);
-    ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_);
-    ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&idx_data);
-    ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void StoPoolForwardTrain<float>(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
-template void StoPoolForwardTrain<double>(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* idx_data, double* top_data);
-
-template <typename Dtype>
-void StoPoolForwardTest(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, Dtype* top_data){
-    std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_);
-    ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_);
-    ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_);
-    ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-
-}
-template void StoPoolForwardTest<float>(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, float* top_data);
-template void StoPoolForwardTest<double>(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, double* top_data);
-
-template <typename Dtype>
-void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, Dtype* top_data){
-        std::string kernel_name = "AvePoolForward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h_);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w_);
-    ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*)&stride_h_);
-    ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*)&stride_w_);
-    ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*)&pad_h_);
-    ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*)&pad_w_);
-    ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)count};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-template void AvePoolForward<float>(const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, float* top_data);
-template void AvePoolForward<double>(const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_,  const int kernel_h_, const int kernel_w_, const int stride_h_, const int stride_w_, const int pad_h_, const int pad_w_, double* top_data);
-
-template <typename Dtype> 
-void ave_pool_fp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* top_data){
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_);
-    ret |= clSetKernelArg(Kernel, 10,sizeof(cl_int), (void*)&pad_);
-    ret |= clSetKernelArg(Kernel, 11,sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)count};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-
-template void ave_pool_fp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* top_data);
-template void ave_pool_fp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_,const int stride_,const int pad_, double* top_data);
-
-template <typename Dtype> 
-void max_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, Dtype* bottom_diff ){
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&clnum);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&channels_);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&height_);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&width_);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_height_);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&pooled_width_);
-    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_size_);
-    ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_);
-    ret |= clSetKernelArg(Kernel,12, sizeof(cl_mem), (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)count};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-
-template void max_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const float* bottom_data, const float* top_data, const float* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, float* bottom_diff);
-template void max_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* bottom_data, const double* top_data, const double* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, double* bottom_diff );
-
-template <typename Dtype>
-void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, const int* const mask, const Dtype* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff){
-        std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&mask);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_mask);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&num);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&channels);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&height);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&width);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_height);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&pooled_width);
-    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_h);
-    ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&kernel_w);
-    ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&stride_h);
-    ret |= clSetKernelArg(Kernel,13, sizeof(cl_int), (void*)&stride_w);
-    ret |= clSetKernelArg(Kernel,14, sizeof(cl_int), (void*)&pad_h);
-    ret |= clSetKernelArg(Kernel,15, sizeof(cl_int), (void*)&pad_w);
-    ret |= clSetKernelArg(Kernel,16, sizeof(cl_mem), (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)nthreads};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-
-template void MaxPoolBackward<float>(const int nthreads, const float* const top_diff, const int* const mask, const float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff);
-template void MaxPoolBackward<double>(const int nthreads, const double* const top_diff, const int* const mask, const double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff);
-
-template <typename Dtype>
-void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, Dtype* const bottom_diff)
-{
-    std::string kernel_name = "AvePoolBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&num);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_h);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_w);
-    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&stride_h);
-    ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_w);
-    ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&pad_h);
-    ret |= clSetKernelArg(Kernel,13, sizeof(cl_int), (void*)&pad_w);
-    ret |= clSetKernelArg(Kernel,14, sizeof(cl_mem), (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)nthreads};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-template void AvePoolBackward<float>(const int nthreads, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, float* const bottom_diff);
-template void AvePoolBackward<double>(const int nthreads, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, double* const bottom_diff);
-
-template <typename Dtype>
-void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, Dtype* const bottom_diff){
-        std::string kernel_name = "StoPoolBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&nthreads);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&rand_idx);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&num);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&channels);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&height);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&width);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_height);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&pooled_width);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&kernel_h);
-    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&kernel_w);
-    ret |= clSetKernelArg(Kernel,11, sizeof(cl_int), (void*)&stride_h);
-    ret |= clSetKernelArg(Kernel,12, sizeof(cl_int), (void*)&stride_w);
-    ret |= clSetKernelArg(Kernel,13, sizeof(cl_mem), (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)nthreads};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-template void StoPoolBackward<float>(const int nthreads, const float* const rand_idx, const float* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, float* const bottom_diff);
-template void StoPoolBackward<double>(const int nthreads, const double* const rand_idx, const double* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, double* const bottom_diff);
-
-template <typename Dtype> 
-void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, const int clnum, const int channels_, const int height_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, Dtype* bottom_diff){
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&clnum);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*)&channels_);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&height_);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*)&width_);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&pooled_height_);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&pooled_width_);
-    ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*)&kernel_size_);
-    ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*)&stride_);
-    ret |= clSetKernelArg(Kernel,10, sizeof(cl_int), (void*)&pad_);
-    ret |= clSetKernelArg(Kernel,11, sizeof(cl_mem), (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[]={(size_t)count};
-    size_t uiLocal_Work_Size[]={256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue,Kernel,1,NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL));
-}
-
-template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count, const float* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, float* bottom_diff);
-template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count, const double* top_diff, const int clnum, const int channels_, const int intheight_, const int width_, const int pooled_height_, const int pooled_width_, const int kernel_size_, const int stride_, const int pad_, double* bottom_diff);
-
-
-template <typename Dtype> 
-void PReLUForward(const int count, const int channels, const int dim, const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, const int div_factor){
-    std::string kernel_name = "PReLUForward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&dim);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&top_data);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&slope_data);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*)&div_factor);
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void PReLUForward<float>(const int count, const int channels, const int dim,const float* bottom_data, float* top_data, const float* slope_data, const int div_factor);
-template void PReLUForward<double>(const int count, const int channels, const int dim,const double* bottom_data, double* top_data, const double* slope_data, const int div_factor);
-
-template <typename Dtype> 
-void PReLUBackward(const int count, const int channels, const int dim, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, const Dtype* slope_data, const int div_factor){
-    std::string kernel_name = "PReLUBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*)&channels);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&dim);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff);
-    ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*)&slope_data);
-    ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*)&div_factor);
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void PReLUBackward<float>(const int count, const int channels, const int dim, const float* top_diff, const float* bottom_data, float* bottom_diff, const float* slope_data, const int div_factor);
-template void PReLUBackward<double>(const int count, const int channels, const int dim, const double* top_diff, const double* bottom_data, double* bottom_diff, const double* slope_data, const int div_factor);
-
-template <typename Dtype> 
-void PReLUParamBackward(const int count, const Dtype* top_diff, const int offset_out, const Dtype* bottom_data, const int offset_in, Dtype* bottom_diff){
-    std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
-    ret  = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*)&offset_out);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_data);
-    ret  = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*)&offset_in);
-    ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*)&bottom_diff);
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void PReLUParamBackward<float>(const int count, const float* top_diff, const int offset_out, const float* bottom_data, const int offset_in, float* bottom_diff);
-template void PReLUParamBackward<double>(const int count, const double* top_diff, const int offset_out, const double* bottom_data, const int offset_in, double* bottom_diff);
-
-
-template <typename Dtype> 
-void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, Dtype negative_slope){
-    std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*)&negative_slope);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void ReLUForward<float>(const int count, const float* bottom_data, float* top_data, float negative_slope);
-template void ReLUForward<double>(const int count, const double* bottom_data, double* top_data, double negative_slope);
-
-template <typename Dtype> 
-void ReLUBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope){
-    std::string kernel_name = "ReLUBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-  
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
-    ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*)&negative_slope);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)count};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
-}
-template void ReLUBackward<float>(const int count, const float* top_diff, const float* bottom_data, float* bottom_diff, float negative_slope);
-template void ReLUBackward<double>(const int count, const double* top_diff, const double* bottom_data, double* bottom_diff, double negative_slope);
-
-template <typename Dtype> 
-void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data){
-    std::string kernel_name = "SigmoidForward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void SigmoidForward<float>(const int count, const float* bottom_data, float* top_data);
-template void SigmoidForward<double>(const int count, const double* bottom_data, double* top_data);
-
-template <typename Dtype> 
-void SigmoidBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff){
-    std::string kernel_name = "SigmoidBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-  
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)count};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
-}
-template void SigmoidBackward<float>(const int count, const float* top_diff, const float* top_data, float* bottom_diff);
-template void SigmoidBackward<double>(const int count, const double* top_diff, const double* top_data, double* bottom_diff);
-
-template <typename Dtype> 
-void ThresholdForward(const int count, const Dtype threshold, const Dtype* bottom_data, Dtype* top_data){
-    std::string kernel_name = "ThresholdForward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&threshold);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void ThresholdForward<float>(const int count, const float threshold, const float* bottom_data, float* top_data);
-template void ThresholdForward<double>(const int count, const double threshold, const double* bottom_data, double* top_data);
-
-template <typename Dtype> 
-void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data){
-    std::string kernel_name = "TanHForward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&bottom_data);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void TanHForward<float>(const int count, const float* bottom_data, float* top_data);
-template void TanHForward<double>(const int count, const double* bottom_data, double* top_data);
-
-template <typename Dtype> 
-void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, Dtype* bottom_diff){
-    std::string kernel_name = "TanHBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-  
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&count);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&top_diff);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)count};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
-}
-template void TanHBackward<float>(const int count, const float* top_diff, const float* top_data, float* bottom_diff);
-template void TanHBackward<double>(const int count, const double* top_diff, const double* top_data, double* bottom_diff);
-
-template <typename Dtype>
+	const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+	const int spatial_dim, const bool has_ignore_label_,
+	const int ignore_label_, Dtype* counts)
+	{
+	std::string kernel_name = "SoftmaxLossBackwardGPU"
+		+ get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff));
+	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
+	OCL_CHECK(
+		clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_));
+	OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
+	OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
+
+	size_t Global_Work_Size[1] = { (size_t) nthreads };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void SoftmaxLossBackwardGPU<float>(const int nthreads,
+	const float* top, const float* label, float* bottom_diff,
+	const int num, const int dim, const int spatial_dim,
+	const bool has_ignore_label_, const int ignore_label_, float* counts);
+template void SoftmaxLossBackwardGPU<double>(const int nthreads,
+	const double* top, const double* label, double* bottom_diff,
+	const int num, const int dim, const int spatial_dim,
+	const bool has_ignore_label_, const int ignore_label_, double* counts);
+
+template<typename Dtype>
+void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) {
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data));
+
+	size_t Global_Work_Size[1] = { (size_t) num };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void scal_gpu<float>(cl_kernel Kernel, const int num,
+	const float alpha, float* data);
+template void scal_gpu<double>(cl_kernel Kernel, const int num,
+	const double alpha, double* data);
+
+template<typename Dtype>
+void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data,
+	const Dtype* label) {
+	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
+	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data));
+	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &label));
+
+	size_t Global_Work_Size[1] = { (size_t) num };
+	size_t Local_Work_Size[1] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void diff_gpu<float>(cl_kernel Kernel, const int num, const int dim,
+	float* data, const float* label);
+template void diff_gpu<double>(cl_kernel Kernel, const int num, const int dim,
+	double* data, const double* label);
+
+template<typename Dtype>
+void max_pool_fp_gpu(cl_kernel Kernel, const int count,
+	const Dtype* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	Dtype* top_data) {
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void max_pool_fp_gpu<float>(cl_kernel Kernel, const int count,
+	const float* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	float* top_data);
+template void max_pool_fp_gpu<double>(cl_kernel Kernel, const int count,
+	const double* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	double* top_data);
+
+template<typename Dtype>
+void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum,
+	const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
+	Dtype* top_mask) {
+	std::string kernel_name = "MaxPoolForward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_);
+	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*) &mask);
+	ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &top_mask);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void MaxPoolForward<float>(const int count, const float* bottom_data,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	const int pad_h_, const int pad_w_, float* top_data, int* mask,
+	float* top_mask);
+template void MaxPoolForward<double>(const int count, const double* bottom_data,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	const int pad_h_, const int pad_w_, double* top_data, int* mask,
+	double* top_mask);
+
+template<typename Dtype>
+void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	Dtype* idx_data, Dtype* top_data)
+	{
+	std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &idx_data);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void StoPoolForwardTrain<float>(const int count,
+	const float* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_h_, const int kernel_w_,
+	const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
+template void StoPoolForwardTrain<double>(const int count,
+	const double* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_h_, const int kernel_w_,
+	const int stride_h_, const int stride_w_, double* idx_data, double* top_data);
+
+template<typename Dtype>
+void StoPoolForwardTest(const int count, const Dtype* bottom_data,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	Dtype* top_data) {
+	std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+
+}
+template void StoPoolForwardTest<float>(const int count,
+	const float* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_h_, const int kernel_w_,
+	const int stride_h_, const int stride_w_, float* top_data);
+template void StoPoolForwardTest<double>(const int count,
+	const double* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_h_, const int kernel_w_,
+	const int stride_h_, const int stride_w_, double* top_data);
+
+template<typename Dtype>
+void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
+	const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	const int pad_h_, const int pad_w_, Dtype* top_data) {
+	std::string kernel_name = "AvePoolForward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_);
+	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) count };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void AvePoolForward<float>(const int count, const float* bottom_data,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	const int pad_h_, const int pad_w_, float* top_data);
+template void AvePoolForward<double>(const int count, const double* bottom_data,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_h_,
+	const int kernel_w_, const int stride_h_, const int stride_w_,
+	const int pad_h_, const int pad_w_, double* top_data);
+
+template<typename Dtype>
+void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
+	const Dtype* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	const int pad_, Dtype* top_data) {
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) count };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+
+template void ave_pool_fp_gpu<float>(cl_kernel Kernel, const int count,
+	const float* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	const int pad_, float* top_data);
+template void ave_pool_fp_gpu<double>(cl_kernel Kernel, const int count,
+	const double* bottom_data, const int clnum, const int channels_,
+	const int height_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	const int pad_, double* top_data);
+
+template<typename Dtype>
+void max_pool_bp_gpu(cl_kernel Kernel, const int count,
+	const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_size_,
+	const int stride_, Dtype* bottom_diff) {
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &clnum);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels_);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height_);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width_);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height_);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width_);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_size_);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) count };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+
+template void max_pool_bp_gpu<float>(cl_kernel Kernel, const int count,
+	const float* bottom_data, const float* top_data, const float* top_diff,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_size_,
+	const int stride_, float* bottom_diff);
+template void max_pool_bp_gpu<double>(cl_kernel Kernel, const int count,
+	const double* bottom_data, const double* top_data, const double* top_diff,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_size_,
+	const int stride_, double* bottom_diff);
+
+template<typename Dtype>
+void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
+	const int* const mask, const Dtype* const top_mask, const int num,
+	const int channels, const int height, const int width,
+	const int pooled_height, const int pooled_width, const int kernel_h,
+	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+	const int pad_w, Dtype* const bottom_diff) {
+	std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &mask);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_mask);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_h);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &kernel_w);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_h);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &stride_w);
+	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &pad_h);
+	ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &pad_w);
+	ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+
+template void MaxPoolBackward<float>(const int nthreads,
+	const float* const top_diff, const int* const mask,
+	const float* const top_mask, const int num, const int channels,
+	const int height, const int width, const int pooled_height,
+	const int pooled_width, const int kernel_h, const int kernel_w,
+	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+	float* const bottom_diff);
+template void MaxPoolBackward<double>(const int nthreads,
+	const double* const top_diff, const int* const mask,
+	const double* const top_mask, const int num, const int channels,
+	const int height, const int width, const int pooled_height,
+	const int pooled_width, const int kernel_h, const int kernel_w,
+	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+	double* const bottom_diff);
+
+template<typename Dtype>
+void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
+	const int num, const int channels, const int height, const int width,
+	const int pooled_height, const int pooled_width, const int kernel_h,
+	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+	const int pad_w, Dtype* const bottom_diff)
+	{
+	std::string kernel_name = "AvePoolBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &num);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w);
+	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void AvePoolBackward<float>(const int nthreads,
+	const float* const top_diff, const int num, const int channels,
+	const int height, const int width, const int pooled_height,
+	const int pooled_width, const int kernel_h, const int kernel_w,
+	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+	float* const bottom_diff);
+template void AvePoolBackward<double>(const int nthreads,
+	const double* const top_diff, const int num, const int channels,
+	const int height, const int width, const int pooled_height,
+	const int pooled_width, const int kernel_h, const int kernel_w,
+	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+	double* const bottom_diff);
+
+template<typename Dtype>
+void StoPoolBackward(const int nthreads, const Dtype* const rand_idx,
+	const Dtype* const top_diff, const int num, const int channels,
+	const int height, const int width, const int pooled_height,
+	const int pooled_width, const int kernel_h, const int kernel_w,
+	const int stride_h, const int stride_w, Dtype* const bottom_diff) {
+	std::string kernel_name = "StoPoolBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &rand_idx);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_height);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_width);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_h);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_w);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_h);
+	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_w);
+	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void StoPoolBackward<float>(const int nthreads,
+	const float* const rand_idx, const float* const top_diff, const int num,
+	const int channels, const int height, const int width,
+	const int pooled_height, const int pooled_width, const int kernel_h,
+	const int kernel_w, const int stride_h, const int stride_w,
+	float* const bottom_diff);
+template void StoPoolBackward<double>(const int nthreads,
+	const double* const rand_idx, const double* const top_diff, const int num,
+	const int channels, const int height, const int width,
+	const int pooled_height, const int pooled_width, const int kernel_h,
+	const int kernel_w, const int stride_h, const int stride_w,
+	double* const bottom_diff);
+
+template<typename Dtype>
+void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
+	const int clnum, const int channels_, const int height_, const int width_,
+	const int pooled_height_, const int pooled_width_, const int kernel_size_,
+	const int stride_, const int pad_, Dtype* bottom_diff) {
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
+	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
+	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_);
+	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) count };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+
+template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count,
+	const float* top_diff, const int clnum, const int channels_,
+	const int intheight_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	const int pad_, float* bottom_diff);
+template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count,
+	const double* top_diff, const int clnum, const int channels_,
+	const int intheight_, const int width_, const int pooled_height_,
+	const int pooled_width_, const int kernel_size_, const int stride_,
+	const int pad_, double* bottom_diff);
+
+template<typename Dtype>
+void PReLUForward(const int count, const int channels, const int dim,
+	const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
+	const int div_factor) {
+	std::string kernel_name = "PReLUForward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &slope_data);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &div_factor);
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void PReLUForward<float>(const int count, const int channels,
+	const int dim, const float* bottom_data, float* top_data,
+	const float* slope_data, const int div_factor);
+template void PReLUForward<double>(const int count, const int channels,
+	const int dim, const double* bottom_data, double* top_data,
+	const double* slope_data, const int div_factor);
+
+template<typename Dtype>
+void PReLUBackward(const int count, const int channels, const int dim,
+	const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
+	const Dtype* slope_data, const int div_factor) {
+	std::string kernel_name = "PReLUBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &slope_data);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &div_factor);
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void PReLUBackward<float>(const int count, const int channels,
+	const int dim, const float* top_diff, const float* bottom_data,
+	float* bottom_diff, const float* slope_data, const int div_factor);
+template void PReLUBackward<double>(const int count, const int channels,
+	const int dim, const double* top_diff, const double* bottom_data,
+	double* bottom_diff, const double* slope_data, const int div_factor);
+
+template<typename Dtype>
+void PReLUParamBackward(const int count, const Dtype* top_diff,
+	const int offset_out, const Dtype* bottom_data, const int offset_in,
+	Dtype* bottom_diff) {
+	std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offset_out);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data);
+	ret = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offset_in);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void PReLUParamBackward<float>(const int count, const float* top_diff,
+	const int offset_out, const float* bottom_data, const int offset_in,
+	float* bottom_diff);
+template void PReLUParamBackward<double>(const int count,
+	const double* top_diff, const int offset_out, const double* bottom_data,
+	const int offset_in, double* bottom_diff);
+
+template<typename Dtype>
+void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
+	Dtype negative_slope) {
+	std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*) &negative_slope);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void ReLUForward<float>(const int count, const float* bottom_data,
+	float* top_data, float negative_slope);
+template void ReLUForward<double>(const int count, const double* bottom_data,
+	double* top_data, double negative_slope);
+
+template<typename Dtype>
+void ReLUBackward(const int count, const Dtype* top_diff,
+	const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) {
+	std::string kernel_name = "ReLUBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*) &negative_slope);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) count };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void ReLUBackward<float>(const int count, const float* top_diff,
+	const float* bottom_data, float* bottom_diff, float negative_slope);
+template void ReLUBackward<double>(const int count, const double* top_diff,
+	const double* bottom_data, double* bottom_diff, double negative_slope);
+
+template<typename Dtype>
+void SigmoidForward(const int count, const Dtype* bottom_data,
+	Dtype* top_data) {
+	std::string kernel_name = "SigmoidForward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void SigmoidForward<float>(const int count, const float* bottom_data,
+	float* top_data);
+template void SigmoidForward<double>(const int count, const double* bottom_data,
+	double* top_data);
+
+template<typename Dtype>
+void SigmoidBackward(const int count, const Dtype* top_diff,
+	const Dtype* top_data, Dtype* bottom_diff) {
+	std::string kernel_name = "SigmoidBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) count };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void SigmoidBackward<float>(const int count, const float* top_diff,
+	const float* top_data, float* bottom_diff);
+template void SigmoidBackward<double>(const int count, const double* top_diff,
+	const double* top_data, double* bottom_diff);
+
+template<typename Dtype>
+void ThresholdForward(const int count, const Dtype threshold,
+	const Dtype* bottom_data, Dtype* top_data) {
+	std::string kernel_name = "ThresholdForward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &threshold);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void ThresholdForward<float>(const int count, const float threshold,
+	const float* bottom_data, float* top_data);
+template void ThresholdForward<double>(const int count, const double threshold,
+	const double* bottom_data, double* top_data);
+
+template<typename Dtype>
+void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) {
+	std::string kernel_name = "TanHForward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void TanHForward<float>(const int count, const float* bottom_data,
+	float* top_data);
+template void TanHForward<double>(const int count, const double* bottom_data,
+	double* top_data);
+
+template<typename Dtype>
+void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
+	Dtype* bottom_diff) {
+	std::string kernel_name = "TanHBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) count };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void TanHBackward<float>(const int count, const float* top_diff,
+	const float* top_data, float* bottom_diff);
+template void TanHBackward<double>(const int count, const double* top_diff,
+	const double* top_data, double* bottom_diff);
+
+template<typename Dtype>
 void opttrans(const Dtype* data_im, const int im_offset, const int channels,
-    const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {
-    std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    int num_kernels = channels * height * width * optnum;
-
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum);
-    OCL_CHECK(ret);
-
-    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
-}
-
-template void opttrans<float>(const float* data_im, const int im_offset, const int channels,
-    const int height, const int width, float* data_opt, const int opt_offset, const int optnum);
-template void opttrans<double>(const double* data_im, const int im_offset, const int channels,
-    const int height, const int width, double* data_opt, const int opt_offset, const int optnum);
-
-template <typename Dtype>
-void LRNFillScale(cl_kernel LFSkernel, const int nthreads, const Dtype* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype alpha_over_size,
-    const Dtype k, Dtype* const scale){
-  cl_int ret;
-  ret=clSetKernelArg(LFSkernel,0,sizeof(cl_int),(void*)&nthreads);
-  ret|=clSetKernelArg(LFSkernel,1,sizeof(cl_mem),(void*)&in);
-  ret|=clSetKernelArg(LFSkernel,2,sizeof(cl_int),(void*)&num);
-  ret|=clSetKernelArg(LFSkernel,3,sizeof(cl_int),(void*)&channels);
-  ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height);
-  ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width);
-  ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size);
-  ret|=clSetKernelArg(LFSkernel,7,sizeof(Dtype),(void*)&alpha_over_size);
-  ret|=clSetKernelArg(LFSkernel,8,sizeof(Dtype),(void*)&k);
-  ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale);
-  OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size[]={(size_t)nthreads};
-  size_t uiLocal_Work_Size[]={256};
-  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) );
-}
-template void LRNFillScale<float>(cl_kernel kernel, const int nthreads, const float* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const float alpha_over_size,
-    const float k, float* const scale);
-template void LRNFillScale<double>(cl_kernel kernel, const int nthreads, const double* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const double alpha_over_size,
-    const double k, double* const scale);
-
-template <typename Dtype>
+	const int height, const int width, Dtype* data_opt, const int opt_offset,
+	const int optnum) {
+	std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	int num_kernels = channels * height * width * optnum;
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum);
+	OCL_CHECK(ret);
+
+	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+
+template void opttrans<float>(const float* data_im, const int im_offset,
+	const int channels,
+	const int height, const int width, float* data_opt, const int opt_offset,
+	const int optnum);
+template void opttrans<double>(const double* data_im, const int im_offset,
+	const int channels,
+	const int height, const int width, double* data_opt, const int opt_offset,
+	const int optnum);
+
+template<typename Dtype>
+void LRNFillScale(cl_kernel LFSkernel, const int nthreads,
+	const Dtype* const in,
+	const int num, const int channels, const int height,
+	const int width, const int size, const Dtype alpha_over_size,
+	const Dtype k, Dtype* const scale) {
+	cl_int ret;
+	ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in);
+	ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num);
+	ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size);
+	ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size);
+	ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k);
+	ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale);
+	OCL_CHECK(ret);
+	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+}
+template void LRNFillScale<float>(cl_kernel kernel, const int nthreads,
+	const float* const in,
+	const int num, const int channels, const int height,
+	const int width, const int size, const float alpha_over_size,
+	const float k, float* const scale);
+template void LRNFillScale<double>(cl_kernel kernel, const int nthreads,
+	const double* const in,
+	const int num, const int channels, const int height,
+	const int width, const int size, const double alpha_over_size,
+	const double k, double* const scale);
+
+template<typename Dtype>
 void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in,
-     Dtype* scale, Dtype negative_beta, Dtype* out){
-  cl_int ret;
-  ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads);
-  ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in);
-  ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale);
-  ret|=clSetKernelArg(LCOkernel,3,sizeof(Dtype),(void*)&negative_beta);
-  ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out);
-  OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size2[]={(size_t)nthreads};
-  size_t uiLocal_Work_Size2[]={256};
-  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) );
-}
-template void LRNComputeOutput<float>(cl_kernel kernel, int nthreads, const float* in,
-    float* scale, float negative_beta, float* out);
-template void LRNComputeOutput<double>(cl_kernel kernel, int nthreads, const double* in,
-    double* scale, double negative_beta, double* out);
-
-template <typename Dtype>
+	Dtype* scale, Dtype negative_beta, Dtype* out) {
+	cl_int ret;
+	ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in);
+	ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale);
+	ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta);
+	ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out);
+	OCL_CHECK(ret);
+	size_t uiGlobal_Work_Size2[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size2[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,
+			uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
+}
+template void LRNComputeOutput<float>(cl_kernel kernel, int nthreads,
+	const float* in,
+	float* scale, float negative_beta, float* out);
+template void LRNComputeOutput<double>(cl_kernel kernel, int nthreads,
+	const double* in,
+	double* scale, double negative_beta, double* out);
+
+template<typename Dtype>
 void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
-    const Dtype* const bottom_data, const Dtype* const top_data,
-    const Dtype* const scale, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype negative_beta,
-    const Dtype cache_ratio, Dtype* const bottom_diff){
-  cl_int ret;
-  ret=clSetKernelArg(LCDkernel,0,sizeof(cl_int),(void*)&nthreads);
-  ret|=clSetKernelArg(LCDkernel,1,sizeof(cl_mem),(void*)&bottom_data);
-  ret|=clSetKernelArg(LCDkernel,2,sizeof(cl_mem),(void*)&top_data);
-  ret|=clSetKernelArg(LCDkernel,3,sizeof(cl_mem),(void*)&scale);
-  ret|=clSetKernelArg(LCDkernel,4,sizeof(cl_mem),(void*)&top_diff);
-  ret|=clSetKernelArg(LCDkernel,5,sizeof(cl_int),(void*)&num);
-  ret|=clSetKernelArg(LCDkernel,6,sizeof(cl_int),(void*)&channels);
-  ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height);
-  ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width);
-  ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size);
-  ret|=clSetKernelArg(LCDkernel,10,sizeof(Dtype),(void*)&negative_beta);
-  ret|=clSetKernelArg(LCDkernel,11,sizeof(Dtype),(void*)&cache_ratio);
-  ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff);
-  OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size[]={(size_t)nthreads};
-  size_t uiLocal_Work_Size[]={256};
-  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) );
+	const Dtype* const bottom_data, const Dtype* const top_data,
+	const Dtype* const scale, const Dtype* const top_diff,
+	const int num, const int channels, const int height,
+	const int width, const int size, const Dtype negative_beta,
+	const Dtype cache_ratio, Dtype* const bottom_diff) {
+	cl_int ret;
+	ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale);
+	ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num);
+	ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size);
+	ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta);
+	ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio);
+	ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,
+			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void LRNComputeDiff<float>(cl_kernel kernel, const int nthreads,
-    const float* const bottom_data, const float* const top_data,
-    const float* const scale, const float* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const float negative_beta,
-    const float cache_ratio, float* const bottom_diff);
+	const float* const bottom_data, const float* const top_data,
+	const float* const scale, const float* const top_diff,
+	const int num, const int channels, const int height,
+	const int width, const int size, const float negative_beta,
+	const float cache_ratio, float* const bottom_diff);
 template void LRNComputeDiff<double>(cl_kernel kernel, const int nthreads,
-    const double* const bottom_data, const double* const top_data,
-    const double* const scale, const double* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const double negative_beta,
-    const double cache_ratio, double* const bottom_diff);
-
-template <typename Dtype>
-void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
-    std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&in2);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_add<float> (const int n, const float* in1, const float* in2, float* y);
-template void caffe_gpu_add<double> (const int n, const double* in1, const double* in2, double* y);
-
-template <typename Dtype>
-void caffe_gpu_sign_ocl(const int N,  const Dtype* X, Dtype * Y ){
-    std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)N};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_sign_ocl<float>(const int N,  const float* X, float* Y );
-template void caffe_gpu_sign_ocl<double>(const int N,  const double* X, double* Y );
-
-template <typename Dtype>
-void caffe_gpu_abs_ocl(const int N,  const Dtype* X, Dtype * Y ){
-    std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)N};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_abs_ocl<float>(const int N,  const float* X, float* Y );
-template void caffe_gpu_abs_ocl<double>(const int N,  const double* X, double* Y );
-
-template <typename Dtype>
-void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){
-    std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_div<float> (const int n, const float* a, const float* b, float* y);
-template void caffe_gpu_div<double> (const int n, const double* a, const double* b, double* y);
-
-template <typename Dtype>
-void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){
-     std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_add_scalar<float> (const int n, const float alpha, float* top_data);
-template void caffe_gpu_add_scalar<double> (const int n, const double alpha, double* top_data);
-
-template <typename Dtype>
-void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){
-        std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_mul<float> (const int n, const float* a, const float* b, float* y);
-template void caffe_gpu_mul<double> (const int n, const double* a, const double* b, double* y);
-
-template <typename Dtype>
-void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){
-       std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_powx<float> (const int n, const float* a, const float alpha, float* y);
-template void caffe_gpu_powx<double> (const int n, const double* a, const double alpha, double* y);
-
-template <typename Dtype>
-void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
-{
-    std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
-    ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data);
-    ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem);
-    ret|=clSetKernelArg(kernel,3,sizeof(Dtype),(void*)&scale_);
-    ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void DropoutForward<float>(const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
-template void DropoutForward<double>(const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
-
-template <typename Dtype>
-void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
-{
-    std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
-    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
-    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&MaskMem);
-    ret |= clSetKernelArg(kernel,3,sizeof(cl_int),  (void*)&threshold_);
-    ret |= clSetKernelArg(kernel,4,sizeof(Dtype),(void*)&scale_);
-    ret |= clSetKernelArg(kernel,5,sizeof(cl_mem),  (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void DropoutBackward<float>(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
-template void DropoutBackward<double>(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
-
-
-template <typename Dtype>
-void  BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data)
-{
-    std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
-    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&bottom_data);
-    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&top_data);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void  BNLLForward<float>(const int count, const float* bottom_data, float *top_data);
-template void  BNLLForward<double>(const int count, const double* bottom_data, double *top_data);
-
-template <typename Dtype>
-void  BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff)
-{
-    std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
-    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
-    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&bottom_data);
-    ret |= clSetKernelArg(kernel,3,sizeof(cl_mem),  (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void  BNLLBackward<float>(const int count, const float* top_diff, const float* bottom_data, float *bottom_diff);
-template void  BNLLBackward<double>(const int count, const double* top_diff, const double* bottom_data, double *bottom_diff);
-
-
-template <typename Dtype>
-void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int  concat_size,
-        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data)
-{
-    std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&nthreads);
-    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void*)&in_data);
-    ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool),  (void*)&forward);
-    ret |= clSetKernelArg(kernel, 3, sizeof(cl_int),  (void*)&num_concats);
-    ret |= clSetKernelArg(kernel, 4, sizeof(cl_int),  (void*)&concat_size);
-    ret |= clSetKernelArg(kernel, 5, sizeof(cl_int),  (void*)&top_concat_axis);
-    ret |= clSetKernelArg(kernel, 6, sizeof(cl_int),  (void*)&bottom_concat_axis); 
-    ret |= clSetKernelArg(kernel, 7, sizeof(cl_int),  (void*)&offset_concat_axis);
-    ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem),  (void*)&out_data);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)nthreads};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void  Concat<float>(const int nthreads, const float* in_data, const bool forward, const int num_concats, const int  concat_size,
-        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, float *out_data);
-template void  Concat<double>(const int nthreads, const double* in_data, const bool forward, const int num_concats, const int  concat_size,
-        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, double *out_data);
-
-template <typename Dtype>
+	const double* const bottom_data, const double* const top_data,
+	const double* const scale, const double* const top_diff,
+	const int num, const int channels, const int height,
+	const int width, const int size, const double negative_beta,
+	const double cache_ratio, double* const bottom_diff);
+
+template<typename Dtype>
+void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) {
+	std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_add<float>(const int n, const float* in1,
+	const float* in2, float* y);
+template void caffe_gpu_add<double>(const int n, const double* in1,
+	const double* in2, double* y);
+
+template<typename Dtype>
+void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) {
+	std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) N };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_sign_ocl<float>(const int N, const float* X, float* Y);
+template void caffe_gpu_sign_ocl<double>(const int N, const double* X,
+	double* Y);
+
+template<typename Dtype>
+void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) {
+	std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) N };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_abs_ocl<float>(const int N, const float* X, float* Y);
+template void caffe_gpu_abs_ocl<double>(const int N, const double* X,
+	double* Y);
+
+template<typename Dtype>
+void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
+	std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_div<float>(const int n, const float* a, const float* b,
+	float* y);
+template void caffe_gpu_div<double>(const int n, const double* a,
+	const double* b, double* y);
+
+template<typename Dtype>
+void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) {
+	std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_add_scalar<float>(const int n, const float alpha,
+	float* top_data);
+template void caffe_gpu_add_scalar<double>(const int n, const double alpha,
+	double* top_data);
+
+template<typename Dtype>
+void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
+	std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_mul<float>(const int n, const float* a, const float* b,
+	float* y);
+template void caffe_gpu_mul<double>(const int n, const double* a,
+	const double* b, double* y);
+
+template<typename Dtype>
+void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) {
+	std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_powx<float>(const int n, const float* a,
+	const float alpha, float* y);
+template void caffe_gpu_powx<double>(const int n, const double* a,
+	const double alpha, double* y);
+
+template<typename Dtype>
+void DropoutForward(const int count, const Dtype* bottom_data,
+	const int* MaskMem, const Dtype scale_, Dtype* top_data)
+	{
+	std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
+	ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_);
+	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void DropoutForward<float>(const int count, const float* bottom_data,
+	const int* MaskMem, const float scale_, float* top_data);
+template void DropoutForward<double>(const int count, const double* bottom_data,
+	const int* MaskMem, const double scale_, double* top_data);
+
+template<typename Dtype>
+void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
+	const float threshold_, const Dtype scale_, Dtype* bottom_diff)
+	{
+	std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_);
+	ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_);
+	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void DropoutBackward<float>(const int count, const float* top_diff,
+	const int* MaskMem, const float threshold_, const float scale_,
+	float* bottom_diff);
+template void DropoutBackward<double>(const int count, const double* top_diff,
+	const int* MaskMem, const float threshold_, const double scale_,
+	double* bottom_diff);
+
+template<typename Dtype>
+void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data)
+	{
+	std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void BNLLForward<float>(const int count, const float* bottom_data,
+	float *top_data);
+template void BNLLForward<double>(const int count, const double* bottom_data,
+	double *top_data);
+
+template<typename Dtype>
+void BNLLBackward(const int count, const Dtype* top_diff,
+	const Dtype* bottom_data, Dtype *bottom_diff)
+	{
+	std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void BNLLBackward<float>(const int count, const float* top_diff,
+	const float* bottom_data, float *bottom_diff);
+template void BNLLBackward<double>(const int count, const double* top_diff,
+	const double* bottom_data, double *bottom_diff);
+
+template<typename Dtype>
+void Concat(const int nthreads, const Dtype* in_data, const bool forward,
+	const int num_concats, const int concat_size,
+	const int top_concat_axis, const int bottom_concat_axis,
+	const int offset_concat_axis, Dtype *out_data)
+	{
+	std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool), (void*) &forward);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats);
+	ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size);
+	ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis);
+	ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis);
+	ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis);
+	ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) nthreads };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Concat<float>(const int nthreads, const float* in_data,
+	const bool forward, const int num_concats, const int concat_size,
+	const int top_concat_axis, const int bottom_concat_axis,
+	const int offset_concat_axis, float *out_data);
+template void Concat<double>(const int nthreads, const double* in_data,
+	const bool forward, const int num_concats, const int concat_size,
+	const int top_concat_axis, const int bottom_concat_axis,
+	const int offset_concat_axis, double *out_data);
+
+template<typename Dtype>
 void CLLBackward(const int count, const int channels,
-    const Dtype margin, const bool legacy_version, const Dtype alpha,
-    const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
-    Dtype *bottom_diff)
-{
-    std::string kernel_name = "CLLBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&count);
-    ret |= clSetKernelArg(kernel, 1, sizeof(cl_int),  (void*)&channels);
-    ret |= clSetKernelArg(kernel, 2, sizeof(Dtype),   (void*)&margin);
-    ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool),  (void*)&legacy_version);
-    ret |= clSetKernelArg(kernel, 4, sizeof(Dtype),   (void*)&alpha);
-    ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem),  (void*)&y);
-    ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem),  (void*)&diff);
-    ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem),  (void*)&dist_sq);
-    ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem),  (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+	const Dtype margin, const bool legacy_version, const Dtype alpha,
+	const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
+	Dtype *bottom_diff)
+	{
+	std::string kernel_name = "CLLBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*) &margin);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*) &legacy_version);
+	ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &alpha);
+	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &y);
+	ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*) &diff);
+	ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*) &dist_sq);
+	ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void CLLBackward<float>(const int count, const int channels,
-    const float margin, const bool legacy_version, const float alpha,
-    const float* y, const float* diff, const float* dist_sq,
-    float *bottom_diff);
+	const float margin, const bool legacy_version, const float alpha,
+	const float* y, const float* diff, const float* dist_sq,
+	float *bottom_diff);
 template void CLLBackward<double>(const int count, const int channels,
-    const double margin, const bool legacy_version, const double alpha,
-    const double* y, const double* diff, const double* dist_sq,
-    double *bottom_diff);
+	const double margin, const bool legacy_version, const double alpha,
+	const double* y, const double* diff, const double* dist_sq,
+	double *bottom_diff);
 
-template <typename Dtype>
+template<typename Dtype>
 void MaxForward(const int nthreads, const Dtype* bottom_data_a,
-    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
-    int* mask)
-{
-    std::string kernel_name = "MaxForward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&nthreads);
-    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void*)&bottom_data_a);
-    ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem),  (void*)&bottom_data_b);
-    ret |= clSetKernelArg(kernel, 3, sizeof(cl_int),  (void*)&blob_idx);
-    ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem),  (void*)&top_data);
-    ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem),  (void*)&mask);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)nthreads};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+	const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+	int* mask)
+	{
+	std::string kernel_name = "MaxForward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data_a);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data_b);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &blob_idx);
+	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &mask);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) nthreads };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void MaxForward<float>(const int nthreads, const float* bottom_data_a,
-    const float* bottom_data_b, const int blob_idx, float* top_data,
-    int* mask);
-template void MaxForward<double>(const int nthreads, const double* bottom_data_a,
-    const double* bottom_data_b, const int blob_idx, double* top_data,
-    int* mask);
-
-template <typename Dtype>
+	const float* bottom_data_b, const int blob_idx, float* top_data,
+	int* mask);
+template void MaxForward<double>(const int nthreads,
+	const double* bottom_data_a,
+	const double* bottom_data_b, const int blob_idx, double* top_data,
+	int* mask);
+
+template<typename Dtype>
 void MaxBackward(const int nthreads, const Dtype* top_diff,
-    const int blob_idx, const int* mask, Dtype* bottom_diff)
-{
-    std::string kernel_name = "MaxBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&nthreads);
-    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void*)&top_diff);
-    ret |= clSetKernelArg(kernel, 2, sizeof(cl_int),  (void*)&blob_idx);
-    ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem),  (void*)&mask);
-    ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem),  (void*)&bottom_diff);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)nthreads};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void MaxBackward<float>(const int nthreads, const float* top_diff, const int blob_idx, const int* mask, float* bottom_diff);
-template void MaxBackward<double>(const int nthreads, const double* top_diff, const int blob_idx, const int* mask, double* bottom_diff);
-
-
-template <typename Dtype>
-void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz)
-{
-}
-template void ocl_conv<float>(float* bottom_data, float* top_data, float* weights, float* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz);
-template void ocl_conv<double>(double* bottom_data, double* top_data, double* weights, double* bias, int channel_in, int width, int height, int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz);
-
+	const int blob_idx, const int* mask, Dtype* bottom_diff)
+	{
+	std::string kernel_name = "MaxBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &blob_idx);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &mask);
+	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) nthreads };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void MaxBackward<float>(const int nthreads, const float* top_diff,
+	const int blob_idx, const int* mask, float* bottom_diff);
+template void MaxBackward<double>(const int nthreads, const double* top_diff,
+	const int blob_idx, const int* mask, double* bottom_diff);
+
+template<typename Dtype>
+void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias,
+	int channel_in, int width, int height, int channel_out, int width_out,
+	int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz)
+	{
+}
+template void ocl_conv<float>(float* bottom_data, float* top_data,
+	float* weights, float* bias, int channel_in, int width, int height,
+	int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+	int stride, int pad, int batch_sz);
+template void ocl_conv<double>(double* bottom_data, double* top_data,
+	double* weights, double* bias, int channel_in, int width, int height,
+	int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+	int stride, int pad, int batch_sz);
 
 }  // namespace caffe
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index 38a06026..f4373901 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -13,540 +13,564 @@
 namespace caffe {
 
 bool NetNeedsUpgrade(const NetParameter& net_param) {
-  return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param);
+	return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param);
 }
 
 bool NetNeedsV0ToV1Upgrade(const NetParameter& net_param) {
-  for (int i = 0; i < net_param.layers_size(); ++i) {
-    if (net_param.layers(i).has_layer()) {
-      return true;
-    }
-  }
-  return false;
+	for (int i = 0; i < net_param.layers_size(); ++i) {
+		if (net_param.layers(i).has_layer()) {
+			return true;
+		}
+	}
+	return false;
 }
 
 bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param) {
-  return net_param.layers_size() > 0;
+	return net_param.layers_size() > 0;
 }
 
 bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
-                  NetParameter* net_param) {
-  // First upgrade padding layers to padded conv layers.
-  NetParameter v0_net_param;
-  UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param);
-  // Now upgrade layer parameters.
-  bool is_fully_compatible = true;
-  net_param->Clear();
-  if (v0_net_param.has_name()) {
-    net_param->set_name(v0_net_param.name());
-  }
-  for (int i = 0; i < v0_net_param.layers_size(); ++i) {
-    is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i),
-                                                   net_param->add_layers());
-  }
-  for (int i = 0; i < v0_net_param.input_size(); ++i) {
-    net_param->add_input(v0_net_param.input(i));
-  }
-  for (int i = 0; i < v0_net_param.input_dim_size(); ++i) {
-    net_param->add_input_dim(v0_net_param.input_dim(i));
-  }
-  if (v0_net_param.has_force_backward()) {
-    net_param->set_force_backward(v0_net_param.force_backward());
-  }
-  return is_fully_compatible;
+	NetParameter* net_param) {
+	// First upgrade padding layers to padded conv layers.
+	NetParameter v0_net_param;
+	UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param);
+	// Now upgrade layer parameters.
+	bool is_fully_compatible = true;
+	net_param->Clear();
+	if (v0_net_param.has_name()) {
+		net_param->set_name(v0_net_param.name());
+	}
+	for (int i = 0; i < v0_net_param.layers_size(); ++i) {
+		is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i),
+			net_param->add_layers());
+	}
+	for (int i = 0; i < v0_net_param.input_size(); ++i) {
+		net_param->add_input(v0_net_param.input(i));
+	}
+	for (int i = 0; i < v0_net_param.input_dim_size(); ++i) {
+		net_param->add_input_dim(v0_net_param.input_dim(i));
+	}
+	if (v0_net_param.has_force_backward()) {
+		net_param->set_force_backward(v0_net_param.force_backward());
+	}
+	return is_fully_compatible;
 }
 
 void UpgradeV0PaddingLayers(const NetParameter& param,
-                            NetParameter* param_upgraded_pad) {
-  // Copy everything other than the layers from the original param.
-  param_upgraded_pad->Clear();
-  param_upgraded_pad->CopyFrom(param);
-  param_upgraded_pad->clear_layers();
-  // Figure out which layer each bottom blob comes from.
-  map<string, int> blob_name_to_last_top_idx;
-  for (int i = 0; i < param.input_size(); ++i) {
-    const string& blob_name = param.input(i);
-    blob_name_to_last_top_idx[blob_name] = -1;
-  }
-  for (int i = 0; i < param.layers_size(); ++i) {
-    const V1LayerParameter& layer_connection = param.layers(i);
-    const V0LayerParameter& layer_param = layer_connection.layer();
-    // Add the layer to the new net, unless it's a padding layer.
-    if (layer_param.type() != "padding") {
-      param_upgraded_pad->add_layers()->CopyFrom(layer_connection);
-    }
-    for (int j = 0; j < layer_connection.bottom_size(); ++j) {
-      const string& blob_name = layer_connection.bottom(j);
-      if (blob_name_to_last_top_idx.find(blob_name) ==
-          blob_name_to_last_top_idx.end()) {
-        LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
-      }
-      const int top_idx = blob_name_to_last_top_idx[blob_name];
-      if (top_idx == -1) {
-        continue;
-      }
-      const V1LayerParameter& source_layer = param.layers(top_idx);
-      if (source_layer.layer().type() == "padding") {
-        // This layer has a padding layer as input -- check that it is a conv
-        // layer or a pooling layer and takes only one input.  Also check that
-        // the padding layer input has only one input and one output.  Other
-        // cases have undefined behavior in Caffe.
-        CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool"))
-            << "Padding layer input to "
-            "non-convolutional / non-pooling layer type "
-            << layer_param.type();
-        CHECK_EQ(layer_connection.bottom_size(), 1)
-            << "Conv Layer takes a single blob as input.";
-        CHECK_EQ(source_layer.bottom_size(), 1)
-            << "Padding Layer takes a single blob as input.";
-        CHECK_EQ(source_layer.top_size(), 1)
-            << "Padding Layer produces a single blob as output.";
-        int layer_index = param_upgraded_pad->layers_size() - 1;
-        param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()
-            ->set_pad(source_layer.layer().pad());
-        param_upgraded_pad->mutable_layers(layer_index)
-            ->set_bottom(j, source_layer.bottom(0));
-      }
-    }
-    for (int j = 0; j < layer_connection.top_size(); ++j) {
-      const string& blob_name = layer_connection.top(j);
-      blob_name_to_last_top_idx[blob_name] = i;
-    }
-  }
+	NetParameter* param_upgraded_pad) {
+	// Copy everything other than the layers from the original param.
+	param_upgraded_pad->Clear();
+	param_upgraded_pad->CopyFrom(param);
+	param_upgraded_pad->clear_layers();
+	// Figure out which layer each bottom blob comes from.
+	map<string, int> blob_name_to_last_top_idx;
+	for (int i = 0; i < param.input_size(); ++i) {
+		const string& blob_name = param.input(i);
+		blob_name_to_last_top_idx[blob_name] = -1;
+	}
+	for (int i = 0; i < param.layers_size(); ++i) {
+		const V1LayerParameter& layer_connection = param.layers(i);
+		const V0LayerParameter& layer_param = layer_connection.layer();
+		// Add the layer to the new net, unless it's a padding layer.
+		if (layer_param.type() != "padding") {
+			param_upgraded_pad->add_layers()->CopyFrom(layer_connection);
+		}
+		for (int j = 0; j < layer_connection.bottom_size(); ++j) {
+			const string& blob_name = layer_connection.bottom(j);
+			if (blob_name_to_last_top_idx.find(blob_name) ==
+				blob_name_to_last_top_idx.end()) {
+				LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
+			}
+			const int top_idx = blob_name_to_last_top_idx[blob_name];
+			if (top_idx == -1) {
+				continue;
+			}
+			const V1LayerParameter& source_layer = param.layers(top_idx);
+			if (source_layer.layer().type() == "padding") {
+				// This layer has a padding layer as input -- check that it is a conv
+				// layer or a pooling layer and takes only one input.  Also check that
+				// the padding layer input has only one input and one output.  Other
+				// cases have undefined behavior in Caffe.
+				CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool"))
+					<< "Padding layer input to "
+						"non-convolutional / non-pooling layer type "
+					<< layer_param.type();
+				CHECK_EQ(layer_connection.bottom_size(), 1)
+					<< "Conv Layer takes a single blob as input.";
+				CHECK_EQ(source_layer.bottom_size(), 1)
+					<< "Padding Layer takes a single blob as input.";
+				CHECK_EQ(source_layer.top_size(), 1)
+					<< "Padding Layer produces a single blob as output.";
+				int layer_index = param_upgraded_pad->layers_size() - 1;
+				param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()
+					->set_pad(source_layer.layer().pad());
+				param_upgraded_pad->mutable_layers(layer_index)
+					->set_bottom(j, source_layer.bottom(0));
+			}
+		}
+		for (int j = 0; j < layer_connection.top_size(); ++j) {
+			const string& blob_name = layer_connection.top(j);
+			blob_name_to_last_top_idx[blob_name] = i;
+		}
+	}
 }
 
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-                             V1LayerParameter* layer_param) {
-  bool is_fully_compatible = true;
-  layer_param->Clear();
-  for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) {
-    layer_param->add_bottom(v0_layer_connection.bottom(i));
-  }
-  for (int i = 0; i < v0_layer_connection.top_size(); ++i) {
-    layer_param->add_top(v0_layer_connection.top(i));
-  }
-  if (v0_layer_connection.has_layer()) {
-    const V0LayerParameter& v0_layer_param = v0_layer_connection.layer();
-    if (v0_layer_param.has_name()) {
-      layer_param->set_name(v0_layer_param.name());
-    }
-    const string& type = v0_layer_param.type();
-    if (v0_layer_param.has_type()) {
-      layer_param->set_type(UpgradeV0LayerType(type));
-    }
-    for (int i = 0; i < v0_layer_param.blobs_size(); ++i) {
-      layer_param->add_blobs()->CopyFrom(v0_layer_param.blobs(i));
-    }
-    for (int i = 0; i < v0_layer_param.blobs_lr_size(); ++i) {
-      layer_param->add_blobs_lr(v0_layer_param.blobs_lr(i));
-    }
-    for (int i = 0; i < v0_layer_param.weight_decay_size(); ++i) {
-      layer_param->add_weight_decay(v0_layer_param.weight_decay(i));
-    }
-    if (v0_layer_param.has_num_output()) {
-      if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_num_output(
-            v0_layer_param.num_output());
-      } else if (type == "innerproduct") {
-        layer_param->mutable_inner_product_param()->set_num_output(
-            v0_layer_param.num_output());
-      } else {
-        LOG(ERROR) << "Unknown parameter num_output for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_biasterm()) {
-      if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_bias_term(
-            v0_layer_param.biasterm());
-      } else if (type == "innerproduct") {
-        layer_param->mutable_inner_product_param()->set_bias_term(
-            v0_layer_param.biasterm());
-      } else {
-        LOG(ERROR) << "Unknown parameter biasterm for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_weight_filler()) {
-      if (type == "conv") {
-        layer_param->mutable_convolution_param()->
-            mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
-      } else if (type == "innerproduct") {
-        layer_param->mutable_inner_product_param()->
-            mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
-      } else {
-        LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_bias_filler()) {
-      if (type == "conv") {
-        layer_param->mutable_convolution_param()->
-            mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
-      } else if (type == "innerproduct") {
-        layer_param->mutable_inner_product_param()->
-            mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
-      } else {
-        LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_pad()) {
-      if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad());
-      } else if (type == "pool") {
-        layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad());
-      } else {
-        LOG(ERROR) << "Unknown parameter pad for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_kernelsize()) {
-      if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_kernel_size(
-            v0_layer_param.kernelsize());
-      } else if (type == "pool") {
-        layer_param->mutable_pooling_param()->set_kernel_size(
-            v0_layer_param.kernelsize());
-      } else {
-        LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_group()) {
-      if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_group(
-            v0_layer_param.group());
-      } else {
-        LOG(ERROR) << "Unknown parameter group for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_stride()) {
-      if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_stride(
-            v0_layer_param.stride());
-      } else if (type == "pool") {
-        layer_param->mutable_pooling_param()->set_stride(
-            v0_layer_param.stride());
-      } else {
-        LOG(ERROR) << "Unknown parameter stride for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_pool()) {
-      if (type == "pool") {
-        V0LayerParameter_PoolMethod pool = v0_layer_param.pool();
-        switch (pool) {
-        case V0LayerParameter_PoolMethod_MAX:
-          layer_param->mutable_pooling_param()->set_pool(
-              PoolingParameter_PoolMethod_MAX);
-          break;
-        case V0LayerParameter_PoolMethod_AVE:
-          layer_param->mutable_pooling_param()->set_pool(
-              PoolingParameter_PoolMethod_AVE);
-          break;
-        case V0LayerParameter_PoolMethod_STOCHASTIC:
-          layer_param->mutable_pooling_param()->set_pool(
-              PoolingParameter_PoolMethod_STOCHASTIC);
-          break;
-        default:
-          LOG(ERROR) << "Unknown pool method " << pool;
-          is_fully_compatible = false;
-        }
-      } else {
-        LOG(ERROR) << "Unknown parameter pool for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_dropout_ratio()) {
-      if (type == "dropout") {
-        layer_param->mutable_dropout_param()->set_dropout_ratio(
-            v0_layer_param.dropout_ratio());
-      } else {
-        LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_local_size()) {
-      if (type == "lrn") {
-        layer_param->mutable_lrn_param()->set_local_size(
-            v0_layer_param.local_size());
-      } else {
-        LOG(ERROR) << "Unknown parameter local_size for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_alpha()) {
-      if (type == "lrn") {
-        layer_param->mutable_lrn_param()->set_alpha(v0_layer_param.alpha());
-      } else {
-        LOG(ERROR) << "Unknown parameter alpha for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_beta()) {
-      if (type == "lrn") {
-        layer_param->mutable_lrn_param()->set_beta(v0_layer_param.beta());
-      } else {
-        LOG(ERROR) << "Unknown parameter beta for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_k()) {
-      if (type == "lrn") {
-        layer_param->mutable_lrn_param()->set_k(v0_layer_param.k());
-      } else {
-        LOG(ERROR) << "Unknown parameter k for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_source()) {
-      if (type == "data") {
-        layer_param->mutable_data_param()->set_source(v0_layer_param.source());
-      } else if (type == "hdf5_data") {
-        layer_param->mutable_hdf5_data_param()->set_source(
-            v0_layer_param.source());
-      } else if (type == "images") {
-        layer_param->mutable_image_data_param()->set_source(
-            v0_layer_param.source());
-      } else if (type == "window_data") {
-        layer_param->mutable_window_data_param()->set_source(
-            v0_layer_param.source());
-      } else if (type == "infogain_loss") {
-        layer_param->mutable_infogain_loss_param()->set_source(
-            v0_layer_param.source());
-      } else {
-        LOG(ERROR) << "Unknown parameter source for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_scale()) {
-      layer_param->mutable_transform_param()->
-          set_scale(v0_layer_param.scale());
-    }
-    if (v0_layer_param.has_meanfile()) {
-      layer_param->mutable_transform_param()->
-          set_mean_file(v0_layer_param.meanfile());
-    }
-    if (v0_layer_param.has_batchsize()) {
-      if (type == "data") {
-        layer_param->mutable_data_param()->set_batch_size(
-            v0_layer_param.batchsize());
-      } else if (type == "hdf5_data") {
-        layer_param->mutable_hdf5_data_param()->set_batch_size(
-            v0_layer_param.batchsize());
-      } else if (type == "images") {
-        layer_param->mutable_image_data_param()->set_batch_size(
-            v0_layer_param.batchsize());
-      } else if (type == "window_data") {
-        layer_param->mutable_window_data_param()->set_batch_size(
-            v0_layer_param.batchsize());
-      } else {
-        LOG(ERROR) << "Unknown parameter batchsize for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_cropsize()) {
-      layer_param->mutable_transform_param()->
-          set_crop_size(v0_layer_param.cropsize());
-    }
-    if (v0_layer_param.has_mirror()) {
-      layer_param->mutable_transform_param()->
-          set_mirror(v0_layer_param.mirror());
-    }
-    if (v0_layer_param.has_rand_skip()) {
-      if (type == "data") {
-        layer_param->mutable_data_param()->set_rand_skip(
-            v0_layer_param.rand_skip());
-      } else if (type == "images") {
-        layer_param->mutable_image_data_param()->set_rand_skip(
-            v0_layer_param.rand_skip());
-      } else {
-        LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_shuffle_images()) {
-      if (type == "images") {
-        layer_param->mutable_image_data_param()->set_shuffle(
-            v0_layer_param.shuffle_images());
-      } else {
-        LOG(ERROR) << "Unknown parameter shuffle for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_new_height()) {
-      if (type == "images") {
-        layer_param->mutable_image_data_param()->set_new_height(
-            v0_layer_param.new_height());
-      } else {
-        LOG(ERROR) << "Unknown parameter new_height for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_new_width()) {
-      if (type == "images") {
-        layer_param->mutable_image_data_param()->set_new_width(
-            v0_layer_param.new_width());
-      } else {
-        LOG(ERROR) << "Unknown parameter new_width for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_concat_dim()) {
-      if (type == "concat") {
-        layer_param->mutable_concat_param()->set_concat_dim(
-            v0_layer_param.concat_dim());
-      } else {
-        LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_det_fg_threshold()) {
-      if (type == "window_data") {
-        layer_param->mutable_window_data_param()->set_fg_threshold(
-            v0_layer_param.det_fg_threshold());
-      } else {
-        LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type "
-                   << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_det_bg_threshold()) {
-      if (type == "window_data") {
-        layer_param->mutable_window_data_param()->set_bg_threshold(
-            v0_layer_param.det_bg_threshold());
-      } else {
-        LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type "
-                   << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_det_fg_fraction()) {
-      if (type == "window_data") {
-        layer_param->mutable_window_data_param()->set_fg_fraction(
-            v0_layer_param.det_fg_fraction());
-      } else {
-        LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type "
-                   << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_det_context_pad()) {
-      if (type == "window_data") {
-        layer_param->mutable_window_data_param()->set_context_pad(
-            v0_layer_param.det_context_pad());
-      } else {
-        LOG(ERROR) << "Unknown parameter det_context_pad for layer type "
-                   << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_det_crop_mode()) {
-      if (type == "window_data") {
-        layer_param->mutable_window_data_param()->set_crop_mode(
-            v0_layer_param.det_crop_mode());
-      } else {
-        LOG(ERROR) << "Unknown parameter det_crop_mode for layer type "
-                   << type;
-        is_fully_compatible = false;
-      }
-    }
-    if (v0_layer_param.has_hdf5_output_param()) {
-      if (type == "hdf5_output") {
-        layer_param->mutable_hdf5_output_param()->CopyFrom(
-            v0_layer_param.hdf5_output_param());
-      } else {
-        LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type "
-                   << type;
-        is_fully_compatible = false;
-      }
-    }
-  }
-  return is_fully_compatible;
+	V1LayerParameter* layer_param) {
+	bool is_fully_compatible = true;
+	layer_param->Clear();
+	for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) {
+		layer_param->add_bottom(v0_layer_connection.bottom(i));
+	}
+	for (int i = 0; i < v0_layer_connection.top_size(); ++i) {
+		layer_param->add_top(v0_layer_connection.top(i));
+	}
+	if (v0_layer_connection.has_layer()) {
+		const V0LayerParameter& v0_layer_param = v0_layer_connection.layer();
+		if (v0_layer_param.has_name()) {
+			layer_param->set_name(v0_layer_param.name());
+		}
+		const string& type = v0_layer_param.type();
+		if (v0_layer_param.has_type()) {
+			layer_param->set_type(UpgradeV0LayerType(type));
+		}
+		for (int i = 0; i < v0_layer_param.blobs_size(); ++i) {
+			layer_param->add_blobs()->CopyFrom(v0_layer_param.blobs(i));
+		}
+		for (int i = 0; i < v0_layer_param.blobs_lr_size(); ++i) {
+			layer_param->add_blobs_lr(v0_layer_param.blobs_lr(i));
+		}
+		for (int i = 0; i < v0_layer_param.weight_decay_size(); ++i) {
+			layer_param->add_weight_decay(v0_layer_param.weight_decay(i));
+		}
+		if (v0_layer_param.has_num_output()) {
+			if (type == "conv") {
+				layer_param->mutable_convolution_param()->set_num_output(
+					v0_layer_param.num_output());
+			} else if (type == "innerproduct") {
+				layer_param->mutable_inner_product_param()->set_num_output(
+					v0_layer_param.num_output());
+			} else {
+				LOG(ERROR) << "Unknown parameter num_output for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_biasterm()) {
+			if (type == "conv") {
+				layer_param->mutable_convolution_param()->set_bias_term(
+					v0_layer_param.biasterm());
+			} else if (type == "innerproduct") {
+				layer_param->mutable_inner_product_param()->set_bias_term(
+					v0_layer_param.biasterm());
+			} else {
+				LOG(ERROR) << "Unknown parameter biasterm for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_weight_filler()) {
+			if (type == "conv") {
+				layer_param->mutable_convolution_param()->
+					mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
+			} else if (type == "innerproduct") {
+				layer_param->mutable_inner_product_param()->
+					mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
+			} else {
+				LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_bias_filler()) {
+			if (type == "conv") {
+				layer_param->mutable_convolution_param()->
+					mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
+			} else if (type == "innerproduct") {
+				layer_param->mutable_inner_product_param()->
+					mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
+			} else {
+				LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_pad()) {
+			if (type == "conv") {
+				layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad());
+			} else if (type == "pool") {
+				layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad());
+			} else {
+				LOG(ERROR) << "Unknown parameter pad for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_kernelsize()) {
+			if (type == "conv") {
+				layer_param->mutable_convolution_param()->set_kernel_size(
+					v0_layer_param.kernelsize());
+			} else if (type == "pool") {
+				layer_param->mutable_pooling_param()->set_kernel_size(
+					v0_layer_param.kernelsize());
+			} else {
+				LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_group()) {
+			if (type == "conv") {
+				layer_param->mutable_convolution_param()->set_group(
+					v0_layer_param.group());
+			} else {
+				LOG(ERROR) << "Unknown parameter group for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_stride()) {
+			if (type == "conv") {
+				layer_param->mutable_convolution_param()->set_stride(
+					v0_layer_param.stride());
+			} else if (type == "pool") {
+				layer_param->mutable_pooling_param()->set_stride(
+					v0_layer_param.stride());
+			} else {
+				LOG(ERROR) << "Unknown parameter stride for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_pool()) {
+			if (type == "pool") {
+				V0LayerParameter_PoolMethod pool = v0_layer_param.pool();
+				switch (pool) {
+					case V0LayerParameter_PoolMethod_MAX:
+						layer_param->mutable_pooling_param()->set_pool(
+							PoolingParameter_PoolMethod_MAX);
+						break;
+					case V0LayerParameter_PoolMethod_AVE:
+						layer_param->mutable_pooling_param()->set_pool(
+							PoolingParameter_PoolMethod_AVE);
+						break;
+					case V0LayerParameter_PoolMethod_STOCHASTIC:
+						layer_param->mutable_pooling_param()->set_pool(
+							PoolingParameter_PoolMethod_STOCHASTIC);
+						break;
+					default:
+						LOG(ERROR) << "Unknown pool method " << pool;
+						is_fully_compatible = false;
+				}
+			} else {
+				LOG(ERROR) << "Unknown parameter pool for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_dropout_ratio()) {
+			if (type == "dropout") {
+				layer_param->mutable_dropout_param()->set_dropout_ratio(
+					v0_layer_param.dropout_ratio());
+			} else {
+				LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_local_size()) {
+			if (type == "lrn") {
+				layer_param->mutable_lrn_param()->set_local_size(
+					v0_layer_param.local_size());
+			} else {
+				LOG(ERROR) << "Unknown parameter local_size for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_alpha()) {
+			if (type == "lrn") {
+				layer_param->mutable_lrn_param()->set_alpha(v0_layer_param.alpha());
+			} else {
+				LOG(ERROR) << "Unknown parameter alpha for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_beta()) {
+			if (type == "lrn") {
+				layer_param->mutable_lrn_param()->set_beta(v0_layer_param.beta());
+			} else {
+				LOG(ERROR) << "Unknown parameter beta for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_k()) {
+			if (type == "lrn") {
+				layer_param->mutable_lrn_param()->set_k(v0_layer_param.k());
+			} else {
+				LOG(ERROR) << "Unknown parameter k for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_source()) {
+			if (type == "data") {
+				layer_param->mutable_data_param()->set_source(v0_layer_param.source());
+			} else if (type == "hdf5_data") {
+				layer_param->mutable_hdf5_data_param()->set_source(
+					v0_layer_param.source());
+			} else if (type == "images") {
+				layer_param->mutable_image_data_param()->set_source(
+					v0_layer_param.source());
+			} else if (type == "window_data") {
+				layer_param->mutable_window_data_param()->set_source(
+					v0_layer_param.source());
+			} else if (type == "infogain_loss") {
+				layer_param->mutable_infogain_loss_param()->set_source(
+					v0_layer_param.source());
+			} else {
+				LOG(ERROR) << "Unknown parameter source for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_scale()) {
+			layer_param->mutable_transform_param()->
+				set_scale(v0_layer_param.scale());
+		}
+		if (v0_layer_param.has_meanfile()) {
+			layer_param->mutable_transform_param()->
+				set_mean_file(v0_layer_param.meanfile());
+		}
+		if (v0_layer_param.has_batchsize()) {
+			if (type == "data") {
+				layer_param->mutable_data_param()->set_batch_size(
+					v0_layer_param.batchsize());
+			} else if (type == "hdf5_data") {
+				layer_param->mutable_hdf5_data_param()->set_batch_size(
+					v0_layer_param.batchsize());
+			} else if (type == "images") {
+				layer_param->mutable_image_data_param()->set_batch_size(
+					v0_layer_param.batchsize());
+			} else if (type == "window_data") {
+				layer_param->mutable_window_data_param()->set_batch_size(
+					v0_layer_param.batchsize());
+			} else {
+				LOG(ERROR) << "Unknown parameter batchsize for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_cropsize()) {
+			layer_param->mutable_transform_param()->
+				set_crop_size(v0_layer_param.cropsize());
+		}
+		if (v0_layer_param.has_mirror()) {
+			layer_param->mutable_transform_param()->
+				set_mirror(v0_layer_param.mirror());
+		}
+		if (v0_layer_param.has_rand_skip()) {
+			if (type == "data") {
+				layer_param->mutable_data_param()->set_rand_skip(
+					v0_layer_param.rand_skip());
+			} else if (type == "images") {
+				layer_param->mutable_image_data_param()->set_rand_skip(
+					v0_layer_param.rand_skip());
+			} else {
+				LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_shuffle_images()) {
+			if (type == "images") {
+				layer_param->mutable_image_data_param()->set_shuffle(
+					v0_layer_param.shuffle_images());
+			} else {
+				LOG(ERROR) << "Unknown parameter shuffle for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_new_height()) {
+			if (type == "images") {
+				layer_param->mutable_image_data_param()->set_new_height(
+					v0_layer_param.new_height());
+			} else {
+				LOG(ERROR) << "Unknown parameter new_height for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_new_width()) {
+			if (type == "images") {
+				layer_param->mutable_image_data_param()->set_new_width(
+					v0_layer_param.new_width());
+			} else {
+				LOG(ERROR) << "Unknown parameter new_width for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_concat_dim()) {
+			if (type == "concat") {
+				layer_param->mutable_concat_param()->set_concat_dim(
+					v0_layer_param.concat_dim());
+			} else {
+				LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_det_fg_threshold()) {
+			if (type == "window_data") {
+				layer_param->mutable_window_data_param()->set_fg_threshold(
+					v0_layer_param.det_fg_threshold());
+			} else {
+				LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type "
+					<< type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_det_bg_threshold()) {
+			if (type == "window_data") {
+				layer_param->mutable_window_data_param()->set_bg_threshold(
+					v0_layer_param.det_bg_threshold());
+			} else {
+				LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type "
+					<< type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_det_fg_fraction()) {
+			if (type == "window_data") {
+				layer_param->mutable_window_data_param()->set_fg_fraction(
+					v0_layer_param.det_fg_fraction());
+			} else {
+				LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type "
+					<< type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_det_context_pad()) {
+			if (type == "window_data") {
+				layer_param->mutable_window_data_param()->set_context_pad(
+					v0_layer_param.det_context_pad());
+			} else {
+				LOG(ERROR) << "Unknown parameter det_context_pad for layer type "
+					<< type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_det_crop_mode()) {
+			if (type == "window_data") {
+				layer_param->mutable_window_data_param()->set_crop_mode(
+					v0_layer_param.det_crop_mode());
+			} else {
+				LOG(ERROR) << "Unknown parameter det_crop_mode for layer type "
+					<< type;
+				is_fully_compatible = false;
+			}
+		}
+		if (v0_layer_param.has_hdf5_output_param()) {
+			if (type == "hdf5_output") {
+				layer_param->mutable_hdf5_output_param()->CopyFrom(
+					v0_layer_param.hdf5_output_param());
+			} else {
+				LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type "
+					<< type;
+				is_fully_compatible = false;
+			}
+		}
+	}
+	return is_fully_compatible;
 }
 
 V1LayerParameter_LayerType UpgradeV0LayerType(const string& type) {
-  if (type == "accuracy") {
-    return V1LayerParameter_LayerType_ACCURACY;
-  } else if (type == "bnll") {
-    return V1LayerParameter_LayerType_BNLL;
-  } else if (type == "concat") {
-    return V1LayerParameter_LayerType_CONCAT;
-  } else if (type == "conv") {
-    return V1LayerParameter_LayerType_CONVOLUTION;
-  } else if (type == "data") {
-    return V1LayerParameter_LayerType_DATA;
-  } else if (type == "dropout") {
-    return V1LayerParameter_LayerType_DROPOUT;
-  } else if (type == "euclidean_loss") {
-    return V1LayerParameter_LayerType_EUCLIDEAN_LOSS;
-  } else if (type == "flatten") {
-    return V1LayerParameter_LayerType_FLATTEN;
-  } else if (type == "hdf5_data") {
-    return V1LayerParameter_LayerType_HDF5_DATA;
-  } else if (type == "hdf5_output") {
-    return V1LayerParameter_LayerType_HDF5_OUTPUT;
-  } else if (type == "im2col") {
-    return V1LayerParameter_LayerType_IM2COL;
-  } else if (type == "images") {
-    return V1LayerParameter_LayerType_IMAGE_DATA;
-  } else if (type == "infogain_loss") {
-    return V1LayerParameter_LayerType_INFOGAIN_LOSS;
-  } else if (type == "innerproduct") {
-    return V1LayerParameter_LayerType_INNER_PRODUCT;
-  } else if (type == "lrn") {
-    return V1LayerParameter_LayerType_LRN;
-  } else if (type == "multinomial_logistic_loss") {
-    return V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS;
-  } else if (type == "pool") {
-    return V1LayerParameter_LayerType_POOLING;
-  } else if (type == "relu") {
-    return V1LayerParameter_LayerType_RELU;
-  } else if (type == "sigmoid") {
-    return V1LayerParameter_LayerType_SIGMOID;
-  } else if (type == "softmax") {
-    return V1LayerParameter_LayerType_SOFTMAX;
-  } else if (type == "softmax_loss") {
-    return V1LayerParameter_LayerType_SOFTMAX_LOSS;
-  } else if (type == "split") {
-    return V1LayerParameter_LayerType_SPLIT;
-  } else if (type == "tanh") {
-    return V1LayerParameter_LayerType_TANH;
-  } else if (type == "window_data") {
-    return V1LayerParameter_LayerType_WINDOW_DATA;
-  } else {
-    LOG(FATAL) << "Unknown layer name: " << type;
-    return V1LayerParameter_LayerType_NONE;
-  }
+	if (type == "accuracy") {
+		return V1LayerParameter_LayerType_ACCURACY;
+	} else if (type == "bnll") {
+		return V1LayerParameter_LayerType_BNLL;
+	} else if (type == "concat") {
+		return V1LayerParameter_LayerType_CONCAT;
+	} else if (type == "conv") {
+		return V1LayerParameter_LayerType_CONVOLUTION;
+	} else if (type == "data") {
+		return V1LayerParameter_LayerType_DATA;
+	} else if (type == "dropout") {
+		return V1LayerParameter_LayerType_DROPOUT;
+	} else if (type == "euclidean_loss") {
+		return V1LayerParameter_LayerType_EUCLIDEAN_LOSS;
+	} else if (type == "flatten") {
+		return V1LayerParameter_LayerType_FLATTEN;
+	} else if (type == "hdf5_data") {
+		return V1LayerParameter_LayerType_HDF5_DATA;
+	} else if (type == "hdf5_output") {
+		return V1LayerParameter_LayerType_HDF5_OUTPUT;
+	} else if (type == "im2col") {
+		return V1LayerParameter_LayerType_IM2COL;
+	} else if (type == "images") {
+		return V1LayerParameter_LayerType_IMAGE_DATA;
+	} else if (type == "infogain_loss") {
+		return V1LayerParameter_LayerType_INFOGAIN_LOSS;
+	} else if (type == "innerproduct") {
+		return V1LayerParameter_LayerType_INNER_PRODUCT;
+	} else if (type == "lrn") {
+		return V1LayerParameter_LayerType_LRN;
+	} else if (type == "multinomial_logistic_loss") {
+		return V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS;
+	} else if (type == "pool") {
+		return V1LayerParameter_LayerType_POOLING;
+	} else if (type == "relu") {
+		return V1LayerParameter_LayerType_RELU;
+	} else if (type == "sigmoid") {
+		return V1LayerParameter_LayerType_SIGMOID;
+	} else if (type == "softmax") {
+		return V1LayerParameter_LayerType_SOFTMAX;
+	} else if (type == "softmax_loss") {
+		return V1LayerParameter_LayerType_SOFTMAX_LOSS;
+	} else if (type == "split") {
+		return V1LayerParameter_LayerType_SPLIT;
+	} else if (type == "tanh") {
+		return V1LayerParameter_LayerType_TANH;
+	} else if (type == "window_data") {
+		return V1LayerParameter_LayerType_WINDOW_DATA;
+	} else {
+		LOG(FATAL) << "Unknown layer name: " << type;
+		return V1LayerParameter_LayerType_NONE;
+	}
 }
 
 bool NetNeedsDataUpgrade(const NetParameter& net_param) {
-  for (int i = 0; i < net_param.layers_size(); ++i) {
-    if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) {
-      DataParameter layer_param = net_param.layers(i).data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
-    }
-    if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) {
-      ImageDataParameter layer_param = net_param.layers(i).image_data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
-    }
-    if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) {
-      WindowDataParameter layer_param = net_param.layers(i).window_data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
-    }
-  }
-  return false;
+	for (int i = 0; i < net_param.layers_size(); ++i) {
+		if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) {
+			DataParameter layer_param = net_param.layers(i).data_param();
+			if (layer_param.has_scale()) {
+				return true;
+			}
+			if (layer_param.has_mean_file()) {
+				return true;
+			}
+			if (layer_param.has_crop_size()) {
+				return true;
+			}
+			if (layer_param.has_mirror()) {
+				return true;
+			}
+		}
+		if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) {
+			ImageDataParameter layer_param = net_param.layers(i).image_data_param();
+			if (layer_param.has_scale()) {
+				return true;
+			}
+			if (layer_param.has_mean_file()) {
+				return true;
+			}
+			if (layer_param.has_crop_size()) {
+				return true;
+			}
+			if (layer_param.has_mirror()) {
+				return true;
+			}
+		}
+		if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) {
+			WindowDataParameter layer_param = net_param.layers(i).window_data_param();
+			if (layer_param.has_scale()) {
+				return true;
+			}
+			if (layer_param.has_mean_file()) {
+				return true;
+			}
+			if (layer_param.has_crop_size()) {
+				return true;
+			}
+			if (layer_param.has_mirror()) {
+				return true;
+			}
+		}
+	}
+	return false;
 }
 
 #define CONVERT_LAYER_TRANSFORM_PARAM(TYPE, Name, param_name) \
@@ -576,365 +600,373 @@ bool NetNeedsDataUpgrade(const NetParameter& net_param) {
   } while (0)
 
 void UpgradeNetDataTransformation(NetParameter* net_param) {
-  for (int i = 0; i < net_param->layers_size(); ++i) {
-    CONVERT_LAYER_TRANSFORM_PARAM(DATA, Data, data);
-    CONVERT_LAYER_TRANSFORM_PARAM(IMAGE_DATA, ImageData, image_data);
-    CONVERT_LAYER_TRANSFORM_PARAM(WINDOW_DATA, WindowData, window_data);
-  }
+	for (int i = 0; i < net_param->layers_size(); ++i) {
+		CONVERT_LAYER_TRANSFORM_PARAM(DATA, Data, data);
+		CONVERT_LAYER_TRANSFORM_PARAM(IMAGE_DATA, ImageData, image_data);
+		CONVERT_LAYER_TRANSFORM_PARAM(WINDOW_DATA, WindowData, window_data);
+	}
 }
 
 bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
-  bool success = true;
-  if (NetNeedsV0ToV1Upgrade(*param)) {
-    // NetParameter was specified using the old style (V0LayerParameter); try to
-    // upgrade it.
-    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-               << "V0LayerParameter: " << param_file;
-    NetParameter original_param(*param);
-    if (!UpgradeV0Net(original_param, param)) {
-      success = false;
-      LOG(ERROR) << "Warning: had one or more problems upgrading "
-          << "V0NetParameter to NetParameter (see above); continuing anyway.";
-    } else {
-      LOG(INFO) << "Successfully upgraded file specified using deprecated "
-                << "V0LayerParameter";
-    }
-    LOG(ERROR) << "Note that future Caffe releases will not support "
-        << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
-        << "prototxt and ./build/tools/upgrade_net_proto_binary for model "
-        << "weights upgrade this and any other net protos to the new format.";
-  }
-  // NetParameter uses old style data transformation fields; try to upgrade it.
-  if (NetNeedsDataUpgrade(*param)) {
-    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-               << "transformation parameters: " << param_file;
-    UpgradeNetDataTransformation(param);
-    LOG(INFO) << "Successfully upgraded file specified using deprecated "
-              << "data transformation parameters.";
-    LOG(ERROR) << "Note that future Caffe releases will only support "
-               << "transform_param messages for transformation fields.";
-  }
-  if (NetNeedsV1ToV2Upgrade(*param)) {
-    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-               << "V1LayerParameter: " << param_file;
-    NetParameter original_param(*param);
-    if (!UpgradeV1Net(original_param, param)) {
-      success = false;
-      LOG(ERROR) << "Warning: had one or more problems upgrading "
-          << "V1LayerParameter (see above); continuing anyway.";
-    } else {
-      LOG(INFO) << "Successfully upgraded file specified using deprecated "
-                << "V1LayerParameter";
-    }
-  }
-  return success;
+	bool success = true;
+	if (NetNeedsV0ToV1Upgrade(*param)) {
+		// NetParameter was specified using the old style (V0LayerParameter); try to
+		// upgrade it.
+		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
+			<< "V0LayerParameter: " << param_file;
+		NetParameter original_param(*param);
+		if (!UpgradeV0Net(original_param, param)) {
+			success = false;
+			LOG(ERROR) << "Warning: had one or more problems upgrading "
+				<< "V0NetParameter to NetParameter (see above); continuing anyway.";
+		} else {
+			LOG(INFO) << "Successfully upgraded file specified using deprecated "
+				<< "V0LayerParameter";
+		}
+		LOG(ERROR) << "Note that future Caffe releases will not support "
+			<< "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
+			<< "prototxt and ./build/tools/upgrade_net_proto_binary for model "
+			<< "weights upgrade this and any other net protos to the new format.";
+	}
+	// NetParameter uses old style data transformation fields; try to upgrade it.
+	if (NetNeedsDataUpgrade(*param)) {
+		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
+			<< "transformation parameters: " << param_file;
+		UpgradeNetDataTransformation(param);
+		LOG(INFO) << "Successfully upgraded file specified using deprecated "
+			<< "data transformation parameters.";
+		LOG(ERROR) << "Note that future Caffe releases will only support "
+			<< "transform_param messages for transformation fields.";
+	}
+	if (NetNeedsV1ToV2Upgrade(*param)) {
+		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
+			<< "V1LayerParameter: " << param_file;
+		NetParameter original_param(*param);
+		if (!UpgradeV1Net(original_param, param)) {
+			success = false;
+			LOG(ERROR) << "Warning: had one or more problems upgrading "
+				<< "V1LayerParameter (see above); continuing anyway.";
+		} else {
+			LOG(INFO) << "Successfully upgraded file specified using deprecated "
+				<< "V1LayerParameter";
+		}
+	}
+	return success;
 }
 
 bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
-  bool is_fully_compatible = true;
-  if (v1_net_param.layer_size() > 0) {
-    LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' "
-               << "fields; these will be ignored for the upgrade.";
-    is_fully_compatible = false;
-  }
-  net_param->CopyFrom(v1_net_param);
-  net_param->clear_layers();
-  net_param->clear_layer();
-  for (int i = 0; i < v1_net_param.layers_size(); ++i) {
-    if (!UpgradeV1LayerParameter(v1_net_param.layers(i),
-                                 net_param->add_layer())) {
-      LOG(ERROR) << "Upgrade of input layer " << i << " failed.";
-      is_fully_compatible = false;
-    }
-  }
-  return is_fully_compatible;
+	bool is_fully_compatible = true;
+	if (v1_net_param.layer_size() > 0) {
+		LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' "
+			<< "fields; these will be ignored for the upgrade.";
+		is_fully_compatible = false;
+	}
+	net_param->CopyFrom(v1_net_param);
+	net_param->clear_layers();
+	net_param->clear_layer();
+	for (int i = 0; i < v1_net_param.layers_size(); ++i) {
+		if (!UpgradeV1LayerParameter(v1_net_param.layers(i),
+			net_param->add_layer())) {
+			LOG(ERROR) << "Upgrade of input layer " << i << " failed.";
+			is_fully_compatible = false;
+		}
+	}
+	return is_fully_compatible;
 }
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-                             LayerParameter* layer_param) {
-  layer_param->Clear();
-  bool is_fully_compatible = true;
-  for (int i = 0; i < v1_layer_param.bottom_size(); ++i) {
-    layer_param->add_bottom(v1_layer_param.bottom(i));
-  }
-  for (int i = 0; i < v1_layer_param.top_size(); ++i) {
-    layer_param->add_top(v1_layer_param.top(i));
-  }
-  if (v1_layer_param.has_name()) {
-    layer_param->set_name(v1_layer_param.name());
-  }
-  for (int i = 0; i < v1_layer_param.include_size(); ++i) {
-    layer_param->add_include()->CopyFrom(v1_layer_param.include(i));
-  }
-  for (int i = 0; i < v1_layer_param.exclude_size(); ++i) {
-    layer_param->add_exclude()->CopyFrom(v1_layer_param.exclude(i));
-  }
-  if (v1_layer_param.has_type()) {
-    layer_param->set_type(UpgradeV1LayerType(v1_layer_param.type()));
-  }
-  for (int i = 0; i < v1_layer_param.blobs_size(); ++i) {
-    layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i));
-  }
-  for (int i = 0; i < v1_layer_param.param_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
-    layer_param->mutable_param(i)->set_name(v1_layer_param.param(i));
-  }
-  ParamSpec_DimCheckMode mode;
-  for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
-    switch (v1_layer_param.blob_share_mode(i)) {
-    case V1LayerParameter_DimCheckMode_STRICT:
-      mode = ParamSpec_DimCheckMode_STRICT;
-      break;
-    case V1LayerParameter_DimCheckMode_PERMISSIVE:
-      mode = ParamSpec_DimCheckMode_PERMISSIVE;
-      break;
-    default:
-      LOG(FATAL) << "Unknown blob_share_mode: "
-                 << v1_layer_param.blob_share_mode(i);
-      break;
-    }
-    layer_param->mutable_param(i)->set_share_mode(mode);
-  }
-  for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
-    layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i));
-  }
-  for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
-    layer_param->mutable_param(i)->set_decay_mult(
-        v1_layer_param.weight_decay(i));
-  }
-  for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) {
-    layer_param->add_loss_weight(v1_layer_param.loss_weight(i));
-  }
-  if (v1_layer_param.has_accuracy_param()) {
-    layer_param->mutable_accuracy_param()->CopyFrom(
-        v1_layer_param.accuracy_param());
-  }
-  if (v1_layer_param.has_argmax_param()) {
-    layer_param->mutable_argmax_param()->CopyFrom(
-        v1_layer_param.argmax_param());
-  }
-  if (v1_layer_param.has_concat_param()) {
-    layer_param->mutable_concat_param()->CopyFrom(
-        v1_layer_param.concat_param());
-  }
-  if (v1_layer_param.has_contrastive_loss_param()) {
-    layer_param->mutable_contrastive_loss_param()->CopyFrom(
-        v1_layer_param.contrastive_loss_param());
-  }
-  if (v1_layer_param.has_convolution_param()) {
-    layer_param->mutable_convolution_param()->CopyFrom(
-        v1_layer_param.convolution_param());
-  }
-  if (v1_layer_param.has_data_param()) {
-    layer_param->mutable_data_param()->CopyFrom(
-        v1_layer_param.data_param());
-  }
-  if (v1_layer_param.has_dropout_param()) {
-    layer_param->mutable_dropout_param()->CopyFrom(
-        v1_layer_param.dropout_param());
-  }
-  if (v1_layer_param.has_dummy_data_param()) {
-    layer_param->mutable_dummy_data_param()->CopyFrom(
-        v1_layer_param.dummy_data_param());
-  }
-  if (v1_layer_param.has_eltwise_param()) {
-    layer_param->mutable_eltwise_param()->CopyFrom(
-        v1_layer_param.eltwise_param());
-  }
-  if (v1_layer_param.has_exp_param()) {
-    layer_param->mutable_exp_param()->CopyFrom(
-        v1_layer_param.exp_param());
-  }
-  if (v1_layer_param.has_hdf5_data_param()) {
-    layer_param->mutable_hdf5_data_param()->CopyFrom(
-        v1_layer_param.hdf5_data_param());
-  }
-  if (v1_layer_param.has_hdf5_output_param()) {
-    layer_param->mutable_hdf5_output_param()->CopyFrom(
-        v1_layer_param.hdf5_output_param());
-  }
-  if (v1_layer_param.has_hinge_loss_param()) {
-    layer_param->mutable_hinge_loss_param()->CopyFrom(
-        v1_layer_param.hinge_loss_param());
-  }
-  if (v1_layer_param.has_image_data_param()) {
-    layer_param->mutable_image_data_param()->CopyFrom(
-        v1_layer_param.image_data_param());
-  }
-  if (v1_layer_param.has_infogain_loss_param()) {
-    layer_param->mutable_infogain_loss_param()->CopyFrom(
-        v1_layer_param.infogain_loss_param());
-  }
-  if (v1_layer_param.has_inner_product_param()) {
-    layer_param->mutable_inner_product_param()->CopyFrom(
-        v1_layer_param.inner_product_param());
-  }
-  if (v1_layer_param.has_lrn_param()) {
-    layer_param->mutable_lrn_param()->CopyFrom(
-        v1_layer_param.lrn_param());
-  }
-  if (v1_layer_param.has_memory_data_param()) {
-    layer_param->mutable_memory_data_param()->CopyFrom(
-        v1_layer_param.memory_data_param());
-  }
-  if (v1_layer_param.has_mvn_param()) {
-    layer_param->mutable_mvn_param()->CopyFrom(
-        v1_layer_param.mvn_param());
-  }
-  if (v1_layer_param.has_pooling_param()) {
-    layer_param->mutable_pooling_param()->CopyFrom(
-        v1_layer_param.pooling_param());
-  }
-  if (v1_layer_param.has_power_param()) {
-    layer_param->mutable_power_param()->CopyFrom(
-        v1_layer_param.power_param());
-  }
-  if (v1_layer_param.has_relu_param()) {
-    layer_param->mutable_relu_param()->CopyFrom(
-        v1_layer_param.relu_param());
-  }
-  if (v1_layer_param.has_sigmoid_param()) {
-    layer_param->mutable_sigmoid_param()->CopyFrom(
-        v1_layer_param.sigmoid_param());
-  }
-  if (v1_layer_param.has_softmax_param()) {
-    layer_param->mutable_softmax_param()->CopyFrom(
-        v1_layer_param.softmax_param());
-  }
-  if (v1_layer_param.has_slice_param()) {
-    layer_param->mutable_slice_param()->CopyFrom(
-        v1_layer_param.slice_param());
-  }
-  if (v1_layer_param.has_tanh_param()) {
-    layer_param->mutable_tanh_param()->CopyFrom(
-        v1_layer_param.tanh_param());
-  }
-  if (v1_layer_param.has_threshold_param()) {
-    layer_param->mutable_threshold_param()->CopyFrom(
-        v1_layer_param.threshold_param());
-  }
-  if (v1_layer_param.has_window_data_param()) {
-    layer_param->mutable_window_data_param()->CopyFrom(
-        v1_layer_param.window_data_param());
-  }
-  if (v1_layer_param.has_transform_param()) {
-    layer_param->mutable_transform_param()->CopyFrom(
-        v1_layer_param.transform_param());
-  }
-  if (v1_layer_param.has_loss_param()) {
-    layer_param->mutable_loss_param()->CopyFrom(
-        v1_layer_param.loss_param());
-  }
-  if (v1_layer_param.has_layer()) {
-    LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring.";
-    is_fully_compatible = false;
-  }
-  return is_fully_compatible;
+	LayerParameter* layer_param) {
+	layer_param->Clear();
+	bool is_fully_compatible = true;
+	for (int i = 0; i < v1_layer_param.bottom_size(); ++i) {
+		layer_param->add_bottom(v1_layer_param.bottom(i));
+	}
+	for (int i = 0; i < v1_layer_param.top_size(); ++i) {
+		layer_param->add_top(v1_layer_param.top(i));
+	}
+	if (v1_layer_param.has_name()) {
+		layer_param->set_name(v1_layer_param.name());
+	}
+	for (int i = 0; i < v1_layer_param.include_size(); ++i) {
+		layer_param->add_include()->CopyFrom(v1_layer_param.include(i));
+	}
+	for (int i = 0; i < v1_layer_param.exclude_size(); ++i) {
+		layer_param->add_exclude()->CopyFrom(v1_layer_param.exclude(i));
+	}
+	if (v1_layer_param.has_type()) {
+		layer_param->set_type(UpgradeV1LayerType(v1_layer_param.type()));
+	}
+	for (int i = 0; i < v1_layer_param.blobs_size(); ++i) {
+		layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i));
+	}
+	for (int i = 0; i < v1_layer_param.param_size(); ++i) {
+		while (layer_param->param_size() <= i) {
+			layer_param->add_param();
+		}
+		layer_param->mutable_param(i)->set_name(v1_layer_param.param(i));
+	}
+	ParamSpec_DimCheckMode mode;
+	for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) {
+		while (layer_param->param_size() <= i) {
+			layer_param->add_param();
+		}
+		switch (v1_layer_param.blob_share_mode(i)) {
+			case V1LayerParameter_DimCheckMode_STRICT:
+				mode = ParamSpec_DimCheckMode_STRICT;
+				break;
+			case V1LayerParameter_DimCheckMode_PERMISSIVE:
+				mode = ParamSpec_DimCheckMode_PERMISSIVE;
+				break;
+			default:
+				LOG(FATAL) << "Unknown blob_share_mode: "
+					<< v1_layer_param.blob_share_mode(i);
+				break;
+		}
+		layer_param->mutable_param(i)->set_share_mode(mode);
+	}
+	for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) {
+		while (layer_param->param_size() <= i) {
+			layer_param->add_param();
+		}
+		layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i));
+	}
+	for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) {
+		while (layer_param->param_size() <= i) {
+			layer_param->add_param();
+		}
+		layer_param->mutable_param(i)->set_decay_mult(
+			v1_layer_param.weight_decay(i));
+	}
+	for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) {
+		layer_param->add_loss_weight(v1_layer_param.loss_weight(i));
+	}
+	if (v1_layer_param.has_accuracy_param()) {
+		layer_param->mutable_accuracy_param()->CopyFrom(
+			v1_layer_param.accuracy_param());
+	}
+	if (v1_layer_param.has_argmax_param()) {
+		layer_param->mutable_argmax_param()->CopyFrom(
+			v1_layer_param.argmax_param());
+	}
+	if (v1_layer_param.has_concat_param()) {
+		layer_param->mutable_concat_param()->CopyFrom(
+			v1_layer_param.concat_param());
+	}
+	if (v1_layer_param.has_contrastive_loss_param()) {
+		layer_param->mutable_contrastive_loss_param()->CopyFrom(
+			v1_layer_param.contrastive_loss_param());
+	}
+	if (v1_layer_param.has_convolution_param()) {
+		layer_param->mutable_convolution_param()->CopyFrom(
+			v1_layer_param.convolution_param());
+	}
+	if (v1_layer_param.has_data_param()) {
+		layer_param->mutable_data_param()->CopyFrom(
+			v1_layer_param.data_param());
+	}
+	if (v1_layer_param.has_dropout_param()) {
+		layer_param->mutable_dropout_param()->CopyFrom(
+			v1_layer_param.dropout_param());
+	}
+	if (v1_layer_param.has_dummy_data_param()) {
+		layer_param->mutable_dummy_data_param()->CopyFrom(
+			v1_layer_param.dummy_data_param());
+	}
+	if (v1_layer_param.has_eltwise_param()) {
+		layer_param->mutable_eltwise_param()->CopyFrom(
+			v1_layer_param.eltwise_param());
+	}
+	if (v1_layer_param.has_exp_param()) {
+		layer_param->mutable_exp_param()->CopyFrom(
+			v1_layer_param.exp_param());
+	}
+	if (v1_layer_param.has_hdf5_data_param()) {
+		layer_param->mutable_hdf5_data_param()->CopyFrom(
+			v1_layer_param.hdf5_data_param());
+	}
+	if (v1_layer_param.has_hdf5_output_param()) {
+		layer_param->mutable_hdf5_output_param()->CopyFrom(
+			v1_layer_param.hdf5_output_param());
+	}
+	if (v1_layer_param.has_hinge_loss_param()) {
+		layer_param->mutable_hinge_loss_param()->CopyFrom(
+			v1_layer_param.hinge_loss_param());
+	}
+	if (v1_layer_param.has_image_data_param()) {
+		layer_param->mutable_image_data_param()->CopyFrom(
+			v1_layer_param.image_data_param());
+	}
+	if (v1_layer_param.has_infogain_loss_param()) {
+		layer_param->mutable_infogain_loss_param()->CopyFrom(
+			v1_layer_param.infogain_loss_param());
+	}
+	if (v1_layer_param.has_inner_product_param()) {
+		layer_param->mutable_inner_product_param()->CopyFrom(
+			v1_layer_param.inner_product_param());
+	}
+	if (v1_layer_param.has_lrn_param()) {
+		layer_param->mutable_lrn_param()->CopyFrom(
+			v1_layer_param.lrn_param());
+	}
+	if (v1_layer_param.has_memory_data_param()) {
+		layer_param->mutable_memory_data_param()->CopyFrom(
+			v1_layer_param.memory_data_param());
+	}
+	if (v1_layer_param.has_mvn_param()) {
+		layer_param->mutable_mvn_param()->CopyFrom(
+			v1_layer_param.mvn_param());
+	}
+	if (v1_layer_param.has_pooling_param()) {
+		layer_param->mutable_pooling_param()->CopyFrom(
+			v1_layer_param.pooling_param());
+	}
+	if (v1_layer_param.has_power_param()) {
+		layer_param->mutable_power_param()->CopyFrom(
+			v1_layer_param.power_param());
+	}
+	if (v1_layer_param.has_relu_param()) {
+		layer_param->mutable_relu_param()->CopyFrom(
+			v1_layer_param.relu_param());
+	}
+	if (v1_layer_param.has_sigmoid_param()) {
+		layer_param->mutable_sigmoid_param()->CopyFrom(
+			v1_layer_param.sigmoid_param());
+	}
+	if (v1_layer_param.has_softmax_param()) {
+		layer_param->mutable_softmax_param()->CopyFrom(
+			v1_layer_param.softmax_param());
+	}
+	if (v1_layer_param.has_slice_param()) {
+		layer_param->mutable_slice_param()->CopyFrom(
+			v1_layer_param.slice_param());
+	}
+	if (v1_layer_param.has_tanh_param()) {
+		layer_param->mutable_tanh_param()->CopyFrom(
+			v1_layer_param.tanh_param());
+	}
+	if (v1_layer_param.has_threshold_param()) {
+		layer_param->mutable_threshold_param()->CopyFrom(
+			v1_layer_param.threshold_param());
+	}
+	if (v1_layer_param.has_window_data_param()) {
+		layer_param->mutable_window_data_param()->CopyFrom(
+			v1_layer_param.window_data_param());
+	}
+	if (v1_layer_param.has_transform_param()) {
+		layer_param->mutable_transform_param()->CopyFrom(
+			v1_layer_param.transform_param());
+	}
+	if (v1_layer_param.has_loss_param()) {
+		layer_param->mutable_loss_param()->CopyFrom(
+			v1_layer_param.loss_param());
+	}
+	if (v1_layer_param.has_layer()) {
+		LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring.";
+		is_fully_compatible = false;
+	}
+	return is_fully_compatible;
 }
 
 const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) {
-  switch (type) {
-  case V1LayerParameter_LayerType_NONE:
-    return "";
-  case V1LayerParameter_LayerType_ABSVAL:
-    return "AbsVal";
-  case V1LayerParameter_LayerType_ACCURACY:
-    return "Accuracy";
-  case V1LayerParameter_LayerType_ARGMAX:
-    return "ArgMax";
-  case V1LayerParameter_LayerType_BNLL:
-    return "BNLL";
-  case V1LayerParameter_LayerType_CONCAT:
-    return "Concat";
-  case V1LayerParameter_LayerType_CONTRASTIVE_LOSS:
-    return "ContrastiveLoss";
-  case V1LayerParameter_LayerType_CONVOLUTION:
-    return "Convolution";
-  case V1LayerParameter_LayerType_DECONVOLUTION:
-    return "Deconvolution";
-  case V1LayerParameter_LayerType_DATA:
-    return "Data";
-  case V1LayerParameter_LayerType_DROPOUT:
-    return "Dropout";
-  case V1LayerParameter_LayerType_DUMMY_DATA:
-    return "DummyData";
-  case V1LayerParameter_LayerType_EUCLIDEAN_LOSS:
-    return "EuclideanLoss";
-  case V1LayerParameter_LayerType_ELTWISE:
-    return "Eltwise";
-  case V1LayerParameter_LayerType_EXP:
-    return "Exp";
-  case V1LayerParameter_LayerType_FLATTEN:
-    return "Flatten";
-  case V1LayerParameter_LayerType_HDF5_DATA:
-    return "HDF5Data";
-  case V1LayerParameter_LayerType_HDF5_OUTPUT:
-    return "HDF5Output";
-  case V1LayerParameter_LayerType_HINGE_LOSS:
-    return "HingeLoss";
-  case V1LayerParameter_LayerType_IM2COL:
-    return "Im2col";
-  case V1LayerParameter_LayerType_IMAGE_DATA:
-    return "ImageData";
-  case V1LayerParameter_LayerType_INFOGAIN_LOSS:
-    return "InfogainLoss";
-  case V1LayerParameter_LayerType_INNER_PRODUCT:
-    return "InnerProduct";
-  case V1LayerParameter_LayerType_LRN:
-    return "LRN";
-  case V1LayerParameter_LayerType_MEMORY_DATA:
-    return "MemoryData";
-  case V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS:
-    return "MultinomialLogisticLoss";
-  case V1LayerParameter_LayerType_MVN:
-    return "MVN";
-  case V1LayerParameter_LayerType_POOLING:
-    return "Pooling";
-  case V1LayerParameter_LayerType_POWER:
-    return "Power";
-  case V1LayerParameter_LayerType_RELU:
-    return "ReLU";
-  case V1LayerParameter_LayerType_SIGMOID:
-    return "Sigmoid";
-  case V1LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS:
-    return "SigmoidCrossEntropyLoss";
-  case V1LayerParameter_LayerType_SILENCE:
-    return "Silence";
-  case V1LayerParameter_LayerType_SOFTMAX:
-    return "Softmax";
-  case V1LayerParameter_LayerType_SOFTMAX_LOSS:
-    return "SoftmaxWithLoss";
-  case V1LayerParameter_LayerType_SPLIT:
-    return "Split";
-  case V1LayerParameter_LayerType_SLICE:
-    return "Slice";
-  case V1LayerParameter_LayerType_TANH:
-    return "TanH";
-  case V1LayerParameter_LayerType_WINDOW_DATA:
-    return "WindowData";
-  case V1LayerParameter_LayerType_THRESHOLD:
-    return "Threshold";
-  default:
-    LOG(FATAL) << "Unknown V1LayerParameter layer type: " << type;
-    return "";
-  }
+	switch (type) {
+		case V1LayerParameter_LayerType_NONE:
+			return "";
+		case V1LayerParameter_LayerType_ABSVAL:
+			return "AbsVal";
+		case V1LayerParameter_LayerType_ACCURACY:
+			return "Accuracy";
+		case V1LayerParameter_LayerType_ARGMAX:
+			return "ArgMax";
+		case V1LayerParameter_LayerType_BNLL:
+			return "BNLL";
+		case V1LayerParameter_LayerType_CONCAT:
+			return "Concat";
+		case V1LayerParameter_LayerType_CONTRASTIVE_LOSS:
+			return "ContrastiveLoss";
+		case V1LayerParameter_LayerType_CONVOLUTION:
+			return "Convolution";
+		case V1LayerParameter_LayerType_DECONVOLUTION:
+			return "Deconvolution";
+		case V1LayerParameter_LayerType_DATA:
+			return "Data";
+		case V1LayerParameter_LayerType_DROPOUT:
+			return "Dropout";
+		case V1LayerParameter_LayerType_DUMMY_DATA:
+			return "DummyData";
+		case V1LayerParameter_LayerType_EUCLIDEAN_LOSS:
+			return "EuclideanLoss";
+		case V1LayerParameter_LayerType_ELTWISE:
+			return "Eltwise";
+		case V1LayerParameter_LayerType_EXP:
+			return "Exp";
+		case V1LayerParameter_LayerType_FLATTEN:
+			return "Flatten";
+		case V1LayerParameter_LayerType_HDF5_DATA:
+			return "HDF5Data";
+		case V1LayerParameter_LayerType_HDF5_OUTPUT:
+			return "HDF5Output";
+		case V1LayerParameter_LayerType_HINGE_LOSS:
+			return "HingeLoss";
+		case V1LayerParameter_LayerType_IM2COL:
+			return "Im2col";
+		case V1LayerParameter_LayerType_IMAGE_DATA:
+			return "ImageData";
+		case V1LayerParameter_LayerType_INFOGAIN_LOSS:
+			return "InfogainLoss";
+		case V1LayerParameter_LayerType_INNER_PRODUCT:
+			return "InnerProduct";
+		case V1LayerParameter_LayerType_LRN:
+			return "LRN";
+		case V1LayerParameter_LayerType_MEMORY_DATA:
+			return "MemoryData";
+		case V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS:
+			return "MultinomialLogisticLoss";
+		case V1LayerParameter_LayerType_MVN:
+			return "MVN";
+		case V1LayerParameter_LayerType_POOLING:
+			return "Pooling";
+		case V1LayerParameter_LayerType_POWER:
+			return "Power";
+		case V1LayerParameter_LayerType_RELU:
+			return "ReLU";
+		case V1LayerParameter_LayerType_SIGMOID:
+			return "Sigmoid";
+		case V1LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS:
+			return "SigmoidCrossEntropyLoss";
+		case V1LayerParameter_LayerType_SILENCE:
+			return "Silence";
+		case V1LayerParameter_LayerType_SOFTMAX:
+			return "Softmax";
+		case V1LayerParameter_LayerType_SOFTMAX_LOSS:
+			return "SoftmaxWithLoss";
+		case V1LayerParameter_LayerType_SPLIT:
+			return "Split";
+		case V1LayerParameter_LayerType_SLICE:
+			return "Slice";
+		case V1LayerParameter_LayerType_TANH:
+			return "TanH";
+		case V1LayerParameter_LayerType_WINDOW_DATA:
+			return "WindowData";
+		case V1LayerParameter_LayerType_THRESHOLD:
+			return "Threshold";
+		default:
+			LOG(FATAL) << "Unknown V1LayerParameter layer type: " << type;
+			return "";
+	}
 }
 
 void ReadNetParamsFromTextFileOrDie(const string& param_file,
-                                    NetParameter* param) {
-  CHECK(ReadProtoFromTextFile(param_file, param))
-      << "Failed to parse NetParameter file: " << param_file;
-  UpgradeNetAsNeeded(param_file, param);
+	NetParameter* param) {
+	CHECK(ReadProtoFromTextFile(param_file, param))
+		<< "Failed to parse NetParameter file: " << param_file;
+	UpgradeNetAsNeeded(param_file, param);
 }
 
 void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-                                      NetParameter* param) {
-  CHECK(ReadProtoFromBinaryFile(param_file, param))
-      << "Failed to parse NetParameter file: " << param_file;
-  UpgradeNetAsNeeded(param_file, param);
+	NetParameter* param) {
+	CHECK(ReadProtoFromBinaryFile(param_file, param))
+		<< "Failed to parse NetParameter file: " << param_file;
+	UpgradeNetAsNeeded(param_file, param);
 }
 
 }  // namespace caffe

From f931d4cac3d78a859518b77abb0e598e5a0bfe0e Mon Sep 17 00:00:00 2001
From: Yibing <yuan.gao@noplz.name>
Date: Wed, 9 Sep 2015 18:38:06 +0800
Subject: [PATCH 070/124] Pass concat_layer & spp_layer; remove kernels in
 lrn_layer

---
 include/caffe/util/ocl_wrapper.hpp |  72 ++-
 include/caffe/vision_layers.hpp    | 144 +++---
 src/caffe/layers/lrn_layer.cpp     |  90 ++--
 src/caffe/layers/softmax_layer.cpp |   2 +-
 src/caffe/ocl/concat_layer.cl      |  48 +-
 src/caffe/ocl/lrn_layer.cl         |  12 +-
 src/caffe/ocl/pooling_layer.cl     |  24 +-
 src/caffe/util/ocl_wrapper.cpp     | 785 +++++++++++++----------------
 8 files changed, 547 insertions(+), 630 deletions(-)

diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index c4149789..dbd712ea 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -286,44 +286,40 @@ void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
 template<typename Dtype>
 void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data);
 
-template<typename Dtype>
-void LRNFillScale(cl_kernel LFSkernel, const int nthreads,
-	const Dtype* const in,
-	const int num, const int channels, const int height,
-	const int width, const int size, const Dtype alpha_over_size,
-	const Dtype k, Dtype* const scale);
-
-template<typename Dtype>
-void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in,
-	Dtype* scale, Dtype negative_beta, Dtype* out);
-
-template<typename Dtype>
-void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
-	const Dtype* const bottom_data, const Dtype* const top_data,
-	const Dtype* const scale, const Dtype* const top_diff,
-	const int num, const int channels, const int height,
-	const int width, const int size, const Dtype negative_beta,
-	const Dtype cache_ratio, Dtype* const bottom_diff);
-template<typename Dtype>
-void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);
-
-template<typename Dtype>
-void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y);
-
-template<typename Dtype>
-void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
-
-template<typename Dtype>
-void BNLLBackward(const int count, const Dtype* top_diff,
-	const Dtype* bottom_data, Dtype *bottom_diff);
-
-template<typename Dtype>
-void Concat(const int nthreads, const Dtype* in_data, const bool forward,
-	const int num_concats, const int concat_size,
-	const int top_concat_axis, const int bottom_concat_axis,
-	const int offset_concat_axis, Dtype *out_data);
-
-template<typename Dtype>
+template <typename Dtype>
+void LRNFillScale(const int nthreads, const Dtype* const in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype alpha_over_size,
+    const Dtype k, Dtype* const scale);
+
+template <typename Dtype>
+void LRNComputeOutput(int nthreads, const Dtype* in,
+     Dtype* scale, Dtype negative_beta, Dtype* out);
+
+template <typename Dtype>
+void LRNComputeDiff(const int nthreads,
+    const Dtype* const bottom_data, const Dtype* const top_data,
+    const Dtype* const scale, const Dtype* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype negative_beta,
+    const Dtype cache_ratio, Dtype* const bottom_diff);
+template <typename Dtype>
+void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void  BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
+
+template <typename Dtype>
+void  BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff);
+
+template <typename Dtype>
+void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int  concat_size,
+        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data);
+
+template <typename Dtype>
 void CLLBackward(const int count, const int channels,
 	const Dtype margin, const bool legacy_version, const Dtype alpha,
 	const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 9b718bd8..eb959190 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -384,83 +384,75 @@ template<typename Dtype> class SplitLayer;
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
-class LRNLayer: public Layer<Dtype> {
-	public:
-		explicit LRNLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "LRN";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-		virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-		virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-		virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int size_;
-		int pre_pad_;
-		Dtype alpha_;
-		Dtype beta_;
-		Dtype k_;
-		int num_;
-		int channels_;
-		int height_;
-		int width_;
-
-		// Fields used for normalization ACROSS_CHANNELS
-		// scale_ stores the intermediate summing results
-		Blob<Dtype> scale_;
+template <typename Dtype>
+class LRNLayer : public Layer<Dtype> {
+ public:
+  explicit LRNLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "LRN"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int size_;
+  int pre_pad_;
+  Dtype alpha_;
+  Dtype beta_;
+  Dtype k_;
+  int num_;
+  int channels_;
+  int height_;
+  int width_;
+
+  // Fields used for normalization ACROSS_CHANNELS
+  // scale_ stores the intermediate summing results
+  Blob<Dtype> scale_;
+
+  // Fields used for normalization WITHIN_CHANNEL
+  shared_ptr<SplitLayer<Dtype> > split_layer_;
+  vector<Blob<Dtype>*> split_top_vec_;
+  shared_ptr<PowerLayer<Dtype> > square_layer_;
+  Blob<Dtype> square_input_;
+  Blob<Dtype> square_output_;
+  vector<Blob<Dtype>*> square_bottom_vec_;
+  vector<Blob<Dtype>*> square_top_vec_;
+  shared_ptr<PoolingLayer<Dtype> > pool_layer_;
+  Blob<Dtype> pool_output_;
+  vector<Blob<Dtype>*> pool_top_vec_;
+  shared_ptr<PowerLayer<Dtype> > power_layer_;
+  Blob<Dtype> power_output_;
+  vector<Blob<Dtype>*> power_top_vec_;
+  shared_ptr<EltwiseLayer<Dtype> > product_layer_;
+  Blob<Dtype> product_input_;
+  vector<Blob<Dtype>*> product_bottom_vec_;
 
-		// Fields used for normalization WITHIN_CHANNEL
-		shared_ptr<SplitLayer<Dtype> > split_layer_;
-		vector<Blob<Dtype>*> split_top_vec_;
-		shared_ptr<PowerLayer<Dtype> > square_layer_;
-		Blob<Dtype> square_input_;
-		Blob<Dtype> square_output_;
-		vector<Blob<Dtype>*> square_bottom_vec_;
-		vector<Blob<Dtype>*> square_top_vec_;
-		shared_ptr<PoolingLayer<Dtype> > pool_layer_;
-		Blob<Dtype> pool_output_;
-		vector<Blob<Dtype>*> pool_top_vec_;
-		shared_ptr<PowerLayer<Dtype> > power_layer_;
-		Blob<Dtype> power_output_;
-		vector<Blob<Dtype>*> power_top_vec_;
-		shared_ptr<EltwiseLayer<Dtype> > product_layer_;
-		Blob<Dtype> product_input_;
-		vector<Blob<Dtype>*> product_bottom_vec_;
-
-		cl_kernel LFSkernel, LCDkernel, LCOkernel;
 };
 
 /*n
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 0f936f22..58f835b6 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -70,29 +70,26 @@ void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 
 template<typename Dtype>
 void LRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-		<< "corresponding to (num, channels, height, width)";
-	num_ = bottom[0]->num();
-	channels_ = bottom[0]->channels();
-	height_ = bottom[0]->height();
-	width_ = bottom[0]->width();
-	switch (this->layer_param_.lrn_param().norm_region()) {
-		case LRNParameter_NormRegion_ACROSS_CHANNELS:
-			top[0]->Reshape(num_, channels_, height_, width_);
-			scale_.Reshape(num_, channels_, height_, width_);
-			break;
-		case LRNParameter_NormRegion_WITHIN_CHANNEL:
-			split_layer_->Reshape(bottom, split_top_vec_);
-			square_layer_->Reshape(square_bottom_vec_, square_top_vec_);
-			pool_layer_->Reshape(square_top_vec_, pool_top_vec_);
-			power_layer_->Reshape(pool_top_vec_, power_top_vec_);
-			product_layer_->Reshape(product_bottom_vec_, top);
-			break;
-	}
-	LFSkernel = clCreateKernel(amdDevice.Program, "LRNFillScalefloat", NULL);
-	LCDkernel = clCreateKernel(amdDevice.Program, "LRNComputeDifffloat", NULL);
-	LCOkernel = clCreateKernel(amdDevice.Program, "LRNComputeOutputfloat", NULL);
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+      << "corresponding to (num, channels, height, width)";
+  num_ = bottom[0]->num();
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    top[0]->Reshape(num_, channels_, height_, width_);
+    scale_.Reshape(num_, channels_, height_, width_);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    split_layer_->Reshape(bottom, split_top_vec_);
+    square_layer_->Reshape(square_bottom_vec_, square_top_vec_);
+    pool_layer_->Reshape(square_top_vec_, pool_top_vec_);
+    power_layer_->Reshape(pool_top_vec_, power_top_vec_);
+    product_layer_->Reshape(product_bottom_vec_, top);
+    break;
+  }
 }
 
 template<typename Dtype>
@@ -254,35 +251,32 @@ void LRNLayer<Dtype>::WithinChannelBackward(
 
 template<typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_gpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	// First, compute scale
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	Dtype* scale_data = scale_.mutable_gpu_data();
-	// We will launch one kernel for each pixel location, and have the kernel
-	// go through all the channels.
-	int n_threads = num_ * height_ * width_;
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	LRNFillScale(LFSkernel,
-		n_threads, bottom_data, num_, channels_, height_, width_, size_,
-		alpha_ / size_, k_, scale_data);
-	n_threads = bottom[0]->count();
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	LRNComputeOutput(LCOkernel,
-		n_threads, bottom_data, scale_data, -beta_, top_data);
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  // First, compute scale
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  Dtype* scale_data = scale_.mutable_gpu_data();
+  // We will launch one kernel for each pixel location, and have the kernel
+  // go through all the channels.
+  int n_threads = num_ * height_ * width_;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_,
+      alpha_ / size_, k_, scale_data);
+  n_threads = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data);
 }
 
 template<typename Dtype>
 void LRNLayer<Dtype>::CrossChannelBackward_gpu(
-	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
-	int n_threads = num_ * height_ * width_;
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	LRNComputeDiff(LCDkernel,
-		n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
-		scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
-		size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
-		bottom[0]->mutable_gpu_diff());
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  int n_threads = num_ * height_ * width_;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
+      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
+      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+      bottom[0]->mutable_gpu_diff());
 }
 
 template<typename Dtype>
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 117a966f..24d1e4b8 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -1,4 +1,4 @@
-s#include <algorithm>
+#include <algorithm>
 #include <vector>
 
 #include "caffe/layer.hpp"
diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl
index de504dec..2c2c76ee 100644
--- a/src/caffe/ocl/concat_layer.cl
+++ b/src/caffe/ocl/concat_layer.cl
@@ -26,29 +26,29 @@
 
 template <class T>
 __kernel void Concat(const int nthreads, __global const T* in_data,
-	const bool forward, const int num_concats, const int concat_size,
-	const int top_concat_axis, const int bottom_concat_axis,
-	const int offset_concat_axis, __global T* out_data) {
-	int index = get_global_id(0);
-	if(index < nthreads) {
-		const int total_concat_size = concat_size * bottom_concat_axis;
-		const int concat_num = index / total_concat_size;
-		const int concat_index = index % total_concat_size;
-		const int top_index = concat_index +
-		(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
-		if (forward) {
-			out_data[top_index] = in_data[index];
-		} else {
-			out_data[index] = in_data[top_index];
-		}
-	}
+    const int forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global T* out_data) {
+    int index = get_global_id(0);
+    if(index < nthreads) {
+        const int total_concat_size = concat_size * bottom_concat_axis;
+        const int concat_num = index / total_concat_size;
+        const int concat_index = index % total_concat_size;
+        const int top_index = concat_index +
+            (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
+        if (forward == 1) {
+            out_data[top_index] = in_data[index];
+        } else {
+            out_data[index] = in_data[top_index];
+        }
+    }
 }
 
-template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data,
-	const bool forward, const int num_concats, const int concat_size,
-	const int top_concat_axis, const int bottom_concat_axis,
-	const int offset_concat_axis, __global float* out_data);
-template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data,
-	const bool forward, const int num_concats, const int concat_size,
-	const int top_concat_axis, const int bottom_concat_axis,
-	const int offset_concat_axis, __global double* out_data);
+template __attribute__((mangled_name(Concat_float))) __kernel void  Concat(const int nthreads, __global const float* in_data,
+    const int forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global float* out_data);
+template __attribute__((mangled_name(Concat_double))) __kernel void  Concat(const int nthreads, __global const double* in_data,
+    const int forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global double* out_data);
diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl
index 620bad72..1a53f772 100644
--- a/src/caffe/ocl/lrn_layer.cl
+++ b/src/caffe/ocl/lrn_layer.cl
@@ -31,8 +31,8 @@ __kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* s
 	for(index; index < nthreads; index += tmp)
 	out[index] = in[index] * pow(scale[index], negative_beta);
 }
-template __attribute__((mangled_name(LRNComputeOutputfloat))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out);
-template __attribute__((mangled_name(LRNComputeOutputdouble))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out);
+template __attribute__((mangled_name(LRNComputeOutput_float))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out);
+template __attribute__((mangled_name(LRNComputeOutput_double))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out);
 
 template <class T>
 __kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {
@@ -78,8 +78,8 @@ __kernel void LRNFillScale(const int nthreads, __global T* in, const int num, co
 		}
 	}
 }
-template __attribute__((mangled_name(LRNFillScalefloat))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale);
-template __attribute__((mangled_name(LRNFillScaledouble))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale);
+template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k,  __global float* scale);
+template __attribute__((mangled_name(LRNFillScale_double))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale);
 
 template <class T>
 __kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) {
@@ -135,5 +135,5 @@ __kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __glob
 	}
 }
 
-template __attribute__((mangled_name(LRNComputeDifffloat))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
-template __attribute__((mangled_name(LRNComputeDiffdouble))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff);
+template __attribute__((mangled_name(LRNComputeDiff_float))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
+template __attribute__((mangled_name(LRNComputeDiff_double))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff);
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
index 11352e16..3162b92e 100644
--- a/src/caffe/ocl/pooling_layer.cl
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -279,15 +279,15 @@ __kernel void StoPoolBackward(const int nthreads,
 
 	}
 }
-template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward<float>(const int nthreads,
-	__global float* rand_idx, __global float* top_diff,
-	const int num, const int channels, const int height,
-	const int width, const int pooled_height, const int pooled_width,
-	const int kernel_h, const int kernel_w, const int stride_h,
-	const int stride_w, __global float* bottom_diff);
-template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward<double>(const int nthreads,
-	__global double* rand_idx, __global double* top_diff,
-	const int num, const int channels, const int height,
-	const int width, const int pooled_height, const int pooled_width,
-	const int kernel_h, const int kernel_w, const int stride_h,
-	const int stride_w, __global double* bottom_diff);
+template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel  void StoPoolBackward(const int nthreads,
+    __global float* rand_idx, __global float* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global float* bottom_diff);
+template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads,
+    __global double* rand_idx, __global double* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global double* bottom_diff);
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index f7cf9c07..be0c5894 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -1338,431 +1338,366 @@ template void TanHBackward<double>(const int count, const double* top_diff,
 
 template<typename Dtype>
 void opttrans(const Dtype* data_im, const int im_offset, const int channels,
-	const int height, const int width, Dtype* data_opt, const int opt_offset,
-	const int optnum) {
-	std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	int num_kernels = channels * height * width * optnum;
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-
-template void opttrans<float>(const float* data_im, const int im_offset,
-	const int channels,
-	const int height, const int width, float* data_opt, const int opt_offset,
-	const int optnum);
-template void opttrans<double>(const double* data_im, const int im_offset,
-	const int channels,
-	const int height, const int width, double* data_opt, const int opt_offset,
-	const int optnum);
-
-template<typename Dtype>
-void LRNFillScale(cl_kernel LFSkernel, const int nthreads,
-	const Dtype* const in,
-	const int num, const int channels, const int height,
-	const int width, const int size, const Dtype alpha_over_size,
-	const Dtype k, Dtype* const scale) {
-	cl_int ret;
-	ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in);
-	ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num);
-	ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size);
-	ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size);
-	ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k);
-	ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale);
-	OCL_CHECK(ret);
-	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-template void LRNFillScale<float>(cl_kernel kernel, const int nthreads,
-	const float* const in,
-	const int num, const int channels, const int height,
-	const int width, const int size, const float alpha_over_size,
-	const float k, float* const scale);
-template void LRNFillScale<double>(cl_kernel kernel, const int nthreads,
-	const double* const in,
-	const int num, const int channels, const int height,
-	const int width, const int size, const double alpha_over_size,
-	const double k, double* const scale);
-
-template<typename Dtype>
-void LRNComputeOutput(cl_kernel LCOkernel, int nthreads, const Dtype* in,
-	Dtype* scale, Dtype negative_beta, Dtype* out) {
-	cl_int ret;
-	ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in);
-	ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale);
-	ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta);
-	ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out);
-	OCL_CHECK(ret);
-	size_t uiGlobal_Work_Size2[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size2[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,
-			uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
-}
-template void LRNComputeOutput<float>(cl_kernel kernel, int nthreads,
-	const float* in,
-	float* scale, float negative_beta, float* out);
-template void LRNComputeOutput<double>(cl_kernel kernel, int nthreads,
-	const double* in,
-	double* scale, double negative_beta, double* out);
-
-template<typename Dtype>
-void LRNComputeDiff(cl_kernel LCDkernel, const int nthreads,
-	const Dtype* const bottom_data, const Dtype* const top_data,
-	const Dtype* const scale, const Dtype* const top_diff,
-	const int num, const int channels, const int height,
-	const int width, const int size, const Dtype negative_beta,
-	const Dtype cache_ratio, Dtype* const bottom_diff) {
-	cl_int ret;
-	ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale);
-	ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num);
-	ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size);
-	ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta);
-	ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio);
-	ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-template void LRNComputeDiff<float>(cl_kernel kernel, const int nthreads,
-	const float* const bottom_data, const float* const top_data,
-	const float* const scale, const float* const top_diff,
-	const int num, const int channels, const int height,
-	const int width, const int size, const float negative_beta,
-	const float cache_ratio, float* const bottom_diff);
-template void LRNComputeDiff<double>(cl_kernel kernel, const int nthreads,
-	const double* const bottom_data, const double* const top_data,
-	const double* const scale, const double* const top_diff,
-	const int num, const int channels, const int height,
-	const int width, const int size, const double negative_beta,
-	const double cache_ratio, double* const bottom_diff);
-
-template<typename Dtype>
-void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) {
-	std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_add<float>(const int n, const float* in1,
-	const float* in2, float* y);
-template void caffe_gpu_add<double>(const int n, const double* in1,
-	const double* in2, double* y);
-
-template<typename Dtype>
-void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) {
-	std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) N };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_sign_ocl<float>(const int N, const float* X, float* Y);
-template void caffe_gpu_sign_ocl<double>(const int N, const double* X,
-	double* Y);
-
-template<typename Dtype>
-void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) {
-	std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) N };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_abs_ocl<float>(const int N, const float* X, float* Y);
-template void caffe_gpu_abs_ocl<double>(const int N, const double* X,
-	double* Y);
-
-template<typename Dtype>
-void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
-	std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_div<float>(const int n, const float* a, const float* b,
-	float* y);
-template void caffe_gpu_div<double>(const int n, const double* a,
-	const double* b, double* y);
-
-template<typename Dtype>
-void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) {
-	std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_add_scalar<float>(const int n, const float alpha,
-	float* top_data);
-template void caffe_gpu_add_scalar<double>(const int n, const double alpha,
-	double* top_data);
-
-template<typename Dtype>
-void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
-	std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_mul<float>(const int n, const float* a, const float* b,
-	float* y);
-template void caffe_gpu_mul<double>(const int n, const double* a,
-	const double* b, double* y);
-
-template<typename Dtype>
-void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) {
-	std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_powx<float>(const int n, const float* a,
-	const float alpha, float* y);
-template void caffe_gpu_powx<double>(const int n, const double* a,
-	const double alpha, double* y);
-
-template<typename Dtype>
-void DropoutForward(const int count, const Dtype* bottom_data,
-	const int* MaskMem, const Dtype scale_, Dtype* top_data)
-	{
-	std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
-	ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_);
-	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void DropoutForward<float>(const int count, const float* bottom_data,
-	const int* MaskMem, const float scale_, float* top_data);
-template void DropoutForward<double>(const int count, const double* bottom_data,
-	const int* MaskMem, const double scale_, double* top_data);
-
-template<typename Dtype>
-void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
-	const float threshold_, const Dtype scale_, Dtype* bottom_diff)
-	{
-	std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_);
-	ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_);
-	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void DropoutBackward<float>(const int count, const float* top_diff,
-	const int* MaskMem, const float threshold_, const float scale_,
-	float* bottom_diff);
-template void DropoutBackward<double>(const int count, const double* top_diff,
-	const int* MaskMem, const float threshold_, const double scale_,
-	double* bottom_diff);
-
-template<typename Dtype>
-void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data)
-	{
-	std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void BNLLForward<float>(const int count, const float* bottom_data,
-	float *top_data);
-template void BNLLForward<double>(const int count, const double* bottom_data,
-	double *top_data);
-
-template<typename Dtype>
-void BNLLBackward(const int count, const Dtype* top_diff,
-	const Dtype* bottom_data, Dtype *bottom_diff)
-	{
-	std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void BNLLBackward<float>(const int count, const float* top_diff,
-	const float* bottom_data, float *bottom_diff);
-template void BNLLBackward<double>(const int count, const double* top_diff,
-	const double* bottom_data, double *bottom_diff);
-
-template<typename Dtype>
-void Concat(const int nthreads, const Dtype* in_data, const bool forward,
-	const int num_concats, const int concat_size,
-	const int top_concat_axis, const int bottom_concat_axis,
-	const int offset_concat_axis, Dtype *out_data)
-	{
-	std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_bool), (void*) &forward);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats);
-	ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size);
-	ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis);
-	ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis);
-	ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis);
-	ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) nthreads };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-template void Concat<float>(const int nthreads, const float* in_data,
-	const bool forward, const int num_concats, const int concat_size,
-	const int top_concat_axis, const int bottom_concat_axis,
-	const int offset_concat_axis, float *out_data);
-template void Concat<double>(const int nthreads, const double* in_data,
-	const bool forward, const int num_concats, const int concat_size,
-	const int top_concat_axis, const int bottom_concat_axis,
-	const int offset_concat_axis, double *out_data);
-
-template<typename Dtype>
+    const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {
+    std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    int num_kernels = channels * height * width * optnum;
+
+    cl_int ret;
+    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
+    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
+    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset);
+    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
+    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
+    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
+    ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt);
+    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset);
+    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum);
+    OCL_CHECK(ret);
+
+    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
+    size_t uiLocal_Work_Size[] = {256};
+    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+}
+
+template void opttrans<float>(const float* data_im, const int im_offset, const int channels,
+    const int height, const int width, float* data_opt, const int opt_offset, const int optnum);
+template void opttrans<double>(const double* data_im, const int im_offset, const int channels,
+    const int height, const int width, double* data_opt, const int opt_offset, const int optnum);
+
+template <typename Dtype>
+void LRNFillScale(const int nthreads, const Dtype* const in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype alpha_over_size,
+    const Dtype k, Dtype* const scale){
+  std::string kernel_name = "LRNFillScale" + get_dtype_suffix<Dtype>();
+  cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret=clSetKernelArg(LFSkernel,0,sizeof(cl_int),(void*)&nthreads);
+  ret|=clSetKernelArg(LFSkernel,1,sizeof(cl_mem),(void*)&in);
+  ret|=clSetKernelArg(LFSkernel,2,sizeof(cl_int),(void*)&num);
+  ret|=clSetKernelArg(LFSkernel,3,sizeof(cl_int),(void*)&channels);
+  ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height);
+  ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width);
+  ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size);
+  ret|=clSetKernelArg(LFSkernel,7,sizeof(Dtype),(void*)&alpha_over_size);
+  ret|=clSetKernelArg(LFSkernel,8,sizeof(Dtype),(void*)&k);
+  ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size[]={(size_t)nthreads};
+  size_t uiLocal_Work_Size[]={256};
+  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) );
+}
+template void LRNFillScale<float>(const int nthreads, const float* const in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const float alpha_over_size,
+    const float k, float* const scale);
+template void LRNFillScale<double>(const int nthreads, const double* const in,
+    const int num, const int channels, const int height,
+    const int width, const int size, const double alpha_over_size,
+    const double k, double* const scale);
+
+template <typename Dtype>
+void LRNComputeOutput(int nthreads, const Dtype* in,
+     Dtype* scale, Dtype negative_beta, Dtype* out){
+  std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix<Dtype>();
+  cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads);
+  ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in);
+  ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale);
+  ret|=clSetKernelArg(LCOkernel,3,sizeof(Dtype),(void*)&negative_beta);
+  ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size2[]={(size_t)nthreads};
+  size_t uiLocal_Work_Size2[]={256};
+  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) );
+}
+template void LRNComputeOutput<float>(int nthreads, const float* in,
+    float* scale, float negative_beta, float* out);
+template void LRNComputeOutput<double>(int nthreads, const double* in,
+    double* scale, double negative_beta, double* out);
+
+template <typename Dtype>
+void LRNComputeDiff(const int nthreads,
+    const Dtype* const bottom_data, const Dtype* const top_data,
+    const Dtype* const scale, const Dtype* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const Dtype negative_beta,
+    const Dtype cache_ratio, Dtype* const bottom_diff){
+  std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix<Dtype>();
+  cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret=clSetKernelArg(LCDkernel,0,sizeof(cl_int),(void*)&nthreads);
+  ret|=clSetKernelArg(LCDkernel,1,sizeof(cl_mem),(void*)&bottom_data);
+  ret|=clSetKernelArg(LCDkernel,2,sizeof(cl_mem),(void*)&top_data);
+  ret|=clSetKernelArg(LCDkernel,3,sizeof(cl_mem),(void*)&scale);
+  ret|=clSetKernelArg(LCDkernel,4,sizeof(cl_mem),(void*)&top_diff);
+  ret|=clSetKernelArg(LCDkernel,5,sizeof(cl_int),(void*)&num);
+  ret|=clSetKernelArg(LCDkernel,6,sizeof(cl_int),(void*)&channels);
+  ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height);
+  ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width);
+  ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size);
+  ret|=clSetKernelArg(LCDkernel,10,sizeof(Dtype),(void*)&negative_beta);
+  ret|=clSetKernelArg(LCDkernel,11,sizeof(Dtype),(void*)&cache_ratio);
+  ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size[]={(size_t)nthreads};
+  size_t uiLocal_Work_Size[]={256};
+  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) );
+}
+template void LRNComputeDiff<float>(const int nthreads,
+    const float* const bottom_data, const float* const top_data,
+    const float* const scale, const float* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const float negative_beta,
+    const float cache_ratio, float* const bottom_diff);
+template void LRNComputeDiff<double>(const int nthreads,
+    const double* const bottom_data, const double* const top_data,
+    const double* const scale, const double* const top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int size, const double negative_beta,
+    const double cache_ratio, double* const bottom_diff);
+
+template <typename Dtype>
+void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
+    std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&in2);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_add<float> (const int n, const float* in1, const float* in2, float* y);
+template void caffe_gpu_add<double> (const int n, const double* in1, const double* in2, double* y);
+
+template <typename Dtype>
+void caffe_gpu_sign_ocl(const int N,  const Dtype* X, Dtype * Y ){
+    std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)N};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_sign_ocl<float>(const int N,  const float* X, float* Y );
+template void caffe_gpu_sign_ocl<double>(const int N,  const double* X, double* Y );
+
+template <typename Dtype>
+void caffe_gpu_abs_ocl(const int N,  const Dtype* X, Dtype * Y ){
+    std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)N};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_abs_ocl<float>(const int N,  const float* X, float* Y );
+template void caffe_gpu_abs_ocl<double>(const int N,  const double* X, double* Y );
+
+template <typename Dtype>
+void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){
+    std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_div<float> (const int n, const float* a, const float* b, float* y);
+template void caffe_gpu_div<double> (const int n, const double* a, const double* b, double* y);
+
+template <typename Dtype>
+void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){
+     std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_add_scalar<float> (const int n, const float alpha, float* top_data);
+template void caffe_gpu_add_scalar<double> (const int n, const double alpha, double* top_data);
+
+template <typename Dtype>
+void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){
+        std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_mul<float> (const int n, const float* a, const float* b, float* y);
+template void caffe_gpu_mul<double> (const int n, const double* a, const double* b, double* y);
+
+template <typename Dtype>
+void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){
+       std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
+    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    cl_int ret;
+    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
+    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
+    ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha);
+    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
+    OCL_CHECK(ret);
+    size_t Global_Work_Size[] = {(size_t)n};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_powx<float> (const int n, const float* a, const float alpha, float* y);
+template void caffe_gpu_powx<double> (const int n, const double* a, const double alpha, double* y);
+
+template <typename Dtype>
+void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
+{
+    std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
+    ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data);
+    ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem);
+    ret|=clSetKernelArg(kernel,3,sizeof(Dtype),(void*)&scale_);
+    ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void DropoutForward<float>(const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
+template void DropoutForward<double>(const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
+
+template <typename Dtype>
+void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
+{
+    std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
+    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
+    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&MaskMem);
+    ret |= clSetKernelArg(kernel,3,sizeof(cl_int),  (void*)&threshold_);
+    ret |= clSetKernelArg(kernel,4,sizeof(Dtype),(void*)&scale_);
+    ret |= clSetKernelArg(kernel,5,sizeof(cl_mem),  (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void DropoutBackward<float>(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
+template void DropoutBackward<double>(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
+
+
+template <typename Dtype>
+void  BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data)
+{
+    std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
+    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&bottom_data);
+    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&top_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void  BNLLForward<float>(const int count, const float* bottom_data, float *top_data);
+template void  BNLLForward<double>(const int count, const double* bottom_data, double *top_data);
+
+template <typename Dtype>
+void  BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff)
+{
+    std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+    cl_int ret;
+    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
+    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
+    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&bottom_data);
+    ret |= clSetKernelArg(kernel,3,sizeof(cl_mem),  (void*)&bottom_diff);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)count};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void  BNLLBackward<float>(const int count, const float* top_diff, const float* bottom_data, float *bottom_diff);
+template void  BNLLBackward<double>(const int count, const double* top_diff, const double* bottom_data, double *bottom_diff);
+
+
+template <typename Dtype>
+void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int  concat_size,
+        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data)
+{
+    std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
+    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+    int k_forward = (forward == true)? 1 : 0;
+    cl_int ret;
+    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&nthreads);
+    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void*)&in_data);
+    ret |= clSetKernelArg(kernel, 2, sizeof(cl_int),  (void*)&k_forward);
+    ret |= clSetKernelArg(kernel, 3, sizeof(cl_int),  (void*)&num_concats);
+    ret |= clSetKernelArg(kernel, 4, sizeof(cl_int),  (void*)&concat_size);
+    ret |= clSetKernelArg(kernel, 5, sizeof(cl_int),  (void*)&top_concat_axis);
+    ret |= clSetKernelArg(kernel, 6, sizeof(cl_int),  (void*)&bottom_concat_axis); 
+    ret |= clSetKernelArg(kernel, 7, sizeof(cl_int),  (void*)&offset_concat_axis);
+    ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem),  (void*)&out_data);
+    OCL_CHECK(ret);
+
+    size_t Global_Work_Size[] = {(size_t)nthreads};
+    size_t Local_Work_Size[] = {256};
+    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void  Concat<float>(const int nthreads, const float* in_data, const bool forward, const int num_concats, const int  concat_size,
+        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, float *out_data);
+template void  Concat<double>(const int nthreads, const double* in_data, const bool forward, const int num_concats, const int  concat_size,
+        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, double *out_data);
+
+template <typename Dtype>
 void CLLBackward(const int count, const int channels,
 	const Dtype margin, const bool legacy_version, const Dtype alpha,
 	const Dtype* y, const Dtype* diff, const Dtype* dist_sq,

From 432dd92135ae79c4625ad9f1fbf0bf3de1379478 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 10 Sep 2015 00:01:30 +0800
Subject: [PATCH 071/124] Fix the bug that CPU mode cannot run

---
 src/caffe/common.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index a6ea3a57..22e9059b 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -91,6 +91,7 @@ void* Caffe::RNG::generator() {
 
 Caffe::Caffe()
 {
+        amdDevice.Init();
 	cl_int err = clblasSetup();
 	if (err != CL_SUCCESS) {
 		LOG(ERROR) << "clBLAS setup failed " << err;

From e45e90082b4db3624f2be38a73e36d156b14ddc2 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Thu, 10 Sep 2015 01:58:56 +0800
Subject: [PATCH 072/124] update Readme and License file

---
 LICENSE   |  6 ++++++
 README.md | 15 +++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/LICENSE b/LICENSE
index d69d16f5..ca91d911 100644
--- a/LICENSE
+++ b/LICENSE
@@ -42,3 +42,9 @@ CONTRIBUTION AGREEMENT
 By contributing to the BVLC/caffe repository through pull-request, comment,
 or otherwise, the contributor releases their content to the
 license and copyright terms herein.
+
+AMD license on the OpenCL parts
+
+AMD holds license for the OpenCL related code, kernels and optimizations. 
+AMD license is added to the file or part of the file that written by AMD.
+For details, please see license declaration for individual file.
diff --git a/README.md b/README.md
index ebec286d..8b47341f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,18 @@
+#OpenCL caffe
+
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research. The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework.
+
+OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
+
+#Design features
+-All layers ported to OpenCL
+-Passes unit test
+-Performance improvement by batched sgemm implementation for conv layer
+-User can choose optimal batch number depening on H/W and image size
+-Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users
+-Users can directly run DNN models: AlexNet, VGG 16 and VGG-19
+
+
 # Caffe
 
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.

From b14dac2d2f334074339c83571fbc44744889aef8 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Thu, 10 Sep 2015 01:58:56 +0800
Subject: [PATCH 073/124] update Readme and License file

---
 LICENSE   |  6 ++++++
 README.md | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/LICENSE b/LICENSE
index d69d16f5..ca91d911 100644
--- a/LICENSE
+++ b/LICENSE
@@ -42,3 +42,9 @@ CONTRIBUTION AGREEMENT
 By contributing to the BVLC/caffe repository through pull-request, comment,
 or otherwise, the contributor releases their content to the
 license and copyright terms herein.
+
+AMD license on the OpenCL parts
+
+AMD holds license for the OpenCL related code, kernels and optimizations. 
+AMD license is added to the file or part of the file that written by AMD.
+For details, please see license declaration for individual file.
diff --git a/README.md b/README.md
index ebec286d..a3fd8497 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,23 @@
+#OpenCL caffe
+
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research. The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework.
+
+OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
+
+#Design features
+-All layers ported to OpenCL
+
+-Passes unit test
+
+-Performance improvement by batched sgemm implementation for conv layer
+
+-User can choose optimal batch number depening on H/W, image size and minibatch size
+
+-Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users
+
+-Users can directly run DNN models: AlexNet, VGG 16 and VGG-19
+
+
 # Caffe
 
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.

From b5792c317898554cbbc0dab99d8810c8ac55838d Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 15:25:52 -0700
Subject: [PATCH 074/124] Update README.md

---
 README.md | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a3fd8497..1bc5d0c6 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 #OpenCL caffe
 
-This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research. The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework.
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance.
 
 OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
 
@@ -17,6 +17,26 @@ OpenCL is an open standard parallel programming language that is supported by mo
 
 -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19
 
+Note: More featurs will be added in the near future. And this OpenCL caffe only verifies on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to add if there is a need.
+
+#Performance
+
+We will keep updating the latest performance we could achieve in this section.
+
+Training speed (Model: AlexNet)
+-AMD W9100 (5.2TFLOPS), 255 images per second
+-AMD R9 Fury((5.2TFLOPS)), 231 images per second
+
+Recognition speed (Model: AlexNet)
+-AMD W9100 (5.2TFLOPS), 590 images per second
+-AMD R9 Fury((5.2TFLOPS)), 699 images per second
+
+#Wiki
+For more information on how to install, use or contribute to this code base, please visit our wiki page:
+https://github.com/amd/OpenCL-caffe/wiki
+
+#License and support
+Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
 
 # Caffe
 

From 947aa9a0328205d0ac62906ed9975c886ade5c86 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 15:27:15 -0700
Subject: [PATCH 075/124] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 1bc5d0c6..bf7b2dcd 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,12 @@ We will keep updating the latest performance we could achieve in this section.
 
 Training speed (Model: AlexNet)
 -AMD W9100 (5.2TFLOPS), 255 images per second
+
 -AMD R9 Fury((5.2TFLOPS)), 231 images per second
 
 Recognition speed (Model: AlexNet)
 -AMD W9100 (5.2TFLOPS), 590 images per second
+
 -AMD R9 Fury((5.2TFLOPS)), 699 images per second
 
 #Wiki

From 51872ffba96f2c196b53e9bae33f6a8c5225a8eb Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 15:27:55 -0700
Subject: [PATCH 076/124] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index bf7b2dcd..27a8f87f 100644
--- a/README.md
+++ b/README.md
@@ -24,11 +24,13 @@ Note: More featurs will be added in the near future. And this OpenCL caffe only
 We will keep updating the latest performance we could achieve in this section.
 
 Training speed (Model: AlexNet)
+
 -AMD W9100 (5.2TFLOPS), 255 images per second
 
 -AMD R9 Fury((5.2TFLOPS)), 231 images per second
 
 Recognition speed (Model: AlexNet)
+
 -AMD W9100 (5.2TFLOPS), 590 images per second
 
 -AMD R9 Fury((5.2TFLOPS)), 699 images per second

From af514ad3f8950ae941ba14fc4b926022c2104af8 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 15:28:29 -0700
Subject: [PATCH 077/124] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 27a8f87f..ef4ae50d 100644
--- a/README.md
+++ b/README.md
@@ -27,13 +27,13 @@ Training speed (Model: AlexNet)
 
 -AMD W9100 (5.2TFLOPS), 255 images per second
 
--AMD R9 Fury((5.2TFLOPS)), 231 images per second
+-AMD R9 Fury((7.2TFLOPS)), 231 images per second
 
 Recognition speed (Model: AlexNet)
 
 -AMD W9100 (5.2TFLOPS), 590 images per second
 
--AMD R9 Fury((5.2TFLOPS)), 699 images per second
+-AMD R9 Fury((7.2TFLOPS)), 699 images per second
 
 #Wiki
 For more information on how to install, use or contribute to this code base, please visit our wiki page:

From 49ecf7c6970252267b9331f2e77de7d5adaf5774 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 15:41:38 -0700
Subject: [PATCH 078/124] Update README.md

---
 README.md | 43 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 8b47341f..54884f21 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,46 @@
 #OpenCL caffe
 
-This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research. The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework.
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance.
 
 OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
 
 #Design features
--All layers ported to OpenCL
--Passes unit test
--Performance improvement by batched sgemm implementation for conv layer
--User can choose optimal batch number depening on H/W and image size
--Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users
--Users can directly run DNN models: AlexNet, VGG 16 and VGG-19
+  -All layers ported to OpenCL
 
+  -Passes unit test
+
+  -Performance improvement by batched sgemm implementation for conv layer
+
+  -User can choose optimal batch number depening on H/W, image size and minibatch size
+
+  -Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users
+
+  -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19
+
+Note: More featurs will be added in the near future. And this OpenCL caffe only verifies on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to add if there is a need.
+
+#Performance
+
+We will keep updating the latest performance we could achieve in this section.
+
+* Training speed (Model: AlexNet)
+
+    -AMD W9100 (5.2TFLOPS), 255 images per second
+
+    -AMD R9 Fury((7.2TFLOPS)), 231 images per second
+
+* Recognition speed (Model: AlexNet)
+
+    -AMD W9100 (5.2TFLOPS), 590 images per second
+
+    -AMD R9 Fury((7.2TFLOPS)), 699 images per second
+
+#Wiki
+For more information on how to install, use or contribute to this code base, please visit our wiki page:
+https://github.com/amd/OpenCL-caffe/wiki
+
+#License and support
+Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
 
 # Caffe
 

From dc1f82aee029e1864d596eaed3882830dd7aed0c Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 15:43:06 -0700
Subject: [PATCH 079/124] Update README.md

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 54884f21..5d2692a9 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,8 @@ https://github.com/amd/OpenCL-caffe/wiki
 #License and support
 Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
 
-# Caffe
+# Oroginal Caffe information
+## Caffe
 
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.
 It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors.

From 15e5dc500a1fbebb5d40acaecbec6c23b251feb5 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 15:45:01 -0700
Subject: [PATCH 080/124] Update README.md

---
 README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5d2692a9..c75d9e1d 100644
--- a/README.md
+++ b/README.md
@@ -6,13 +6,17 @@ OpenCL is an open standard parallel programming language that is supported by mo
 
 #Design features
   -All layers ported to OpenCL
-
-  -Passes unit test
+  
+  -Aligned with CAFFE’s latest code
 
   -Performance improvement by batched sgemm implementation for conv layer
 
   -User can choose optimal batch number depening on H/W, image size and minibatch size
 
+  -Passes unit test
+
+  -OpenCL 2.0, 1.2
+  
   -Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users
 
   -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19

From 4036485a4458f774b5c25dee1be3e02d5577d11d Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 15:45:41 -0700
Subject: [PATCH 081/124] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c75d9e1d..8fadd98f 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ https://github.com/amd/OpenCL-caffe/wiki
 #License and support
 Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
 
-# Oroginal Caffe information
+# Original Caffe information
 ## Caffe
 
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.

From 20b4a89f4895297a7eabe884230bbde2a7939707 Mon Sep 17 00:00:00 2001
From: Noplz <yuan.gao@noplz.name>
Date: Thu, 10 Sep 2015 11:16:12 +0800
Subject: [PATCH 082/124] Adjust the code style

---
 include/caffe/blob.hpp                        |   28 +-
 include/caffe/common_layers.hpp               |  214 ++-
 include/caffe/data_layers.hpp                 |  112 +-
 include/caffe/data_transformer.hpp            |    6 +-
 include/caffe/device.hpp                      |    9 +-
 include/caffe/filler.hpp                      |   72 +-
 include/caffe/internal_thread.hpp             |    3 +-
 include/caffe/layer.hpp                       |   75 +-
 include/caffe/layer_factory.hpp               |   14 +-
 include/caffe/loss_layers.hpp                 |  124 +-
 include/caffe/net.hpp                         |   20 +-
 include/caffe/neuron_layers.hpp               |  191 +-
 include/caffe/python_layer.hpp                |   14 +-
 include/caffe/solver.hpp                      |   30 +-
 include/caffe/syncedmem.hpp                   |   10 +-
 include/caffe/util/cudnn.hpp                  |   26 +-
 include/caffe/util/db_leveldb.hpp             |   11 +-
 include/caffe/util/db_lmdb.hpp                |   13 +-
 include/caffe/util/im2col.hpp                 |  106 +-
 include/caffe/util/insert_splits.hpp          |    8 +-
 include/caffe/util/io.hpp                     |   44 +-
 include/caffe/util/math_functions.hpp         |  198 +-
 include/caffe/util/mkl_alternate.hpp          |    8 +-
 include/caffe/util/ocl_util.hpp               |    4 +-
 include/caffe/util/ocl_wrapper.hpp            |  318 ++--
 include/caffe/util/rng.hpp                    |    6 +-
 include/caffe/util/upgrade_proto.hpp          |   10 +-
 include/caffe/vision_layers.hpp               |  304 ++--
 src/caffe/blob.cpp                            |  128 +-
 src/caffe/common.cpp                          |   14 +-
 src/caffe/data_transformer.cpp                |   92 +-
 src/caffe/device.cpp                          |  127 +-
 src/caffe/internal_thread.cpp                 |    2 +-
 src/caffe/layer_factory.cpp                   |   18 +-
 src/caffe/layers/absval_layer.cpp             |   22 +-
 src/caffe/layers/accuracy_layer.cpp           |   34 +-
 src/caffe/layers/argmax_layer.cpp             |   20 +-
 src/caffe/layers/base_conv_layer.cpp          |  252 +--
 src/caffe/layers/base_data_layer.cpp          |   52 +-
 src/caffe/layers/bnll_layer.cpp               |   26 +-
 src/caffe/layers/concat_layer.cpp             |   48 +-
 src/caffe/layers/contrastive_loss_layer.cpp   |  104 +-
 src/caffe/layers/conv_layer.cpp               |   66 +-
 src/caffe/layers/data_layer.cpp               |   14 +-
 src/caffe/layers/deconv_layer.cpp             |   34 +-
 src/caffe/layers/dropout_layer.cpp            |   43 +-
 src/caffe/layers/dummy_data_layer.cpp         |   46 +-
 src/caffe/layers/eltwise_layer.cpp            |   46 +-
 src/caffe/layers/euclidean_loss_layer.cpp     |   58 +-
 src/caffe/layers/exp_layer.cpp                |   24 +-
 src/caffe/layers/filter_layer.cpp             |   48 +-
 src/caffe/layers/flatten_layer.cpp            |   16 +-
 src/caffe/layers/hdf5_data_layer.cpp          |   47 +-
 src/caffe/layers/hdf5_output_layer.cpp        |   58 +-
 src/caffe/layers/hinge_loss_layer.cpp         |   12 +-
 src/caffe/layers/im2col_layer.cpp             |   68 +-
 src/caffe/layers/image_data_layer.cpp         |   28 +-
 src/caffe/layers/infogain_loss_layer.cpp      |   26 +-
 src/caffe/layers/inner_product_layer.cpp      |   80 +-
 src/caffe/layers/log_layer.cpp                |   28 +-
 src/caffe/layers/loss_layer.cpp               |   10 +-
 src/caffe/layers/lrn_layer.cpp                |  180 +-
 src/caffe/layers/memory_data_layer.cpp        |   32 +-
 .../multinomial_logistic_loss_layer.cpp       |   20 +-
 src/caffe/layers/mvn_layer.cpp                |  138 +-
 src/caffe/layers/neuron_layer.cpp             |    4 +-
 src/caffe/layers/pooling_layer.cpp            |  118 +-
 src/caffe/layers/power_layer.cpp              |   26 +-
 src/caffe/layers/prelu_layer.cpp              |   56 +-
 src/caffe/layers/reduction_layer.cpp          |   42 +-
 src/caffe/layers/relu_layer.cpp               |   24 +-
 src/caffe/layers/reshape_layer.cpp            |   32 +-
 .../sigmoid_cross_entropy_loss_layer.cpp      |   32 +-
 src/caffe/layers/sigmoid_layer.cpp            |   20 +-
 src/caffe/layers/silence_layer.cpp            |   16 +-
 src/caffe/layers/slice_layer.cpp              |   46 +-
 src/caffe/layers/softmax_layer.cpp            |   53 +-
 src/caffe/layers/softmax_loss_layer.cpp       |   52 +-
 src/caffe/layers/split_layer.cpp              |   28 +-
 src/caffe/layers/spp_layer.cpp                |   50 +-
 src/caffe/layers/tanh_layer.cpp               |   18 +-
 src/caffe/layers/threshold_layer.cpp          |   12 +-
 src/caffe/layers/window_data_layer.cpp        |  101 +-
 src/caffe/net.cpp                             |  274 +--
 src/caffe/ocl/bnll_layer.cl                   |    6 +-
 src/caffe/ocl/concat_layer.cl                 |   48 +-
 src/caffe/ocl/contrastive_loss_layer.cl       |   18 +-
 src/caffe/ocl/eltwise_layer.cl                |   18 +-
 src/caffe/ocl/im2col.cl                       |   48 +-
 src/caffe/ocl/lrn_layer.cl                    |    2 +-
 src/caffe/ocl/pooling_layer.cl                |   42 +-
 src/caffe/ocl/prelu_layer.cl                  |    2 +-
 src/caffe/ocl/random.cl                       |   26 +-
 src/caffe/ocl/softmax_layer.cl                |   42 +-
 src/caffe/ocl/softmaxwithloss_layer.cl        |   44 +-
 src/caffe/solver.cpp                          |  274 +--
 src/caffe/syncedmem.cpp                       |   29 +-
 src/caffe/util/benchmark.cpp                  |   11 +-
 src/caffe/util/db_leveldb.cpp                 |    2 +-
 src/caffe/util/im2col.cpp                     |  209 +--
 src/caffe/util/im2col.cu                      |   76 +-
 src/caffe/util/insert_splits.cpp              |   26 +-
 src/caffe/util/io.cpp                         |   52 +-
 src/caffe/util/math_functions.cpp             |  596 +++---
 src/caffe/util/math_functions.cu              |  148 +-
 src/caffe/util/ocl_util.cpp                   |   26 +-
 src/caffe/util/ocl_wrapper.cpp                | 1611 +++++++++--------
 src/caffe/util/upgrade_proto.cpp              |  224 +--
 108 files changed, 4399 insertions(+), 4224 deletions(-)

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index e55ce8e6..26a75558 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -21,21 +21,22 @@ namespace caffe {
  *
  * TODO(dox): more thorough description.
  */
-template<typename Dtype>
+template <typename Dtype>
 class Blob {
 	public:
 		Blob()
-			: data_(), diff_(), count_(0), capacity_(0) {
+		:
+				data_(), diff_(), count_(0), capacity_(0) {
 		}
 
 		/// @brief Deprecated; use <code>Blob(const vector<int>& shape)</code>.
 		explicit Blob(const int num, const int channels, const int height,
-			const int width);
+				const int width);
 		explicit Blob(const vector<int>& shape);
 
 		/// @brief Deprecated; use <code>Reshape(const vector<int>& shape)</code>.
 		void Reshape(const int num, const int channels, const int height,
-			const int width);
+				const int width);
 		/**
 		 * @brief Change the dimensions of the blob, allocating new memory if
 		 *        necessary.
@@ -125,11 +126,11 @@ class Blob {
 		 */
 		inline int CanonicalAxisIndex(int axis_index) const {
 			CHECK_GE(axis_index, -num_axes())
-				<< "axis " << axis_index << " out of range for " << num_axes()
-				<< "-D Blob with shape " << shape_string();
+					<< "axis " << axis_index << " out of range for " << num_axes()
+					<< "-D Blob with shape " << shape_string();
 			CHECK_LT(axis_index, num_axes())
-				<< "axis " << axis_index << " out of range for " << num_axes()
-				<< "-D Blob with shape " << shape_string();
+					<< "axis " << axis_index << " out of range for " << num_axes()
+					<< "-D Blob with shape " << shape_string();
 			if (axis_index < 0) {
 				return axis_index + num_axes();
 			}
@@ -154,7 +155,7 @@ class Blob {
 		}
 		inline int LegacyShape(int index) const {
 			CHECK_LE(num_axes(), 4)
-				<< "Cannot use legacy accessors on Blobs with > 4 axes.";
+					<< "Cannot use legacy accessors on Blobs with > 4 axes.";
 			CHECK_LT(index, 4);
 			CHECK_GE(index, -4);
 			if (index >= num_axes() || index < -num_axes()) {
@@ -167,7 +168,7 @@ class Blob {
 		}
 
 		inline int offset(const int n, const int c = 0, const int h = 0,
-			const int w = 0) const {
+				const int w = 0) const {
 			CHECK_GE(n, 0);
 			CHECK_LE(n, num());
 			CHECK_GE(channels(), 0);
@@ -202,15 +203,15 @@ class Blob {
 		 *        shape if necessary
 		 */
 		void CopyFrom(const Blob<Dtype>& source, bool copy_diff = false,
-			bool reshape = false);
+				bool reshape = false);
 
 		inline Dtype data_at(const int n, const int c, const int h,
-			const int w) const {
+				const int w) const {
 			return cpu_data()[offset(n, c, h, w)];
 		}
 
 		inline Dtype diff_at(const int n, const int c, const int h,
-			const int w) const {
+				const int w) const {
 			return cpu_diff()[offset(n, c, h, w)];
 		}
 
@@ -282,7 +283,6 @@ class Blob {
 			data_->set_data_layer();
 			diff_->set_data_layer();
 		}
-		;
 
 		bool ShapeEquals(const BlobProto& other);
 
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index 879e84e7..d892b5b5 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -25,7 +25,7 @@ namespace caffe {
  *
  * NOTE: does not implement Backwards operation.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ArgMaxLayer: public Layer<Dtype> {
 	public:
 		/**
@@ -37,12 +37,13 @@ class ArgMaxLayer: public Layer<Dtype> {
 		 *     if set, output a vector of pairs (max_ind, max_val) for each image.
 		 */
 		explicit ArgMaxLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "ArgMax";
@@ -67,10 +68,11 @@ class ArgMaxLayer: public Layer<Dtype> {
 		 *      @f$ (for @f$ K = 1 @f$).
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		/// @brief Not implemented (non-differentiable function)
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 			NOT_IMPLEMENTED;
 		}
 		bool out_max_val_;
@@ -81,16 +83,17 @@ class ArgMaxLayer: public Layer<Dtype> {
  * @brief Takes at least two Blob%s and concatenates them along either the num
  *        or channel dimension, outputting the result.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ConcatLayer: public Layer<Dtype> {
 	public:
 		explicit ConcatLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Concat";
@@ -120,9 +123,9 @@ class ConcatLayer: public Layer<Dtype> {
 		 *      @f$
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the concatenate inputs.
@@ -147,9 +150,9 @@ class ConcatLayer: public Layer<Dtype> {
 		 *        @f$
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		int count_;
 		int num_concats_;
@@ -163,16 +166,17 @@ class ConcatLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class EltwiseLayer: public Layer<Dtype> {
 	public:
 		explicit EltwiseLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Eltwise";
@@ -186,13 +190,13 @@ class EltwiseLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		EltwiseParameter_EltwiseOp op_;
 		vector<Dtype> coeffs_;
@@ -207,16 +211,17 @@ class EltwiseLayer: public Layer<Dtype> {
  * the corresponding item has to be filtered, non-zero means that corresponding
  * item needs to stay).
  */
-template<typename Dtype>
+template <typename Dtype>
 class FilterLayer: public Layer<Dtype> {
 	public:
 		explicit FilterLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Filter";
@@ -249,9 +254,9 @@ class FilterLayer: public Layer<Dtype> {
 		 *        that haven't been filtered
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the forwarded inputs.
@@ -263,9 +268,9 @@ class FilterLayer: public Layer<Dtype> {
 		 *        gradient is copied
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		bool first_reshape_;
 		vector<int> indices_to_forward_;
@@ -281,14 +286,15 @@ class FilterLayer: public Layer<Dtype> {
  * and in Backward, the diff pointer of the bottom Blob to that of the top Blob
  * (see Blob::ShareDiff).
  */
-template<typename Dtype>
+template <typename Dtype>
 class FlattenLayer: public Layer<Dtype> {
 	public:
 		explicit FlattenLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Flatten";
@@ -310,7 +316,7 @@ class FlattenLayer: public Layer<Dtype> {
 		 *      the outputs -- i.e., the (virtually) copied, flattened inputs
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the concatenate inputs.
@@ -322,7 +328,7 @@ class FlattenLayer: public Layer<Dtype> {
 		 *        gradient is (virtually) copied
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -331,16 +337,17 @@ class FlattenLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class InnerProductLayer: public Layer<Dtype> {
 	public:
 		explicit InnerProductLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "InnerProduct";
@@ -354,13 +361,13 @@ class InnerProductLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		int M_;
 		int K_;
@@ -374,14 +381,15 @@ class InnerProductLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class MVNLayer: public Layer<Dtype> {
 	public:
 		explicit MVNLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "MVN";
@@ -395,13 +403,13 @@ class MVNLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		Blob<Dtype> mean_, variance_, temp_;
 
@@ -416,16 +424,17 @@ class MVNLayer: public Layer<Dtype> {
  * Note: similarly to FlattenLayer, this layer does not change the input values
  * (see FlattenLayer, Blob::ShareData and Blob::ShareDiff).
  */
-template<typename Dtype>
+template <typename Dtype>
 class ReshapeLayer: public Layer<Dtype> {
 	public:
 		explicit ReshapeLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Reshape";
@@ -439,16 +448,18 @@ class ReshapeLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 		}
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 		}
 
 		/// @brief vector of axes indices whose dimensions we'll copy from the bottom
@@ -466,16 +477,17 @@ class ReshapeLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ReductionLayer: public Layer<Dtype> {
 	public:
 		explicit ReductionLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Reduction";
@@ -489,13 +501,13 @@ class ReductionLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		/// @brief the reduction operation performed by the layer
 		ReductionParameter_ReductionOp op_;
@@ -515,14 +527,15 @@ class ReductionLayer: public Layer<Dtype> {
  * @brief Ignores bottom blobs while producing no top blobs. (This is useful
  *        to suppress outputs during testing.)
  */
-template<typename Dtype>
+template <typename Dtype>
 class SilenceLayer: public Layer<Dtype> {
 	public:
 		explicit SilenceLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 
 		virtual inline const char* type() const {
@@ -537,16 +550,16 @@ class SilenceLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 		// We can't define Forward_gpu here, since STUB_GPU will provide
 		// its own definition for CPU_ONLY mode.
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -554,15 +567,16 @@ class SilenceLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class SoftmaxLayer: public Layer<Dtype> {
 	public:
 		explicit SoftmaxLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		~SoftmaxLayer();
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Softmax";
@@ -576,13 +590,13 @@ class SoftmaxLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		int outer_num_;
 		int inner_num_;
@@ -604,16 +618,16 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
 	explicit CuDNNSoftmaxLayer(const LayerParameter& param)
 	: SoftmaxLayer<Dtype>(param), handles_setup_(false) {}
 	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual ~CuDNNSoftmaxLayer();
 
 	protected:
 	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 	bool handles_setup_;
 	cudnnHandle_t handle_;
@@ -628,14 +642,15 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class SplitLayer: public Layer<Dtype> {
 	public:
 		explicit SplitLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Split";
@@ -649,13 +664,13 @@ class SplitLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		int count_;
 		cl_kernel gpu_add_kernel;
@@ -667,16 +682,17 @@ class SplitLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class SliceLayer: public Layer<Dtype> {
 	public:
 		explicit SliceLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Slice";
@@ -690,13 +706,13 @@ class SliceLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		int count_;
 		int num_slices_;
diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp
index 442e4009..e93c4fe8 100644
--- a/include/caffe/data_layers.hpp
+++ b/include/caffe/data_layers.hpp
@@ -24,7 +24,7 @@ namespace caffe {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class BaseDataLayer: public Layer<Dtype> {
 	public:
 		explicit BaseDataLayer(const LayerParameter& param);
@@ -32,20 +32,22 @@ class BaseDataLayer: public Layer<Dtype> {
 		// DataLayerSetUp to do special data layer setup for individual layer types.
 		// This method may not be overridden except by the BasePrefetchingDataLayer.
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 		// Data layers have no bottoms, so reshaping is trivial.
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 		}
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 		}
 
 	protected:
@@ -54,23 +56,24 @@ class BaseDataLayer: public Layer<Dtype> {
 		bool output_labels_;
 };
 
-template<typename Dtype>
+template <typename Dtype>
 class BasePrefetchingDataLayer:
-	public BaseDataLayer<Dtype>, public InternalThread {
+		public BaseDataLayer<Dtype>, public InternalThread {
 	public:
 		explicit BasePrefetchingDataLayer(const LayerParameter& param)
-			: BaseDataLayer<Dtype>(param) {
+		:
+				BaseDataLayer<Dtype>(param) {
 		}
 		// LayerSetUp: implements common data layer setup functionality, and calls
 		// DataLayerSetUp to do special data layer setup for individual layer types.
 		// This method may not be overridden.
 		void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual void CreatePrefetchThread();
 		virtual void JoinPrefetchThread();
@@ -84,15 +87,16 @@ class BasePrefetchingDataLayer:
 		Blob<Dtype> transformed_data_;
 };
 
-template<typename Dtype>
+template <typename Dtype>
 class DataLayer: public BasePrefetchingDataLayer<Dtype> {
 	public:
 		explicit DataLayer(const LayerParameter& param)
-			: BasePrefetchingDataLayer<Dtype>(param) {
+		:
+				BasePrefetchingDataLayer<Dtype>(param) {
 		}
 		virtual ~DataLayer();
 		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Data";
@@ -119,17 +123,18 @@ class DataLayer: public BasePrefetchingDataLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class DummyDataLayer: public Layer<Dtype> {
 	public:
 		explicit DummyDataLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		// Data layers have no bottoms, so reshaping is trivial.
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 
 		virtual inline const char* type() const {
@@ -144,12 +149,14 @@ class DummyDataLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 		}
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 		}
 
 		vector<shared_ptr<Filler<Dtype> > > fillers_;
@@ -161,18 +168,19 @@ class DummyDataLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class HDF5DataLayer: public Layer<Dtype> {
 	public:
 		explicit HDF5DataLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual ~HDF5DataLayer();
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		// Data layers have no bottoms, so reshaping is trivial.
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 
 		virtual inline const char* type() const {
@@ -187,14 +195,16 @@ class HDF5DataLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 		}
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 		}
 		virtual void LoadHDF5FileData(const char* filename);
 
@@ -212,18 +222,19 @@ class HDF5DataLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class HDF5OutputLayer: public Layer<Dtype> {
 	public:
 		explicit HDF5OutputLayer(const LayerParameter& param)
-			: Layer<Dtype>(param), file_opened_(false) {
+		:
+				Layer<Dtype>(param), file_opened_(false) {
 		}
 		virtual ~HDF5OutputLayer();
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		// Data layers have no bottoms, so reshaping is trivial.
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 
 		virtual inline const char* type() const {
@@ -243,13 +254,13 @@ class HDF5OutputLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void SaveBlobs();
 
 		bool file_opened_;
@@ -264,15 +275,16 @@ class HDF5OutputLayer: public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ImageDataLayer: public BasePrefetchingDataLayer<Dtype> {
 	public:
 		explicit ImageDataLayer(const LayerParameter& param)
-			: BasePrefetchingDataLayer<Dtype>(param) {
+		:
+				BasePrefetchingDataLayer<Dtype>(param) {
 		}
 		virtual ~ImageDataLayer();
 		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "ImageData";
@@ -298,14 +310,15 @@ class ImageDataLayer: public BasePrefetchingDataLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class MemoryDataLayer: public BaseDataLayer<Dtype> {
 	public:
 		explicit MemoryDataLayer(const LayerParameter& param)
-			: BaseDataLayer<Dtype>(param), has_new_data_(false) {
+		:
+				BaseDataLayer<Dtype>(param), has_new_data_(false) {
 		}
 		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "MemoryData";
@@ -319,7 +332,7 @@ class MemoryDataLayer: public BaseDataLayer<Dtype> {
 
 		virtual void AddDatumVector(const vector<Datum>& datum_vector);
 		virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
-			const vector<int>& labels);
+				const vector<int>& labels);
 
 		// Reset should accept const pointers, but can't, because the memory
 		//  will be given to Blob, which is mutable
@@ -341,7 +354,7 @@ class MemoryDataLayer: public BaseDataLayer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		int batch_size_, channels_, height_, width_, size_;
 		Dtype* data_;
@@ -359,15 +372,16 @@ class MemoryDataLayer: public BaseDataLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class WindowDataLayer: public BasePrefetchingDataLayer<Dtype> {
 	public:
 		explicit WindowDataLayer(const LayerParameter& param)
-			: BasePrefetchingDataLayer<Dtype>(param) {
+		:
+				BasePrefetchingDataLayer<Dtype>(param) {
 		}
 		virtual ~WindowDataLayer();
 		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "WindowData";
diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp
index 94c32366..c283a244 100644
--- a/include/caffe/data_transformer.hpp
+++ b/include/caffe/data_transformer.hpp
@@ -13,7 +13,7 @@ namespace caffe {
  * @brief Applies common transformations to the input data, such as
  * scaling, mirroring, substracting the image mean...
  */
-template<typename Dtype>
+template <typename Dtype>
 class DataTransformer {
 	public:
 		explicit DataTransformer(const TransformationParameter& param, Phase phase);
@@ -49,7 +49,7 @@ class DataTransformer {
 		 *    set_cpu_data() is used. See memory_layer.cpp for an example.
 		 */
 		void Transform(const vector<Datum> & datum_vector,
-			Blob<Dtype>* transformed_blob);
+				Blob<Dtype>* transformed_blob);
 
 		/**
 		 * @brief Applies the transformation defined in the data layer's
@@ -62,7 +62,7 @@ class DataTransformer {
 		 *    set_cpu_data() is used. See memory_layer.cpp for an example.
 		 */
 		void Transform(const vector<cv::Mat> & mat_vector,
-			Blob<Dtype>* transformed_blob);
+				Blob<Dtype>* transformed_blob);
 
 		/**
 		 * @brief Applies the transformation defined in the data layer's
diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index c6cefedc..2d71b333 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -35,7 +35,8 @@ namespace caffe {
 class Device {
 	public:
 		Device()
-			: numPlatforms(0), numDevices(0), device_id(INT_MIN) {
+		:
+				numPlatforms(0), numDevices(0), device_id(INT_MIN) {
 		}
 		~Device();
 		cl_uint numPlatforms;
@@ -69,10 +70,10 @@ class Device {
 		;
 		void BuildProgram(std::string kernel_dir);
 
-		template<typename T>
+		template <typename T>
 		void DisplayDeviceInfo(cl_device_id id, cl_device_info name,
-			std::string str);
-		template<typename T>
+				std::string str);
+		template <typename T>
 		void appendBitfield(T info, T value, std::string name, std::string &str);
 
 		cl_kernel GetKernel(std::string kernel_name);
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 6c47d7aa..c431dc94 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -16,11 +16,12 @@
 namespace caffe {
 
 /// @brief Fills a Blob with constant or randomly-generated data.
-template<typename Dtype>
+template <typename Dtype>
 class Filler {
 	public:
 		explicit Filler(const FillerParameter& param)
-			: filler_param_(param) {
+		:
+				filler_param_(param) {
 		}
 		virtual ~Filler() {
 		}
@@ -31,11 +32,12 @@ class Filler {
 // class Filler
 
 /// @brief Fills a Blob with constant values @f$ x = 0 @f$.
-template<typename Dtype>
+template <typename Dtype>
 class ConstantFiller: public Filler<Dtype> {
 	public:
 		explicit ConstantFiller(const FillerParameter& param)
-			: Filler<Dtype>(param) {
+		:
+				Filler<Dtype>(param) {
 		}
 		virtual void Fill(Blob<Dtype>* blob) {
 			Dtype* data = blob->mutable_cpu_data();
@@ -46,39 +48,41 @@ class ConstantFiller: public Filler<Dtype> {
 				data[i] = value;
 			}
 			CHECK_EQ(this->filler_param_.sparse(), -1)
-				<< "Sparsity not supported by this Filler.";
+					<< "Sparsity not supported by this Filler.";
 		}
 };
 
 /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$.
-template<typename Dtype>
+template <typename Dtype>
 class UniformFiller: public Filler<Dtype> {
 	public:
 		explicit UniformFiller(const FillerParameter& param)
-			: Filler<Dtype>(param) {
+		:
+				Filler<Dtype>(param) {
 		}
 		virtual void Fill(Blob<Dtype>* blob) {
 			CHECK(blob->count());
 			caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
-				Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
+					Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
 			CHECK_EQ(this->filler_param_.sparse(), -1)
-				<< "Sparsity not supported by this Filler.";
+					<< "Sparsity not supported by this Filler.";
 		}
 };
 
 /// @brief Fills a Blob with Gaussian-distributed values @f$ x = a @f$.
-template<typename Dtype>
+template <typename Dtype>
 class GaussianFiller: public Filler<Dtype> {
 	public:
 		explicit GaussianFiller(const FillerParameter& param)
-			: Filler<Dtype>(param) {
+		:
+				Filler<Dtype>(param) {
 		}
 		virtual void Fill(Blob<Dtype>* blob) {
 			Dtype* data = blob->mutable_cpu_data();
 			CHECK(blob->count());
 			caffe_rng_gaussian<Dtype>(blob->count(),
-				Dtype(this->filler_param_.mean()),
-				Dtype(this->filler_param_.std()), blob->mutable_cpu_data());
+					Dtype(this->filler_param_.mean()),
+					Dtype(this->filler_param_.std()), blob->mutable_cpu_data());
 			int sparse = this->filler_param_.sparse();
 			CHECK_GE(sparse, -1);
 			if (sparse >= 0) {
@@ -105,11 +109,12 @@ class GaussianFiller: public Filler<Dtype> {
 /** @brief Fills a Blob with values @f$ x \in [0, 1] @f$
  *         such that @f$ \forall i \sum_j x_{ij} = 1 @f$.
  */
-template<typename Dtype>
+template <typename Dtype>
 class PositiveUnitballFiller: public Filler<Dtype> {
 	public:
 		explicit PositiveUnitballFiller(const FillerParameter& param)
-			: Filler<Dtype>(param) {
+		:
+				Filler<Dtype>(param) {
 		}
 		virtual void Fill(Blob<Dtype>* blob) {
 			Dtype* data = blob->mutable_cpu_data();
@@ -129,7 +134,7 @@ class PositiveUnitballFiller: public Filler<Dtype> {
 				}
 			}
 			CHECK_EQ(this->filler_param_.sparse(), -1)
-				<< "Sparsity not supported by this Filler.";
+					<< "Sparsity not supported by this Filler.";
 		}
 };
 
@@ -149,11 +154,12 @@ class PositiveUnitballFiller: public Filler<Dtype> {
  *
  * TODO(dox): make notation in above comment consistent with rest & use LaTeX.
  */
-template<typename Dtype>
+template <typename Dtype>
 class XavierFiller: public Filler<Dtype> {
 	public:
 		explicit XavierFiller(const FillerParameter& param)
-			: Filler<Dtype>(param) {
+		:
+				Filler<Dtype>(param) {
 		}
 		virtual void Fill(Blob<Dtype>* blob) {
 			CHECK(blob->count());
@@ -161,17 +167,17 @@ class XavierFiller: public Filler<Dtype> {
 			int fan_out = blob->count() / blob->channels();
 			Dtype n = fan_in;  // default to fan_in
 			if (this->filler_param_.variance_norm() ==
-				FillerParameter_VarianceNorm_AVERAGE) {
+					FillerParameter_VarianceNorm_AVERAGE) {
 				n = (fan_in + fan_out) / Dtype(2);
 			} else if (this->filler_param_.variance_norm() ==
-				FillerParameter_VarianceNorm_FAN_OUT) {
+					FillerParameter_VarianceNorm_FAN_OUT) {
 				n = fan_out;
 			}
 			Dtype scale = sqrt(Dtype(3) / n);
 			caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
-				blob->mutable_cpu_data());
+					blob->mutable_cpu_data());
 			CHECK_EQ(this->filler_param_.sparse(), -1)
-				<< "Sparsity not supported by this Filler.";
+					<< "Sparsity not supported by this Filler.";
 		}
 };
 
@@ -192,11 +198,12 @@ class XavierFiller: public Filler<Dtype> {
  * a, b, c) where a * b * c = fan_in and num * b * c = fan_out. Note that this
  * is currently not the case for inner product layers.
  */
-template<typename Dtype>
+template <typename Dtype>
 class MSRAFiller: public Filler<Dtype> {
 	public:
 		explicit MSRAFiller(const FillerParameter& param)
-			: Filler<Dtype>(param) {
+		:
+				Filler<Dtype>(param) {
 		}
 		virtual void Fill(Blob<Dtype>* blob) {
 			CHECK(blob->count());
@@ -204,17 +211,17 @@ class MSRAFiller: public Filler<Dtype> {
 			int fan_out = blob->count() / blob->channels();
 			Dtype n = fan_in;  // default to fan_in
 			if (this->filler_param_.variance_norm() ==
-				FillerParameter_VarianceNorm_AVERAGE) {
+					FillerParameter_VarianceNorm_AVERAGE) {
 				n = (fan_in + fan_out) / Dtype(2);
 			} else if (this->filler_param_.variance_norm() ==
-				FillerParameter_VarianceNorm_FAN_OUT) {
+					FillerParameter_VarianceNorm_FAN_OUT) {
 				n = fan_out;
 			}
 			Dtype std = sqrt(Dtype(2) / n);
 			caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
-				blob->mutable_cpu_data());
+					blob->mutable_cpu_data());
 			CHECK_EQ(this->filler_param_.sparse(), -1)
-				<< "Sparsity not supported by this Filler.";
+					<< "Sparsity not supported by this Filler.";
 		}
 };
 
@@ -251,11 +258,12 @@ class MSRAFiller: public Filler<Dtype> {
  out = skimage.transform.rescale(img, factor, mode='constant', cval=0)
  \endcode
  */
-template<typename Dtype>
+template <typename Dtype>
 class BilinearFiller: public Filler<Dtype> {
 	public:
 		explicit BilinearFiller(const FillerParameter& param)
-			: Filler<Dtype>(param) {
+		:
+				Filler<Dtype>(param) {
 		}
 		virtual void Fill(Blob<Dtype>* blob) {
 			CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
@@ -269,7 +277,7 @@ class BilinearFiller: public Filler<Dtype> {
 				data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
 			}
 			CHECK_EQ(this->filler_param_.sparse(), -1)
-				<< "Sparsity not supported by this Filler.";
+					<< "Sparsity not supported by this Filler.";
 		}
 };
 
@@ -279,7 +287,7 @@ class BilinearFiller: public Filler<Dtype> {
  * Ideally this would be replaced by a factory pattern, but we will leave it
  * this way for now.
  */
-template<typename Dtype>
+template <typename Dtype>
 Filler<Dtype>* GetFiller(const FillerParameter& param) {
 	const std::string& type = param.type();
 	if (type == "constant") {
diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp
index 2df1806e..677deea4 100644
--- a/include/caffe/internal_thread.hpp
+++ b/include/caffe/internal_thread.hpp
@@ -21,7 +21,8 @@ namespace caffe {
 class InternalThread {
 	public:
 		InternalThread()
-			: thread_() {
+		:
+				thread_() {
 		}
 		virtual ~InternalThread();
 
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index b01ea959..5651e814 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -23,7 +23,7 @@ namespace caffe {
  * gradients with respect to their input Blob%s, given the error gradients with
  * their output Blob%s.
  */
-template<typename Dtype>
+template <typename Dtype>
 class Layer {
 	public:
 		/**
@@ -32,7 +32,8 @@ class Layer {
 		 * layer.
 		 */
 		explicit Layer(const LayerParameter& param)
-			: layer_param_(param) {
+		:
+				layer_param_(param) {
 			// Set phase and copy blobs (if there are any).
 			phase_ = param.phase();
 			if (layer_param_.blobs_size() > 0) {
@@ -60,7 +61,7 @@ class Layer {
 		 * This method may not be overridden.
 		 */
 		void SetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 			CheckBlobCounts(bottom, top);
 			LayerSetUp(bottom, top);
 			Reshape(bottom, top);
@@ -84,7 +85,7 @@ class Layer {
 		 * adjust the top blob sizes.
 		 */
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 		}
 
 		/**
@@ -100,7 +101,7 @@ class Layer {
 		 * accomodate the bottom blobs.
 		 */
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) = 0;
+				const vector<Blob<Dtype>*>& top) = 0;
 
 		/**
 		 * @brief Given the bottom blobs, compute the top blobs and the loss.
@@ -120,7 +121,7 @@ class Layer {
 		 * Your layer should implement Forward_cpu and (optionally) Forward_gpu.
 		 */
 		inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Given the top blob error gradients, compute the bottom blob error
@@ -144,8 +145,8 @@ class Layer {
 		 * Your layer should implement Backward_cpu and (optionally) Backward_gpu.
 		 */
 		inline void Backward(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down,
-			const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom);
 
 		/**
 		 * @brief Returns the vector of learnable parameter blobs.
@@ -294,7 +295,7 @@ class Layer {
 		 */
 		inline bool param_propagate_down(const int param_id) {
 			return
-				(param_propagate_down_.size() > param_id) ?
+			(param_propagate_down_.size() > param_id) ?
 					param_propagate_down_[param_id] : false;
 		}
 		/**
@@ -324,13 +325,13 @@ class Layer {
 
 		/** @brief Using the CPU device, compute the layer output. */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) = 0;
+				const vector<Blob<Dtype>*>& top) = 0;
 		/**
 		 * @brief Using the GPU device, compute the layer output.
 		 *        Fall back to Forward_cpu() if unavailable.
 		 */
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 			// LOG(WARNING) << "Using CPU code as backup.";
 			return Forward_cpu(bottom, top);
 		}
@@ -340,16 +341,16 @@ class Layer {
 		 *        for the bottom blobs if propagate_down is true.
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down,
-			const vector<Blob<Dtype>*>& bottom) = 0;
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) = 0;
 		/**
 		 * @brief Using the GPU device, compute the gradients for any parameters and
 		 *        for the bottom blobs if propagate_down is true.
 		 *        Fall back to Backward_cpu() if unavailable.
 		 */
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down,
-			const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 			// LOG(WARNING) << "Using CPU code as backup.";
 			Backward_cpu(top, propagate_down, bottom);
 		}
@@ -360,41 +361,41 @@ class Layer {
 		 * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions.
 		 */
 		virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 			if (ExactNumBottomBlobs() >= 0) {
 				CHECK_EQ(ExactNumBottomBlobs(), bottom.size())
-					<< type() << " Layer takes " << ExactNumBottomBlobs()
-					<< " bottom blob(s) as input.";
+						<< type() << " Layer takes " << ExactNumBottomBlobs()
+						<< " bottom blob(s) as input.";
 			}
 			if (MinBottomBlobs() >= 0) {
 				CHECK_LE(MinBottomBlobs(), bottom.size())
-					<< type() << " Layer takes at least " << MinBottomBlobs()
-					<< " bottom blob(s) as input.";
+						<< type() << " Layer takes at least " << MinBottomBlobs()
+						<< " bottom blob(s) as input.";
 			}
 			if (MaxBottomBlobs() >= 0) {
 				CHECK_GE(MaxBottomBlobs(), bottom.size())
-					<< type() << " Layer takes at most " << MaxBottomBlobs()
-					<< " bottom blob(s) as input.";
+						<< type() << " Layer takes at most " << MaxBottomBlobs()
+						<< " bottom blob(s) as input.";
 			}
 			if (ExactNumTopBlobs() >= 0) {
 				CHECK_EQ(ExactNumTopBlobs(), top.size())
-					<< type() << " Layer produces " << ExactNumTopBlobs()
-					<< " top blob(s) as output.";
+						<< type() << " Layer produces " << ExactNumTopBlobs()
+						<< " top blob(s) as output.";
 			}
 			if (MinTopBlobs() >= 0) {
 				CHECK_LE(MinTopBlobs(), top.size())
-					<< type() << " Layer produces at least " << MinTopBlobs()
-					<< " top blob(s) as output.";
+						<< type() << " Layer produces at least " << MinTopBlobs()
+						<< " top blob(s) as output.";
 			}
 			if (MaxTopBlobs() >= 0) {
 				CHECK_GE(MaxTopBlobs(), top.size())
-					<< type() << " Layer produces at most " << MaxTopBlobs()
-					<< " top blob(s) as output.";
+						<< type() << " Layer produces at most " << MaxTopBlobs()
+						<< " top blob(s) as output.";
 			}
 			if (EqualNumBottomTopBlobs()) {
 				CHECK_EQ(bottom.size(), top.size())
-					<< type() << " Layer produces one top blob as output for each "
-					<< "bottom blob input.";
+						<< type() << " Layer produces one top blob as output for each "
+						<< "bottom blob input.";
 			}
 		}
 
@@ -406,7 +407,7 @@ class Layer {
 			const int num_loss_weights = layer_param_.loss_weight_size();
 			if (num_loss_weights) {
 				CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
-					"unspecified or specified once per top blob.";
+						"unspecified or specified once per top blob.";
 				for (int top_id = 0; top_id < top.size(); ++top_id) {
 					const Dtype loss_weight = layer_param_.loss_weight(top_id);
 					if (loss_weight == Dtype(0)) {
@@ -427,9 +428,9 @@ class Layer {
 // Forward and backward wrappers. You should implement the cpu and
 // gpu specific implementations instead, and should not change these
 // functions.
-template<typename Dtype>
+template <typename Dtype>
 inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	Dtype loss = 0;
 	Reshape(bottom, top);
 	switch (Caffe::mode()) {
@@ -467,10 +468,10 @@ inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
 	return loss;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	switch (Caffe::mode()) {
 		case Caffe::CPU:
 			Backward_cpu(top, propagate_down, bottom);
@@ -484,7 +485,7 @@ inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
 }
 
 // Serialize LayerParameter to protocol buffer
-template<typename Dtype>
+template <typename Dtype>
 void Layer<Dtype>::ToProto(LayerParameter* param, bool write_diff) {
 	param->Clear();
 	param->CopyFrom(layer_param_);
diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp
index e679ae6a..b64b9eb2 100644
--- a/include/caffe/layer_factory.hpp
+++ b/include/caffe/layer_factory.hpp
@@ -47,10 +47,10 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 class Layer;
 
-template<typename Dtype>
+template <typename Dtype>
 class LayerRegistry {
 	public:
 		typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
@@ -65,7 +65,7 @@ class LayerRegistry {
 		static void AddCreator(const string& type, Creator creator) {
 			CreatorRegistry& registry = Registry();
 			CHECK_EQ(registry.count(type), 0)
-				<< "Layer type " << type << " already registered.";
+					<< "Layer type " << type << " already registered.";
 			registry[type] = creator;
 		}
 
@@ -75,7 +75,7 @@ class LayerRegistry {
 			const string& type = param.type();
 			CreatorRegistry& registry = Registry();
 			CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
-				<< " (known types: " << LayerTypeList() << ")";
+					<< " (known types: " << LayerTypeList() << ")";
 			return registry[type](param);
 		}
 
@@ -89,7 +89,7 @@ class LayerRegistry {
 			CreatorRegistry& registry = Registry();
 			string layer_types;
 			for (typename CreatorRegistry::iterator iter = registry.begin();
-				iter != registry.end(); ++iter) {
+					iter != registry.end(); ++iter) {
 				if (iter != registry.begin()) {
 					layer_types += ", ";
 				}
@@ -99,11 +99,11 @@ class LayerRegistry {
 		}
 };
 
-template<typename Dtype>
+template <typename Dtype>
 class LayerRegisterer {
 	public:
 		LayerRegisterer(const string& type,
-			shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
+				shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
 			// LOG(INFO) << "Registering layer type: " << type;
 			LayerRegistry<Dtype>::AddCreator(type, creator);
 		}
diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index 9e74ca85..766645b5 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -19,7 +19,7 @@ const float kLOG_THRESHOLD = 1e-20;
  * @brief Computes the classification accuracy for a one-of-many
  *        classification task.
  */
-template<typename Dtype>
+template <typename Dtype>
 class AccuracyLayer: public Layer<Dtype> {
 	public:
 		/**
@@ -31,12 +31,13 @@ class AccuracyLayer: public Layer<Dtype> {
 		 *     correct if the correct label is among the top 5 predicted labels.
 		 */
 		explicit AccuracyLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Accuracy";
@@ -74,11 +75,12 @@ class AccuracyLayer: public Layer<Dtype> {
 		 *      @f$
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 			for (int i = 0; i < propagate_down.size(); ++i) {
 				if (propagate_down[i]) {
 					NOT_IMPLEMENTED;
@@ -104,16 +106,17 @@ class AccuracyLayer: public Layer<Dtype> {
  * LossLayers are typically only capable of backpropagating to their first input
  * -- the predictions.
  */
-template<typename Dtype>
+template <typename Dtype>
 class LossLayer: public Layer<Dtype> {
 	public:
 		explicit LossLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(
-			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(
-			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
 
 		virtual inline int ExactNumBottomBlobs() const {
 			return 2;
@@ -164,14 +167,15 @@ class LossLayer: public Layer<Dtype> {
  *          d = \left| \left| a_n - b_n \right| \right|_2^2 @f$.
  * This can be used to train siamese networks.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ContrastiveLossLayer: public LossLayer<Dtype> {
 	public:
 		explicit ContrastiveLossLayer(const LayerParameter& param)
-			: LossLayer<Dtype>(param), diff_() {
+		:
+				LossLayer<Dtype>(param), diff_() {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline int ExactNumBottomBlobs() const {
 			return 3;
@@ -190,9 +194,9 @@ class ContrastiveLossLayer: public LossLayer<Dtype> {
 	protected:
 		/// @copydoc ContrastiveLossLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the Contrastive error gradient w.r.t. the inputs.
@@ -220,9 +224,9 @@ class ContrastiveLossLayer: public LossLayer<Dtype> {
 		 *      propagate_down[1]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		Blob<Dtype> diff_;  // cached for backward pass
 		Blob<Dtype> dist_sq_;  // cached for backward pass
@@ -256,14 +260,15 @@ class ContrastiveLossLayer: public LossLayer<Dtype> {
  * (Note: Caffe, and SGD in general, is certainly \b not the best way to solve
  * linear least squares problems! We use it only as an instructive example.)
  */
-template<typename Dtype>
+template <typename Dtype>
 class EuclideanLossLayer: public LossLayer<Dtype> {
 	public:
 		explicit EuclideanLossLayer(const LayerParameter& param)
-			: LossLayer<Dtype>(param), diff_() {
+		:
+				LossLayer<Dtype>(param), diff_() {
 		}
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "EuclideanLoss";
@@ -279,9 +284,9 @@ class EuclideanLossLayer: public LossLayer<Dtype> {
 	protected:
 		/// @copydoc EuclideanLossLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the Euclidean error gradient w.r.t. the inputs.
@@ -317,9 +322,9 @@ class EuclideanLossLayer: public LossLayer<Dtype> {
 		 *      @f$ if propagate_down[1]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		Blob<Dtype> diff_;
 };
@@ -367,11 +372,12 @@ class EuclideanLossLayer: public LossLayer<Dtype> {
  * outside the InnerProductLayer and no other losses outside the
  * HingeLossLayer).
  */
-template<typename Dtype>
+template <typename Dtype>
 class HingeLossLayer: public LossLayer<Dtype> {
 	public:
 		explicit HingeLossLayer(const LayerParameter& param)
-			: LossLayer<Dtype>(param) {
+		:
+				LossLayer<Dtype>(param) {
 		}
 
 		virtual inline const char* type() const {
@@ -381,7 +387,7 @@ class HingeLossLayer: public LossLayer<Dtype> {
 	protected:
 		/// @copydoc HingeLossLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the hinge loss error gradient w.r.t. the predictions.
@@ -411,7 +417,7 @@ class HingeLossLayer: public LossLayer<Dtype> {
 		 *      the labels -- ignored as we can't compute their error gradients
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -446,16 +452,17 @@ class HingeLossLayer: public LossLayer<Dtype> {
  *        \log(\hat{p}_{n,k})
  *      @f$, where @f$ H_{l_n} @f$ denotes row @f$l_n@f$ of @f$H@f$.
  */
-template<typename Dtype>
+template <typename Dtype>
 class InfogainLossLayer: public LossLayer<Dtype> {
 	public:
 		explicit InfogainLossLayer(const LayerParameter& param)
-			: LossLayer<Dtype>(param), infogain_() {
+		:
+				LossLayer<Dtype>(param), infogain_() {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		// InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should
 		// be the infogain matrix.  (Otherwise the infogain matrix is loaded from a
@@ -477,7 +484,7 @@ class InfogainLossLayer: public LossLayer<Dtype> {
 	protected:
 		/// @copydoc InfogainLossLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the infogain loss error gradient w.r.t. the predictions.
@@ -512,7 +519,7 @@ class InfogainLossLayer: public LossLayer<Dtype> {
 		 *      gradient computation is not implemented.
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		Blob<Dtype> infogain_;
 };
@@ -546,14 +553,15 @@ class InfogainLossLayer: public LossLayer<Dtype> {
  *        \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n})
  *      @f$
  */
-template<typename Dtype>
+template <typename Dtype>
 class MultinomialLogisticLossLayer: public LossLayer<Dtype> {
 	public:
 		explicit MultinomialLogisticLossLayer(const LayerParameter& param)
-			: LossLayer<Dtype>(param) {
+		:
+				LossLayer<Dtype>(param) {
 		}
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "MultinomialLogisticLoss";
@@ -562,7 +570,7 @@ class MultinomialLogisticLossLayer: public LossLayer<Dtype> {
 	protected:
 		/// @copydoc MultinomialLogisticLossLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the multinomial logistic loss error gradient w.r.t. the
@@ -593,7 +601,7 @@ class MultinomialLogisticLossLayer: public LossLayer<Dtype> {
 		 *      the labels -- ignored as we can't compute their error gradients
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -625,18 +633,19 @@ class MultinomialLogisticLossLayer: public LossLayer<Dtype> {
  *              \right]
  *      @f$
  */
-template<typename Dtype>
+template <typename Dtype>
 class SigmoidCrossEntropyLossLayer: public LossLayer<Dtype> {
 	public:
 		explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param)
-			: LossLayer<Dtype>(param),
-				sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
-				sigmoid_output_(new Blob<Dtype>()) {
+		:
+				LossLayer<Dtype>(param),
+						sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
+						sigmoid_output_(new Blob<Dtype>()) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "SigmoidCrossEntropyLoss";
@@ -645,7 +654,7 @@ class SigmoidCrossEntropyLossLayer: public LossLayer<Dtype> {
 	protected:
 		/// @copydoc SigmoidCrossEntropyLossLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the
@@ -678,9 +687,9 @@ class SigmoidCrossEntropyLossLayer: public LossLayer<Dtype> {
 		 *      the labels -- ignored as we can't compute their error gradients
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		/// The internal SigmoidLayer used to map predictions to probabilities.
 		shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
@@ -693,7 +702,7 @@ class SigmoidCrossEntropyLossLayer: public LossLayer<Dtype> {
 };
 
 // Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer.
-template<typename Dtype> class SoftmaxLayer;
+template <typename Dtype> class SoftmaxLayer;
 
 /**
  * @brief Computes the multinomial logistic loss for a one-of-many
@@ -724,7 +733,7 @@ template<typename Dtype> class SoftmaxLayer;
  *        \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n})
  *      @f$, for softmax output class probabilites @f$ \hat{p} @f$
  */
-template<typename Dtype>
+template <typename Dtype>
 class SoftmaxWithLossLayer: public LossLayer<Dtype> {
 	public:
 		/**
@@ -736,13 +745,14 @@ class SoftmaxWithLossLayer: public LossLayer<Dtype> {
 		 *    present; otherwise the loss is simply summed over spatial locations.
 		 */
 		explicit SoftmaxWithLossLayer(const LayerParameter& param)
-			: LossLayer<Dtype>(param) {
+		:
+				LossLayer<Dtype>(param) {
 		}
 		~SoftmaxWithLossLayer();
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "SoftmaxWithLoss";
@@ -760,9 +770,9 @@ class SoftmaxWithLossLayer: public LossLayer<Dtype> {
 	protected:
 		/// @copydoc SoftmaxWithLossLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		/**
 		 * @brief Computes the softmax loss error gradient w.r.t. the predictions.
 		 *
@@ -791,9 +801,9 @@ class SoftmaxWithLossLayer: public LossLayer<Dtype> {
 		 *      the labels -- ignored as we can't compute their error gradients
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		void ocl_setup();
 
 		/// The internal SoftmaxLayer used to map predictions to a distribution.
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 68e631a1..2fe273f5 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -20,7 +20,7 @@ namespace caffe {
  *
  * TODO(dox): more thorough description.
  */
-template<typename Dtype>
+template <typename Dtype>
 class Net {
 	public:
 		explicit Net(const NetParameter& param);
@@ -51,7 +51,7 @@ class Net {
 		Dtype ForwardTo(int end);
 		/// @brief Run forward using a set of bottom blobs, and return the result.
 		const vector<Blob<Dtype>*>& Forward(const vector<Blob<Dtype>*> & bottom,
-			Dtype* loss = NULL);
+				Dtype* loss = NULL);
 		/**
 		 * @brief Run forward using a serialized BlobProtoVector and return the
 		 *        result as a serialized BlobProtoVector
@@ -189,7 +189,7 @@ class Net {
 		const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name) const;
 		bool has_layer(const string& layer_name) const;
 		const shared_ptr<Layer<Dtype> > layer_by_name(
-			const string& layer_name) const;
+				const string& layer_name) const;
 
 		void set_debug_info(const bool value) {
 			debug_info_ = value;
@@ -201,24 +201,24 @@ class Net {
 		 *        phase, level, and stage.
 		 */
 		static void FilterNet(const NetParameter& param,
-			NetParameter* param_filtered);
+				NetParameter* param_filtered);
 		/// @brief return whether NetState state meets NetStateRule rule
 		static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
-			const string& layer_name);
+				const string& layer_name);
 
 	protected:
 		// Helpers for Init.
 		/// @brief Append a new input or top blob to the net.
 		void AppendTop(const NetParameter& param, const int layer_id,
-			const int top_id, set<string>* available_blobs,
-			map<string, int>* blob_name_to_idx);
+				const int top_id, set<string>* available_blobs,
+				map<string, int>* blob_name_to_idx);
 		/// @brief Append a new bottom blob to the net.
 		int AppendBottom(const NetParameter& param, const int layer_id,
-			const int bottom_id, set<string>* available_blobs,
-			map<string, int>* blob_name_to_idx);
+				const int bottom_id, set<string>* available_blobs,
+				map<string, int>* blob_name_to_idx);
 		/// @brief Append a new parameter blob to the net.
 		void AppendParam(const NetParameter& param, const int layer_id,
-			const int param_id);
+				const int param_id);
 
 		/// @brief Helper for displaying debug info in Forward about input Blobs.
 		void InputDebugInfo(const int layer_id);
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index 5606ff65..89b6c481 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -22,14 +22,15 @@ namespace caffe {
  *        each element of the output depends only on the corresponding input
  *        element.
  */
-template<typename Dtype>
+template <typename Dtype>
 class NeuronLayer: public Layer<Dtype> {
 	public:
 		explicit NeuronLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline int ExactNumBottomBlobs() const {
 			return 1;
@@ -49,14 +50,15 @@ class NeuronLayer: public Layer<Dtype> {
  *   -# @f$ (N \times C \times H \times W) @f$
  *      the computed outputs @f$ y = |x| @f$
  */
-template<typename Dtype>
+template <typename Dtype>
 class AbsValLayer: public NeuronLayer<Dtype> {
 	public:
 		explicit AbsValLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "AbsVal";
@@ -71,9 +73,9 @@ class AbsValLayer: public NeuronLayer<Dtype> {
 	protected:
 		/// @copydoc AbsValLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the absolute value inputs.
@@ -93,9 +95,9 @@ class AbsValLayer: public NeuronLayer<Dtype> {
 		 *      @f$ if propagate_down[0]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -115,11 +117,12 @@ class AbsValLayer: public NeuronLayer<Dtype> {
  *         \end{array} \right.
  *      @f$
  */
-template<typename Dtype>
+template <typename Dtype>
 class BNLLLayer: public NeuronLayer<Dtype> {
 	public:
 		explicit BNLLLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 
 		virtual inline const char* type() const {
@@ -129,9 +132,9 @@ class BNLLLayer: public NeuronLayer<Dtype> {
 	protected:
 		/// @copydoc BNLLLayer
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the BNLL inputs.
@@ -150,9 +153,9 @@ class BNLLLayer: public NeuronLayer<Dtype> {
 		 *      @f$ if propagate_down[0]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -166,7 +169,7 @@ class BNLLLayer: public NeuronLayer<Dtype> {
  *   -# @f$ (N \times C \times H \times W) @f$
  *      the computed outputs @f$ y = |x| @f$
  */
-template<typename Dtype>
+template <typename Dtype>
 class DropoutLayer: public NeuronLayer<Dtype> {
 	public:
 		/**
@@ -176,12 +179,13 @@ class DropoutLayer: public NeuronLayer<Dtype> {
 		 *     Sets the probability @f$ p @f$ that any given unit is dropped.
 		 */
 		explicit DropoutLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Dropout";
@@ -211,13 +215,13 @@ class DropoutLayer: public NeuronLayer<Dtype> {
 		 *      @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$.
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		/// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$
 		Blob<unsigned int> rand_vec_;
@@ -233,7 +237,7 @@ class DropoutLayer: public NeuronLayer<Dtype> {
  *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
  *        and base @f$ \gamma @f$.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ExpLayer: public NeuronLayer<Dtype> {
 	public:
 		/**
@@ -245,10 +249,11 @@ class ExpLayer: public NeuronLayer<Dtype> {
 		 *         the base @f$ \gamma @f$
 		 */
 		explicit ExpLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Exp";
@@ -266,9 +271,9 @@ class ExpLayer: public NeuronLayer<Dtype> {
 		 *      @f$
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the exp inputs.
@@ -288,9 +293,9 @@ class ExpLayer: public NeuronLayer<Dtype> {
 		 *      @f$ if propagate_down[0]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		Dtype inner_scale_, outer_scale_;
 };
@@ -300,7 +305,7 @@ class ExpLayer: public NeuronLayer<Dtype> {
  *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
  *        and base @f$ \gamma @f$.
  */
-template<typename Dtype>
+template <typename Dtype>
 class LogLayer: public NeuronLayer<Dtype> {
 	public:
 		/**
@@ -312,10 +317,11 @@ class LogLayer: public NeuronLayer<Dtype> {
 		 *         the base @f$ \gamma @f$
 		 */
 		explicit LogLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Log";
@@ -333,9 +339,9 @@ class LogLayer: public NeuronLayer<Dtype> {
 		 *      @f$
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the exp inputs.
@@ -355,9 +361,9 @@ class LogLayer: public NeuronLayer<Dtype> {
 		 *      @f$ if propagate_down[0]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		Dtype base_scale_;
 		Dtype input_scale_, input_shift_;
@@ -369,7 +375,7 @@ class LogLayer: public NeuronLayer<Dtype> {
  *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
  *        and power @f$ \gamma @f$.
  */
-template<typename Dtype>
+template <typename Dtype>
 class PowerLayer: public NeuronLayer<Dtype> {
 	public:
 		/**
@@ -380,10 +386,11 @@ class PowerLayer: public NeuronLayer<Dtype> {
 		 *   - power (\b optional, default 1) the power @f$ \gamma @f$
 		 */
 		explicit PowerLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Power";
@@ -401,9 +408,9 @@ class PowerLayer: public NeuronLayer<Dtype> {
 		 *      @f$
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the power inputs.
@@ -426,9 +433,9 @@ class PowerLayer: public NeuronLayer<Dtype> {
 		 *      @f$ if propagate_down[0]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		/// @brief @f$ \gamma @f$ from layer_param_.power_param()
 		Dtype power_;
@@ -444,7 +451,7 @@ class PowerLayer: public NeuronLayer<Dtype> {
  * @brief Rectified Linear Unit non-linearity @f$ y = \max(0, x) @f$.
  *        The simple max is fast to compute, and the function does not saturate.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ReLULayer: public NeuronLayer<Dtype> {
 	public:
 		/**
@@ -454,7 +461,8 @@ class ReLULayer: public NeuronLayer<Dtype> {
 		 *     the value @f$ \nu @f$ by which negative values are multiplied.
 		 */
 		explicit ReLULayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 		virtual inline const char* type() const {
 			return "ReLU";
@@ -473,9 +481,9 @@ class ReLULayer: public NeuronLayer<Dtype> {
 		 *      the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$.
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the ReLU inputs.
@@ -506,9 +514,9 @@ class ReLULayer: public NeuronLayer<Dtype> {
 		 *      @f$.
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -521,16 +529,16 @@ class CuDNNReLULayer : public ReLULayer<Dtype> {
 	explicit CuDNNReLULayer(const LayerParameter& param)
 	: ReLULayer<Dtype>(param), handles_setup_(false) {}
 	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual ~CuDNNReLULayer();
 
 	protected:
 	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 	bool handles_setup_;
 	cudnnHandle_t handle_;
@@ -547,11 +555,12 @@ class CuDNNReLULayer : public ReLULayer<Dtype> {
  * Note that the gradient vanishes as the values move away from 0.
  * The ReLULayer is often a better choice for this reason.
  */
-template<typename Dtype>
+template <typename Dtype>
 class SigmoidLayer: public NeuronLayer<Dtype> {
 	public:
 		explicit SigmoidLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 
 		virtual inline const char* type() const {
@@ -570,9 +579,9 @@ class SigmoidLayer: public NeuronLayer<Dtype> {
 		 *      @f$
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the sigmoid inputs.
@@ -592,9 +601,9 @@ class SigmoidLayer: public NeuronLayer<Dtype> {
 		 *      @f$ if propagate_down[0]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -607,16 +616,16 @@ class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
 	explicit CuDNNSigmoidLayer(const LayerParameter& param)
 	: SigmoidLayer<Dtype>(param), handles_setup_(false) {}
 	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual ~CuDNNSigmoidLayer();
 
 	protected:
 	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 	bool handles_setup_;
 	cudnnHandle_t handle_;
@@ -633,11 +642,12 @@ class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
  * Note that the gradient vanishes as the values move away from 0.
  * The ReLULayer is often a better choice for this reason.
  */
-template<typename Dtype>
+template <typename Dtype>
 class TanHLayer: public NeuronLayer<Dtype> {
 	public:
 		explicit TanHLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 
 		virtual inline const char* type() const {
@@ -656,9 +666,9 @@ class TanHLayer: public NeuronLayer<Dtype> {
 		 *      @f$
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the sigmoid inputs.
@@ -680,9 +690,9 @@ class TanHLayer: public NeuronLayer<Dtype> {
 		 *      @f$ if propagate_down[0]
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -695,16 +705,16 @@ class CuDNNTanHLayer : public TanHLayer<Dtype> {
 	explicit CuDNNTanHLayer(const LayerParameter& param)
 	: TanHLayer<Dtype>(param), handles_setup_(false) {}
 	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual ~CuDNNTanHLayer();
 
 	protected:
 	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 	bool handles_setup_;
 	cudnnHandle_t handle_;
@@ -717,7 +727,7 @@ class CuDNNTanHLayer : public TanHLayer<Dtype> {
  * @brief Tests whether the input exceeds a threshold: outputs 1 for inputs
  *        above threshold; 0 otherwise.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ThresholdLayer: public NeuronLayer<Dtype> {
 	public:
 		/**
@@ -727,10 +737,11 @@ class ThresholdLayer: public NeuronLayer<Dtype> {
 		 *     the threshold value @f$ t @f$ to which the input values are compared.
 		 */
 		explicit ThresholdLayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Threshold";
@@ -752,12 +763,13 @@ class ThresholdLayer: public NeuronLayer<Dtype> {
 		 *      @f$
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		/// @brief Not implemented (non-differentiable function)
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 			NOT_IMPLEMENTED;
 		}
 
@@ -772,7 +784,7 @@ class ThresholdLayer: public NeuronLayer<Dtype> {
  *        channels. The number of axes of input blob should be greater than or
  *        equal to 2. The 1st axis (0-based) is seen as channels.
  */
-template<typename Dtype>
+template <typename Dtype>
 class PReLULayer: public NeuronLayer<Dtype> {
 	public:
 		/**
@@ -784,14 +796,15 @@ class PReLULayer: public NeuronLayer<Dtype> {
 		 *     negative slopes are shared across channels.
 		 */
 		explicit PReLULayer(const LayerParameter& param)
-			: NeuronLayer<Dtype>(param) {
+		:
+				NeuronLayer<Dtype>(param) {
 		}
 
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "PReLU";
@@ -809,9 +822,9 @@ class PReLULayer: public NeuronLayer<Dtype> {
 		 *      @f$.
 		 */
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		/**
 		 * @brief Computes the error gradient w.r.t. the PReLU inputs.
@@ -842,9 +855,9 @@ class PReLULayer: public NeuronLayer<Dtype> {
 		 *      @f$.
 		 */
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		bool channel_shared_;
 		Blob<Dtype> multiplier_; // dot multiplier for backward computation of params
diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp
index 653f5e36..41e2c21a 100644
--- a/include/caffe/python_layer.hpp
+++ b/include/caffe/python_layer.hpp
@@ -10,15 +10,16 @@ namespace bp = boost::python;
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 class PythonLayer: public Layer<Dtype> {
 	public:
 		PythonLayer(PyObject* self, const LayerParameter& param)
-			: Layer<Dtype>(param), self_(bp::handle<>(bp::borrowed(self))) {
+		:
+				Layer<Dtype>(param), self_(bp::handle<>(bp::borrowed(self))) {
 		}
 
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 			try {
 				self_.attr("setup")(bottom, top);
 			} catch (bp::error_already_set) {
@@ -28,7 +29,7 @@ class PythonLayer: public Layer<Dtype> {
 		}
 
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 			try {
 				self_.attr("reshape")(bottom, top);
 			} catch (bp::error_already_set) {
@@ -43,7 +44,7 @@ class PythonLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top) {
+				const vector<Blob<Dtype>*>& top) {
 			try {
 				self_.attr("forward")(bottom, top);
 			} catch (bp::error_already_set) {
@@ -52,7 +53,8 @@ class PythonLayer: public Layer<Dtype> {
 			}
 		}
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+				const vector<bool>& propagate_down,
+				const vector<Blob<Dtype>*>& bottom) {
 			try {
 				self_.attr("backward")(top, propagate_down, bottom);
 			} catch (bp::error_already_set) {
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 688fb99f..60dbc5b0 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -14,7 +14,7 @@ namespace caffe {
  * Requires implementation of ApplyUpdate to compute a parameter update
  * given the current state of the Net parameters.
  */
-template<typename Dtype>
+template <typename Dtype>
 class Solver {
 	public:
 		explicit Solver(const SolverParameter& param);
@@ -78,15 +78,17 @@ class Solver {
  * @brief Optimizes the parameters of a Net using
  *        stochastic gradient descent (SGD) with momentum.
  */
-template<typename Dtype>
+template <typename Dtype>
 class SGDSolver: public Solver<Dtype> {
 	public:
 		explicit SGDSolver(const SolverParameter& param)
-			: Solver<Dtype>(param) {
+		:
+				Solver<Dtype>(param) {
 			PreSolve();
 		}
 		explicit SGDSolver(const string& param_file)
-			: Solver<Dtype>(param_file) {
+		:
+				Solver<Dtype>(param_file) {
 			PreSolve();
 		}
 
@@ -117,14 +119,16 @@ class SGDSolver: public Solver<Dtype> {
 		DISABLE_COPY_AND_ASSIGN (SGDSolver);
 };
 
-template<typename Dtype>
+template <typename Dtype>
 class NesterovSolver: public SGDSolver<Dtype> {
 	public:
 		explicit NesterovSolver(const SolverParameter& param)
-			: SGDSolver<Dtype>(param) {
+		:
+				SGDSolver<Dtype>(param) {
 		}
 		explicit NesterovSolver(const string& param_file)
-			: SGDSolver<Dtype>(param_file) {
+		:
+				SGDSolver<Dtype>(param_file) {
 		}
 
 	protected:
@@ -137,15 +141,17 @@ class NesterovSolver: public SGDSolver<Dtype> {
 		DISABLE_COPY_AND_ASSIGN (NesterovSolver);
 };
 
-template<typename Dtype>
+template <typename Dtype>
 class AdaGradSolver: public SGDSolver<Dtype> {
 	public:
 		explicit AdaGradSolver(const SolverParameter& param)
-			: SGDSolver<Dtype>(param) {
+		:
+				SGDSolver<Dtype>(param) {
 			constructor_sanity_check();
 		}
 		explicit AdaGradSolver(const string& param_file)
-			: SGDSolver<Dtype>(param_file) {
+		:
+				SGDSolver<Dtype>(param_file) {
 			constructor_sanity_check();
 		}
 
@@ -153,7 +159,7 @@ class AdaGradSolver: public SGDSolver<Dtype> {
 		virtual void ComputeUpdateValue(int param_id, Dtype rate);
 		void constructor_sanity_check() {
 			CHECK_EQ(0, this->param_.momentum())
-				<< "Momentum cannot be used with AdaGrad.";
+					<< "Momentum cannot be used with AdaGrad.";
 		}
 
 		void ocl_setup();
@@ -162,7 +168,7 @@ class AdaGradSolver: public SGDSolver<Dtype> {
 		DISABLE_COPY_AND_ASSIGN (AdaGradSolver);
 };
 
-template<typename Dtype>
+template <typename Dtype>
 Solver<Dtype>* GetSolver(const SolverParameter& param) {
 	SolverParameter_SolverType type = param.solver_type();
 
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 0b053a48..1a16c04a 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -66,13 +66,15 @@ inline void CaffeFreeHost(void* ptr) {
 class SyncedMemory {
 	public:
 		SyncedMemory()
-			: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-				own_cpu_data_(false), data_layer_(false) {
+		:
+				cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
+						own_cpu_data_(false), data_layer_(false) {
 			ocl_setup();
 		}
 		explicit SyncedMemory(size_t size)
-			: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
-				own_cpu_data_(false), data_layer_(false) {
+		:
+				cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
+						own_cpu_data_(false), data_layer_(false) {
 			ocl_setup();
 		}
 
diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp
index 4acca743..1ff29356 100644
--- a/include/caffe/util/cudnn.hpp
+++ b/include/caffe/util/cudnn.hpp
@@ -67,29 +67,29 @@ namespace caffe {
 
 		template <typename Dtype>
 		inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
-			int n, int c, int h, int w,
-			int stride_n, int stride_c, int stride_h, int stride_w) {
+				int n, int c, int h, int w,
+				int stride_n, int stride_c, int stride_h, int stride_w) {
 			CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType<Dtype>::type,
-					n, c, h, w, stride_n, stride_c, stride_h, stride_w));
+							n, c, h, w, stride_n, stride_c, stride_h, stride_w));
 		}
 
 		template <typename Dtype>
 		inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
-			int n, int c, int h, int w) {
+				int n, int c, int h, int w) {
 			const int stride_w = 1;
 			const int stride_h = w * stride_w;
 			const int stride_c = h * stride_h;
 			const int stride_n = c * stride_c;
 			setTensor4dDesc<Dtype>(desc, n, c, h, w,
-				stride_n, stride_c, stride_h, stride_w);
+					stride_n, stride_c, stride_h, stride_w);
 		}
 
 		template <typename Dtype>
 		inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
-			int n, int c, int h, int w) {
+				int n, int c, int h, int w) {
 			CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
 			CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
-					n, c, h, w));
+							n, c, h, w));
 		}
 
 		template <typename Dtype>
@@ -99,16 +99,16 @@ namespace caffe {
 
 		template <typename Dtype>
 		inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv,
-			cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter,
-			int pad_h, int pad_w, int stride_h, int stride_w) {
+				cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter,
+				int pad_h, int pad_w, int stride_h, int stride_w) {
 			CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
-					pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
+							pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
 		}
 
 		template <typename Dtype>
 		inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
-			PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode,
-			int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) {
+				PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode,
+				int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) {
 			switch (poolmethod) {
 				case PoolingParameter_PoolMethod_MAX:
 				*mode = CUDNN_POOLING_MAX;
@@ -121,7 +121,7 @@ namespace caffe {
 			}
 			CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
 			CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w,
-					pad_h, pad_w, stride_h, stride_w));
+							pad_h, pad_w, stride_h, stride_w));
 		}
 
 	}  // namespace cudnn
diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp
index d3716de7..c63fdbb0 100644
--- a/include/caffe/util/db_leveldb.hpp
+++ b/include/caffe/util/db_leveldb.hpp
@@ -14,7 +14,8 @@ namespace db {
 class LevelDBCursor: public Cursor {
 	public:
 		explicit LevelDBCursor(leveldb::Iterator* iter)
-			: iter_(iter) {
+		:
+				iter_(iter) {
 			SeekToFirst();
 		}
 		~LevelDBCursor() {
@@ -43,7 +44,8 @@ class LevelDBCursor: public Cursor {
 class LevelDBTransaction: public Transaction {
 	public:
 		explicit LevelDBTransaction(leveldb::DB* db)
-			: db_(db) {
+		:
+				db_(db) {
 			CHECK_NOTNULL(db_);
 		}
 		virtual void Put(const string& key, const string& value) {
@@ -52,7 +54,7 @@ class LevelDBTransaction: public Transaction {
 		virtual void Commit() {
 			leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_);
 			CHECK(status.ok()) << "Failed to write batch to leveldb "
-				<< std::endl << status.ToString();
+					<< std::endl << status.ToString();
 		}
 
 	private:
@@ -65,7 +67,8 @@ class LevelDBTransaction: public Transaction {
 class LevelDB: public DB {
 	public:
 		LevelDB()
-			: db_(NULL) {
+		:
+				db_(NULL) {
 		}
 		virtual ~LevelDB() {
 			Close();
diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp
index 06424c94..68cbb93a 100644
--- a/include/caffe/util/db_lmdb.hpp
+++ b/include/caffe/util/db_lmdb.hpp
@@ -17,7 +17,8 @@ inline void MDB_CHECK(int mdb_status) {
 class LMDBCursor: public Cursor {
 	public:
 		explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor)
-			: mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) {
+		:
+				mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) {
 			SeekToFirst();
 		}
 		virtual ~LMDBCursor() {
@@ -32,11 +33,11 @@ class LMDBCursor: public Cursor {
 		}
 		virtual string key() {
 			return string(static_cast<const char*>(mdb_key_.mv_data),
-				mdb_key_.mv_size);
+					mdb_key_.mv_size);
 		}
 		virtual string value() {
 			return string(static_cast<const char*>(mdb_value_.mv_data),
-				mdb_value_.mv_size);
+					mdb_value_.mv_size);
 		}
 		virtual bool valid() {
 			return valid_;
@@ -62,7 +63,8 @@ class LMDBCursor: public Cursor {
 class LMDBTransaction: public Transaction {
 	public:
 		explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn)
-			: mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) {
+		:
+				mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) {
 		}
 		virtual void Put(const string& key, const string& value);
 		virtual void Commit() {
@@ -79,7 +81,8 @@ class LMDBTransaction: public Transaction {
 class LMDB: public DB {
 	public:
 		LMDB()
-			: mdb_env_(NULL) {
+		:
+				mdb_env_(NULL) {
 		}
 		virtual ~LMDB() {
 			Close();
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index fda13567..ee7ea10b 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -29,84 +29,84 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, Dtype* data_col);
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, Dtype* data_col);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-	const int height, const int width, const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, Dtype* data_im);
+		const int height, const int width, const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, Dtype* data_im);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset,
-	const int height, const int width, const int channels,
-	const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	Dtype* data_im, const int img_offset);
+		const int height, const int width, const int channels,
+		const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		Dtype* data_im, const int img_offset);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	Dtype* data_col, const int col_offset);
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		Dtype* data_col, const int col_offset);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, Dtype* data_col);
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, Dtype* data_col);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-	const int height, const int width, const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, Dtype* data_im);
+		const int height, const int width, const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, Dtype* data_im);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_col, const int col_offset);
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_col, const int col_offset);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_col, const int col_offset);
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_col, const int col_offset);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_col, const int col_offset, int optnum);
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_col, const int col_offset, int optnum);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
-	const int channels,
-	const int height, const int width, const int psize, const int pad,
-	const int stride, Dtype* data_im, const int img_offset);
+		const int channels,
+		const int height, const int width, const int psize, const int pad,
+		const int stride, Dtype* data_im, const int img_offset);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_im, const int img_offset, int optnum);
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_im, const int img_offset, int optnum);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu_ocl(cl_mem data_col, const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_im, cl_kernel Kernel);
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_im, cl_kernel Kernel);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu_ocl(cl_mem data_im, const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_col, cl_kernel Kernel);
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_col, cl_kernel Kernel);
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_IM2COL_HPP_
diff --git a/include/caffe/util/insert_splits.hpp b/include/caffe/util/insert_splits.hpp
index 4c0d0106..c9a40c54 100644
--- a/include/caffe/util/insert_splits.hpp
+++ b/include/caffe/util/insert_splits.hpp
@@ -12,14 +12,14 @@ namespace caffe {
 void InsertSplits(const NetParameter& param, NetParameter* param_split);
 
 void ConfigureSplitLayer(const string& layer_name, const string& blob_name,
-	const int blob_idx, const int split_count, const float loss_weight,
-	LayerParameter* split_layer_param);
+		const int blob_idx, const int split_count, const float loss_weight,
+		LayerParameter* split_layer_param);
 
 string SplitLayerName(const string& layer_name, const string& blob_name,
-	const int blob_idx);
+		const int blob_idx);
 
 string SplitBlobName(const string& layer_name, const string& blob_name,
-	const int blob_idx, const int split_idx);
+		const int blob_idx, const int split_idx);
 
 }  // namespace caffe
 
diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp
index faef67e3..7bd1d2db 100644
--- a/include/caffe/util/io.hpp
+++ b/include/caffe/util/io.hpp
@@ -39,7 +39,7 @@ inline void MakeTempDir(string* temp_dirname) {
 	strcpy(temp_dirname_cstr, temp_dirname->c_str());
 	char* mkdtemp_result = mkdtemp(temp_dirname_cstr);
 	CHECK(mkdtemp_result != NULL)
-		<< "Failed to create a temporary directory at: " << *temp_dirname;
+			<< "Failed to create a temporary directory at: " << *temp_dirname;
 	*temp_dirname = temp_dirname_cstr;
 	delete[] temp_dirname_cstr;
 }
@@ -74,13 +74,13 @@ inline void ReadProtoFromBinaryFileOrDie(const char* filename, Message* proto) {
 }
 
 inline void ReadProtoFromBinaryFileOrDie(const string& filename,
-	Message* proto) {
+		Message* proto) {
 	ReadProtoFromBinaryFileOrDie(filename.c_str(), proto);
 }
 
 void WriteProtoToBinaryFile(const Message& proto, const char* filename);
 inline void WriteProtoToBinaryFile(
-	const Message& proto, const string& filename) {
+		const Message& proto, const string& filename) {
 	WriteProtoToBinaryFile(proto, filename.c_str());
 }
 
@@ -91,32 +91,32 @@ inline bool ReadFileToDatum(const string& filename, Datum* datum) {
 }
 
 bool ReadImageToDatum(const string& filename, const int label,
-	const int height, const int width, const bool is_color,
-	const std::string & encoding, Datum* datum);
+		const int height, const int width, const bool is_color,
+		const std::string & encoding, Datum* datum);
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-	const int height, const int width, const bool is_color, Datum* datum) {
+		const int height, const int width, const bool is_color, Datum* datum) {
 	return ReadImageToDatum(filename, label, height, width, is_color,
-		"", datum);
+			"", datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-	const int height, const int width, Datum* datum) {
+		const int height, const int width, Datum* datum) {
 	return ReadImageToDatum(filename, label, height, width, true, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-	const bool is_color, Datum* datum) {
+		const bool is_color, Datum* datum) {
 	return ReadImageToDatum(filename, label, 0, 0, is_color, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-	Datum* datum) {
+		Datum* datum) {
 	return ReadImageToDatum(filename, label, 0, 0, true, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-	const std::string & encoding, Datum* datum) {
+		const std::string & encoding, Datum* datum) {
 	return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum);
 }
 
@@ -124,13 +124,13 @@ bool DecodeDatumNative(Datum* datum);
 bool DecodeDatum(Datum* datum, bool is_color);
 
 cv::Mat ReadImageToCVMat(const string& filename,
-	const int height, const int width, const bool is_color);
+		const int height, const int width, const bool is_color);
 
 cv::Mat ReadImageToCVMat(const string& filename,
-	const int height, const int width);
+		const int height, const int width);
 
 cv::Mat ReadImageToCVMat(const string& filename,
-	const bool is_color);
+		const bool is_color);
 
 cv::Mat ReadImageToCVMat(const string& filename);
 
@@ -139,19 +139,19 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color);
 
 void CVMatToDatum(const cv::Mat& cv_img, Datum* datum);
 
-template<typename Dtype>
+template <typename Dtype>
 void hdf5_load_nd_dataset_helper(
-	hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-	Blob<Dtype>* blob);
+		hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+		Blob<Dtype>* blob);
 
-template<typename Dtype>
+template <typename Dtype>
 void hdf5_load_nd_dataset(
-	hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-	Blob<Dtype>* blob);
+		hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+		Blob<Dtype>* blob);
 
-template<typename Dtype>
+template <typename Dtype>
 void hdf5_save_nd_dataset(
-	const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob);
+		const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob);
 
 }  // namespace caffe
 
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 0a7fd67f..8a36069a 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -40,80 +40,80 @@ namespace caffe {
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
-template<typename Dtype>
+template <typename Dtype>
 void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-	Dtype* C);
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+		Dtype* C);
 
 // Decaf gpu gemm provides an interface that is almost the same as the cpu
 // gemm function - following the c convention and calling the fortran-order
 // gpu code under the hood.
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-	Dtype* C);
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+		Dtype* C);
 
-template<typename Dtype>
+template <typename Dtype>
 cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
-	const int offB, const Dtype beta,
-	Dtype* C, const int offC);
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
+		const int offB, const Dtype beta,
+		Dtype* C, const int offC);
 /*This is Yuan Gao's sgemm_ex*/
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-	Dtype* C, const int offset1, const int offset2, const int offset3);
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+		Dtype* C, const int offset1, const int offset2, const int offset3);
 
-template<typename Dtype>
+template <typename Dtype>
 cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
-	const int offB, const Dtype beta,
-	Dtype* C, const int offC);
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
+		const int offB, const Dtype beta,
+		Dtype* C, const int offC);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
-	const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
-	Dtype* y);
+		const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+		Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda,
-	const Dtype * x, size_t offx, const Dtype beta, int incx,
-	Dtype* y, size_t offy, int incy);
+		const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda,
+		const Dtype * x, size_t offx, const Dtype beta, int incx,
+		Dtype* y, size_t offy, int incy);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
-	const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
-	Dtype* y);
+		const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+		Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_axpy(const int N, const Dtype alpha, const Dtype* X,
-	Dtype* Y);
+		Dtype* Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X,
-	Dtype* Y);
+		Dtype* Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
-	const Dtype beta, Dtype* Y);
+		const Dtype beta, Dtype* Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X,
-	const Dtype beta, Dtype* Y);
+		const Dtype beta, Dtype* Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_copy(const int N, const Dtype *X, Dtype *Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_set(const int N, const Dtype alpha, Dtype *X);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X);
 
 inline void caffe_memset(const size_t N, const int alpha, void* X) {
@@ -130,67 +130,67 @@ inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
 
 void caffe_gpu_memcpy(const size_t N, const void *X, void *Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha,
-	Dtype *X);
+		Dtype *X);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_scal(const int N, const Dtype alpha, Dtype *X);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_sqr(const int N, const Dtype* a, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
 //CUDA version, need to be deleted
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a,
-	const Dtype* b, Dtype* y);
+		const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
 //CUDA version, need to be deleted
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
 unsigned int caffe_rng_rand();
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype caffe_nextafter(const Dtype b);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
 
 // caffe_gpu_rng_uniform with two arguments generates integers in the range
@@ -202,52 +202,52 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r);
 // specification of curandGenerateUniform.  With a = 0, b = 1, just calls
 // curandGenerateUniform; with other limits will shift and scale the outputs
 // appropriately after calling curandGenerateUniform.
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
-	Dtype* r);
+		Dtype* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
-	Dtype* r);
+		Dtype* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, int* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_exp(const int n, const Dtype* a, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x,
-	const Dtype* y);
+		const Dtype* y);
 
 // Returns the sum of the absolute values of the elements of vector x
-template<typename Dtype>
+template <typename Dtype>
 Dtype caffe_cpu_asum(const int n, const Dtype* x);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
 
 // the branchless, type-safe version from
 // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c
-template<typename Dtype>
+template <typename Dtype>
 inline char caffe_sign(Dtype val) {
 	return (Dtype(0) < val) - (val < Dtype(0));
 }
@@ -288,7 +288,7 @@ void caffe_gpu_##name<double>(const int n, const double* x, double* y) { \
 // output is 1 for the positives, 0 for zero, and -1 for the negatives
 DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
 
 // This returns a nonzero value if the input has its sign bit set.
@@ -296,56 +296,56 @@ void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
 using std::signbit;
 DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, y[i] = signbit(x[i]));
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y);
 
 DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_log(const int n, const Dtype* a, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_exp(const int n, const Dtype* a, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_abs(const int n, const Dtype* a, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_log(const int n, const Dtype* a, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx,
-	const Dtype* y, const int incy);
+		const Dtype* y, const int incy);
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_MATH_FUNCTIONS_H_
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
index e0d4d489..06262fbf 100644
--- a/include/caffe/util/mkl_alternate.hpp
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -81,14 +81,14 @@ DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
 // in standard blas. We will simply use a two-step (inefficient, of course) way
 // to mimic that.
 inline void cblas_saxpby(const int N, const float alpha, const float* X,
-	const int incX, const float beta, float* Y,
-	const int incY) {
+		const int incX, const float beta, float* Y,
+		const int incY) {
 	cblas_sscal(N, beta, Y, incY);
 	cblas_saxpy(N, alpha, X, incX, Y, incY);
 }
 inline void cblas_daxpby(const int N, const double alpha, const double* X,
-	const int incX, const double beta, double* Y,
-	const int incY) {
+		const int incX, const double beta, double* Y,
+		const int incY) {
 	cblas_dscal(N, beta, Y, incY);
 	cblas_daxpy(N, alpha, X, incX, Y, incY);
 }
diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
index 1bd7c8d4..9febaa04 100644
--- a/include/caffe/util/ocl_util.hpp
+++ b/include/caffe/util/ocl_util.hpp
@@ -29,11 +29,11 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ocl_memset(Dtype* buffer, const Dtype value, const int count);
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
-	const int count);
+		const int count);
 
 void eventCallback(cl_event event, cl_int event_status, void * user_data);
 }  // namespace caffe
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index dbd712ea..3a9eaa5c 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -31,7 +31,7 @@ namespace caffe {
 
 typedef unsigned int uint32_t;
 
-template<typename dtype> inline std::string get_dtype_suffix()
+template <typename dtype> inline std::string get_dtype_suffix()
 {
 	dtype x;
 	const char type = typeid(x).name()[0];
@@ -50,289 +50,293 @@ template<typename dtype> inline std::string get_dtype_suffix()
 	return suffix;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_,
-	const int M_, const int packing_num);
+		const int M_, const int packing_num);
 
-template<typename Dtype>
+template <typename Dtype>
 void opttrans(const Dtype* data_im, const int im_offset, const int channels,
-	const int height, const int width, Dtype* data_opt, const int opt_offset,
-	const int optnum);
+		const int height, const int width, Dtype* data_opt, const int opt_offset,
+		const int optnum);
 
-template<typename Dtype>
+template <typename Dtype>
 void get_max_gpu(cl_kernel Kernel, const int num, const int dim,
-	const Dtype* bottom_data, Dtype* scale_data);
+		const Dtype* bottom_data, Dtype* scale_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim,
-	const Dtype* scale, Dtype* data);
+		const Dtype* scale, Dtype* data);
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim,
-	const Dtype* prob_data, const Dtype* label, cl_mem d_loss);
+		const Dtype* prob_data, const Dtype* label, cl_mem d_loss);
 
-template<typename Dtype>
+template <typename Dtype>
 void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data);
 
-template<typename Dtype>
+template <typename Dtype>
 void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data,
-	const Dtype* label);
+		const Dtype* label);
 
-template<typename Dtype>
+template <typename Dtype>
 void max_pool_fp_gpu(cl_kernel Kernel, const int count,
-	const Dtype* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	Dtype* top_data);
+		const Dtype* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		Dtype* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum,
-	const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
-	Dtype* top_mask);
+		const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
+		Dtype* top_mask);
 
-template<typename Dtype>
+template <typename Dtype>
 void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
-	const int* const mask, const Dtype* const top_mask, const int num,
-	const int channels, const int height, const int width,
-	const int pooled_height, const int pooled_width, const int kernel_h,
-	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-	const int pad_w, Dtype* const bottom_diff);
+		const int* const mask, const Dtype* const top_mask, const int num,
+		const int channels, const int height, const int width,
+		const int pooled_height, const int pooled_width, const int kernel_h,
+		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+		const int pad_w, Dtype* const bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
-	const int num, const int channels, const int height, const int width,
-	const int pooled_height, const int pooled_width, const int kernel_h,
-	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-	const int pad_w, Dtype* const bottom_diff);
+		const int num, const int channels, const int height, const int width,
+		const int pooled_height, const int pooled_width, const int kernel_h,
+		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+		const int pad_w, Dtype* const bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void StoPoolBackward(const int nthreads, const Dtype* const rand_idx,
-	const Dtype* const top_diff, const int num, const int channels,
-	const int height, const int width, const int pooled_height,
-	const int pooled_width, const int kernel_h, const int kernel_w,
-	const int stride_h, const int stride_w, Dtype* const bottom_diff);
-template<typename Dtype>
+		const Dtype* const top_diff, const int num, const int channels,
+		const int height, const int width, const int pooled_height,
+		const int pooled_width, const int kernel_h, const int kernel_w,
+		const int stride_h, const int stride_w, Dtype* const bottom_diff);
+template <typename Dtype>
 void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidBackward(const int count, const Dtype* top_diff,
-	const Dtype* top_data, Dtype* bottom_diff);
+		const Dtype* top_data, Dtype* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
-	Dtype* bottom_diff);
+		Dtype* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void ThresholdForward(const int count, const Dtype threshold,
-	const Dtype* bottom_data, Dtype* top_data);
+		const Dtype* bottom_data, Dtype* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
-	const Dtype* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	const int pad_, Dtype* top_data);
+		const Dtype* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		const int pad_, Dtype* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
-	const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	const int pad_h_, const int pad_w_, Dtype* top_data);
+		const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		const int pad_h_, const int pad_w_, Dtype* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	Dtype* idx_data, Dtype* top_data);
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		Dtype* idx_data, Dtype* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void StoPoolForwardTest(const int count, const Dtype* bottom_data,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	Dtype* top_data);
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		Dtype* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void max_pool_bp_gpu(cl_kernel Kernel, const int count,
-	const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_size_,
-	const int stride_, Dtype* bottom_diff);
+		const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_size_,
+		const int stride_, Dtype* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
-	const int clnum, const int channels_, const int intheight_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_size_,
-	const int stride_, const int pad_, Dtype* bottom_diff);
+		const int clnum, const int channels_, const int intheight_,
+		const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_size_,
+		const int stride_, const int pad_, Dtype* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLUForward(const int count, const int channels, const int dim,
-	const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
-	const int div_factor);
+		const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
+		const int div_factor);
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLUBackward(const int count, const int channels, const int dim,
-	const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
-	const Dtype* slope_data, const int div_factor);
+		const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
+		const Dtype* slope_data, const int div_factor);
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLUParamBackward(const int count, const Dtype* top_diff,
-	const int offset_out, const Dtype* bottom_data, const int offset_in,
-	Dtype* bottom_diff);
+		const int offset_out, const Dtype* bottom_data, const int offset_in,
+		Dtype* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
-	Dtype negative_slope);
+		Dtype negative_slope);
 
-template<typename Dtype>
+template <typename Dtype>
 void ReLUBackward(const int count, const Dtype* top_diff,
-	const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
+		const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void DropoutForward(const int count, const Dtype* bottom_data,
-	const int* MaskMem, const Dtype scale_, Dtype *top_data);
+		const int* MaskMem, const Dtype scale_, Dtype *top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
-	const float threshold_, const Dtype scale_, Dtype* bottom_diff);
+		const float threshold_, const Dtype scale_, Dtype* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
-	Dtype threshold);
+		Dtype threshold);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_max(const int num, const int channels,
-	const int spatial_dim, const Dtype* data, Dtype* out);
+		const int spatial_dim, const Dtype* data, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_subtract(const int count,
-	const int num, const int channels,
-	const int spatial_dim, const Dtype* channel_max, Dtype* data);
+		const int num, const int channels,
+		const int spatial_dim, const Dtype* channel_max, Dtype* data);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
-	Dtype* out);
+		Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_log(const int count, const Dtype* data, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_add_scalar(const int count, const Dtype data, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_exp(const int count, const Dtype* data, Dtype* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_sum(const int num, const int channels,
-	const int spatial_dim, const Dtype* data, Dtype* channel_sum);
+		const int spatial_dim, const Dtype* data, Dtype* channel_sum);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_div(const int count, const int num, const int channels,
-	const int spatial_dim, const Dtype* channel_sum, Dtype* data);
+		const int spatial_dim, const Dtype* channel_sum, Dtype* data);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_dot(const int num, const int channels,
-	const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
-	Dtype* channel_dot);
+		const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+		Dtype* channel_dot);
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLossForwardGPU(const int nthreads,
-	const Dtype* prob_data, const Dtype* label, Dtype* loss,
-	const int num, const int dim, const int spatial_dim,
-	const bool has_ignore_label_, const int ignore_label_,
-	Dtype* counts);
+		const Dtype* prob_data, const Dtype* label, Dtype* loss,
+		const int num, const int dim, const int spatial_dim,
+		const bool has_ignore_label_, const int ignore_label_,
+		Dtype* counts);
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
-	const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
-	const int spatial_dim, const bool has_ignore_label_,
-	const int ignore_label_, Dtype* counts);
+		const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+		const int spatial_dim, const bool has_ignore_label_,
+		const int ignore_label_, Dtype* counts);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data);
 
 template <typename Dtype>
 void LRNFillScale(const int nthreads, const Dtype* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype alpha_over_size,
-    const Dtype k, Dtype* const scale);
+		const int num, const int channels, const int height,
+		const int width, const int size, const Dtype alpha_over_size,
+		const Dtype k, Dtype* const scale);
 
 template <typename Dtype>
 void LRNComputeOutput(int nthreads, const Dtype* in,
-     Dtype* scale, Dtype negative_beta, Dtype* out);
+		Dtype* scale, Dtype negative_beta, Dtype* out);
 
 template <typename Dtype>
 void LRNComputeDiff(const int nthreads,
-    const Dtype* const bottom_data, const Dtype* const top_data,
-    const Dtype* const scale, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype negative_beta,
-    const Dtype cache_ratio, Dtype* const bottom_diff);
+		const Dtype* const bottom_data, const Dtype* const top_data,
+		const Dtype* const scale, const Dtype* const top_diff,
+		const int num, const int channels, const int height,
+		const int width, const int size, const Dtype negative_beta,
+		const Dtype cache_ratio, Dtype* const bottom_diff);
 template <typename Dtype>
-void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y);
+void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y);
+void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y);
 
 template <typename Dtype>
-void  BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
+void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
 
 template <typename Dtype>
-void  BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff);
+void BNLLBackward(const int count, const Dtype* top_diff,
+		const Dtype* bottom_data, Dtype *bottom_diff);
 
 template <typename Dtype>
-void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int  concat_size,
-        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data);
+void Concat(const int nthreads, const Dtype* in_data, const bool forward,
+		const int num_concats, const int concat_size,
+		const int top_concat_axis, const int bottom_concat_axis,
+		const int offset_concat_axis, Dtype *out_data);
 
 template <typename Dtype>
 void CLLBackward(const int count, const int channels,
-	const Dtype margin, const bool legacy_version, const Dtype alpha,
-	const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
-	Dtype *bottom_diff);
+		const Dtype margin, const bool legacy_version, const Dtype alpha,
+		const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
+		Dtype *bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void MaxForward(const int nthreads, const Dtype* bottom_data_a,
-	const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
-	int* mask);
+		const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+		int* mask);
 
-template<typename Dtype>
+template <typename Dtype>
 void MaxBackward(const int nthreads, const Dtype* top_diff,
-	const int blob_idx, const int* mask, Dtype* bottom_diff);
+		const int blob_idx, const int* mask, Dtype* bottom_diff);
 }
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
 // namespace caffe
diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp
index b59d9a67..7688e16a 100644
--- a/include/caffe/util/rng.hpp
+++ b/include/caffe/util/rng.hpp
@@ -18,9 +18,9 @@ inline rng_t* caffe_rng() {
 }
 
 // Fisher–Yates algorithm
-template<class RandomAccessIterator, class RandomGenerator>
+template <class RandomAccessIterator, class RandomGenerator>
 inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end,
-	RandomGenerator* gen) {
+		RandomGenerator* gen) {
 	typedef typename std::iterator_traits<RandomAccessIterator>::difference_type
 	difference_type;
 	typedef typename boost::uniform_int<difference_type> dist_type;
@@ -35,7 +35,7 @@ inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end,
 	}
 }
 
-template<class RandomAccessIterator>
+template <class RandomAccessIterator>
 inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end) {
 	shuffle(begin, end, caffe_rng());
 }
diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp
index d140e029..2dc3cceb 100644
--- a/include/caffe/util/upgrade_proto.hpp
+++ b/include/caffe/util/upgrade_proto.hpp
@@ -23,11 +23,11 @@ bool UpgradeV0Net(const NetParameter& v0_net_param, NetParameter* net_param);
 // taking its top blob as input.
 // Error if any of these above layers are not-conv layers.
 void UpgradeV0PaddingLayers(const NetParameter& param,
-	NetParameter* param_upgraded_pad);
+		NetParameter* param_upgraded_pad);
 
 // Upgrade a single V0LayerConnection to the V1LayerParameter format.
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-	V1LayerParameter* layer_param);
+		V1LayerParameter* layer_param);
 
 V1LayerParameter_LayerType UpgradeV0LayerType(const string& type);
 
@@ -46,7 +46,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param);
 bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param);
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-	LayerParameter* layer_param);
+		LayerParameter* layer_param);
 
 const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type);
 
@@ -55,9 +55,9 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param);
 
 // Read parameters from a file into a NetParameter proto message.
 void ReadNetParamsFromTextFileOrDie(const string& param_file,
-	NetParameter* param);
+		NetParameter* param);
 void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-	NetParameter* param);
+		NetParameter* param);
 
 }  // namespace caffe
 
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index eb959190..0c954fa2 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -20,17 +20,18 @@ namespace caffe {
  * @brief Abstract base class that factors out the BLAS code common to
  *        ConvolutionLayer and DeconvolutionLayer.
  */
-template<typename Dtype>
+template <typename Dtype>
 class BaseConvolutionLayer: public Layer<Dtype> {
 	public:
 		explicit BaseConvolutionLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual ~BaseConvolutionLayer();
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline int MinBottomBlobs() const {
 			return 1;
@@ -47,31 +48,31 @@ class BaseConvolutionLayer: public Layer<Dtype> {
 		// The last argument in forward_cpu_gemm is so that we can skip the im2col if
 		// we just called weight_cpu_gemm with the same input.
 		void forward_cpu_gemm(const Dtype* input, const Dtype* weights,
-			Dtype* output, bool skip_im2col = false);
+				Dtype* output, bool skip_im2col = false);
 		void forward_cpu_bias(Dtype* output, const Dtype* bias);
 		void backward_cpu_gemm(const Dtype* input, const Dtype* weights,
-			Dtype* output);
+				Dtype* output);
 		void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
-			weights);
+				weights);
 		void backward_cpu_bias(Dtype* bias, const Dtype* input);
 		//opencl related setup
 		void ocl_setup();
 
 #ifndef CPU_ONLY
 		void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
-			Dtype* output, bool skip_im2col = false);
+				Dtype* output, bool skip_im2col = false);
 		void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights,
-			Dtype* output, bool skip_im2col = false);
+				Dtype* output, bool skip_im2col = false);
 		void forward_gpu_bias(Dtype* output, const Dtype* bias);
 		void forward_gpu_bias_opt(Dtype* output, const Dtype* bias);
 		void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
-			Dtype* col_output);
+				Dtype* col_output);
 		void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights,
-			Dtype* col_output);
+				Dtype* col_output);
 		void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
-			weights);
+				weights);
 		void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype*
-			weights);
+				weights);
 		void backward_gpu_bias(Dtype* bias, const Dtype* input);
 		#endif
 
@@ -97,44 +98,44 @@ class BaseConvolutionLayer: public Layer<Dtype> {
 		// wrap im2col/col2im so we don't have to remember the (long) argument lists
 		inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
 			im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
-				kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
+					kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
 		}
 		inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
 			col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
-				kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
+					kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
 		}
 #ifndef CPU_ONLY
 		inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
 			im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_,
-				conv_in_width_,
-				kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff,
-				0);
+					conv_in_width_,
+					kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff,
+					0);
 		}
 		inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
 			col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_,
-				conv_in_width_,
-				kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data,
-				bottom_offset_);
+					conv_in_width_,
+					kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data,
+					bottom_offset_);
 		}
 	protected:
 		inline void conv_im2col_gpu_opt(const Dtype* data) {
 			im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_,
-				conv_in_width_,
-				kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, opt_num2);
+					conv_in_width_,
+					kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, opt_num2);
 		}
 		inline void conv_col2im_gpu_opt(Dtype* data) {
 			col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_,
-				conv_in_width_,
-				kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
+					conv_in_width_,
+					kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
 		}
 	private:
 		inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
 			transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_,
-				M_ * opt_num2, opt_num2);
+					M_ * opt_num2, opt_num2);
 		}
 		inline void conv_transpose_gpu(const Dtype* data) {
 			opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
-				opt_num2);
+					opt_num2);
 		}
 	protected:
 		inline void gpu_memset(Dtype* data, Dtype value, int count) {
@@ -182,7 +183,7 @@ class BaseConvolutionLayer: public Layer<Dtype> {
  *   be filtered. col2im restores the output spatial structure by rolling up
  *   the output channel N' columns of the output matrix.
  */
-template<typename Dtype>
+template <typename Dtype>
 class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
 	public:
 		/**
@@ -214,7 +215,8 @@ class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
 		 *    kernels + stream parallelism) engines.
 		 */
 		explicit ConvolutionLayer(const LayerParameter& param)
-			: BaseConvolutionLayer<Dtype>(param) {
+		:
+				BaseConvolutionLayer<Dtype>(param) {
 		}
 
 		virtual inline const char* type() const {
@@ -223,26 +225,26 @@ class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual inline bool reverse_dimensions() {
 			return false;
 		}
 		virtual void compute_output_shape();
 
 		virtual void Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -259,11 +261,12 @@ class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
  *   padding is removed from the output rather than added to the input, and
  *   stride results in upsampling rather than downsampling).
  */
-template<typename Dtype>
+template <typename Dtype>
 class DeconvolutionLayer: public BaseConvolutionLayer<Dtype> {
 	public:
 		explicit DeconvolutionLayer(const LayerParameter& param)
-			: BaseConvolutionLayer<Dtype>(param) {
+		:
+				BaseConvolutionLayer<Dtype>(param) {
 		}
 
 		virtual inline const char* type() const {
@@ -272,13 +275,13 @@ class DeconvolutionLayer: public BaseConvolutionLayer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual inline bool reverse_dimensions() {
 			return true;
 		}
@@ -306,16 +309,16 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
 	explicit CuDNNConvolutionLayer(const LayerParameter& param)
 	: ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
 	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual ~CuDNNConvolutionLayer();
 
 	protected:
 	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 	bool handles_setup_;
 	cudnnHandle_t* handle_;
@@ -337,16 +340,17 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class Im2colLayer: public Layer<Dtype> {
 	public:
 		explicit Im2colLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Im2col";
@@ -360,13 +364,13 @@ class Im2colLayer: public Layer<Dtype> {
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		int kernel_h_, kernel_w_;
 		int stride_h_, stride_w_;
@@ -376,8 +380,8 @@ class Im2colLayer: public Layer<Dtype> {
 };
 
 // Forward declare PoolingLayer and SplitLayer for use in LRNLayer.
-template<typename Dtype> class PoolingLayer;
-template<typename Dtype> class SplitLayer;
+template <typename Dtype> class PoolingLayer;
+template <typename Dtype> class SplitLayer;
 
 /**
  * @brief Normalize the input in a local region across or within feature maps.
@@ -385,73 +389,81 @@ template<typename Dtype> class SplitLayer;
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
 template <typename Dtype>
-class LRNLayer : public Layer<Dtype> {
- public:
-  explicit LRNLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
-  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-
-  virtual inline const char* type() const { return "LRN"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
-
- protected:
-  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
-  virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-  virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-  int size_;
-  int pre_pad_;
-  Dtype alpha_;
-  Dtype beta_;
-  Dtype k_;
-  int num_;
-  int channels_;
-  int height_;
-  int width_;
-
-  // Fields used for normalization ACROSS_CHANNELS
-  // scale_ stores the intermediate summing results
-  Blob<Dtype> scale_;
-
-  // Fields used for normalization WITHIN_CHANNEL
-  shared_ptr<SplitLayer<Dtype> > split_layer_;
-  vector<Blob<Dtype>*> split_top_vec_;
-  shared_ptr<PowerLayer<Dtype> > square_layer_;
-  Blob<Dtype> square_input_;
-  Blob<Dtype> square_output_;
-  vector<Blob<Dtype>*> square_bottom_vec_;
-  vector<Blob<Dtype>*> square_top_vec_;
-  shared_ptr<PoolingLayer<Dtype> > pool_layer_;
-  Blob<Dtype> pool_output_;
-  vector<Blob<Dtype>*> pool_top_vec_;
-  shared_ptr<PowerLayer<Dtype> > power_layer_;
-  Blob<Dtype> power_output_;
-  vector<Blob<Dtype>*> power_top_vec_;
-  shared_ptr<EltwiseLayer<Dtype> > product_layer_;
-  Blob<Dtype> product_input_;
-  vector<Blob<Dtype>*> product_bottom_vec_;
+class LRNLayer: public Layer<Dtype> {
+	public:
+		explicit LRNLayer(const LayerParameter& param)
+		:
+				Layer<Dtype>(param) {
+		}
+		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+				const vector<Blob<Dtype>*>& top);
+		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+				const vector<Blob<Dtype>*>& top);
+
+		virtual inline const char* type() const {
+			return "LRN";
+		}
+		virtual inline int ExactNumBottomBlobs() const {
+			return 1;
+		}
+		virtual inline int ExactNumTopBlobs() const {
+			return 1;
+		}
+
+	protected:
+		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+				const vector<Blob<Dtype>*>& top);
+		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+				const vector<Blob<Dtype>*>& top);
+		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
+				const vector<Blob<Dtype>*>& top);
+		virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
+				const vector<Blob<Dtype>*>& top);
+		virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
+				const vector<Blob<Dtype>*>& top);
+		virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+		virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+		int size_;
+		int pre_pad_;
+		Dtype alpha_;
+		Dtype beta_;
+		Dtype k_;
+		int num_;
+		int channels_;
+		int height_;
+		int width_;
+
+		// Fields used for normalization ACROSS_CHANNELS
+		// scale_ stores the intermediate summing results
+		Blob<Dtype> scale_;
+
+		// Fields used for normalization WITHIN_CHANNEL
+		shared_ptr<SplitLayer<Dtype> > split_layer_;
+		vector<Blob<Dtype>*> split_top_vec_;
+		shared_ptr<PowerLayer<Dtype> > square_layer_;
+		Blob<Dtype> square_input_;
+		Blob<Dtype> square_output_;
+		vector<Blob<Dtype>*> square_bottom_vec_;
+		vector<Blob<Dtype>*> square_top_vec_;
+		shared_ptr<PoolingLayer<Dtype> > pool_layer_;
+		Blob<Dtype> pool_output_;
+		vector<Blob<Dtype>*> pool_top_vec_;
+		shared_ptr<PowerLayer<Dtype> > power_layer_;
+		Blob<Dtype> power_output_;
+		vector<Blob<Dtype>*> power_top_vec_;
+		shared_ptr<EltwiseLayer<Dtype> > product_layer_;
+		Blob<Dtype> product_input_;
+		vector<Blob<Dtype>*> product_bottom_vec_;
 
 };
 
@@ -460,16 +472,17 @@ class LRNLayer : public Layer<Dtype> {
  *
  * TODO(dox): thorough documentation for Forward, Backward, and proto params.
  */
-template<typename Dtype>
+template <typename Dtype>
 class PoolingLayer: public Layer<Dtype> {
 	public:
 		explicit PoolingLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "Pooling";
@@ -484,18 +497,18 @@ class PoolingLayer: public Layer<Dtype> {
 		// others can only output the pooled inputs.
 		virtual inline int MaxTopBlobs() const {
 			return (this->layer_param_.pooling_param().pool() ==
-				PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+					PoolingParameter_PoolMethod_MAX) ? 2 : 1;
 		}
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 		int kernel_h_, kernel_w_;
 		int stride_h_, stride_w_;
@@ -520,9 +533,9 @@ class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
 	explicit CuDNNPoolingLayer(const LayerParameter& param)
 	: PoolingLayer<Dtype>(param), handles_setup_(false) {}
 	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual ~CuDNNPoolingLayer();
 	// Currently, cuDNN does not support the extra top blob.
 	virtual inline int MinTopBlobs() const {return -1;}
@@ -530,9 +543,9 @@ class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
 
 	protected:
 	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top);
+			const vector<Blob<Dtype>*>& top);
 	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
 	bool handles_setup_;
 	cudnnHandle_t handle_;
@@ -548,16 +561,17 @@ class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
  *        so that the result vector of different sized
  *        images are of the same size.
  */
-template<typename Dtype>
+template <typename Dtype>
 class SPPLayer: public Layer<Dtype> {
 	public:
 		explicit SPPLayer(const LayerParameter& param)
-			: Layer<Dtype>(param) {
+		:
+				Layer<Dtype>(param) {
 		}
 		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 
 		virtual inline const char* type() const {
 			return "SPP";
@@ -572,18 +586,18 @@ class SPPLayer: public Layer<Dtype> {
 		// others can only output the pooled inputs.
 		virtual inline int MaxTopBlobs() const {
 			return (this->layer_param_.pooling_param().pool() ==
-				PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+					PoolingParameter_PoolMethod_MAX) ? 2 : 1;
 		}
 
 	protected:
 		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
+				const vector<Blob<Dtype>*>& top);
 		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 		// calculates the kernel and stride dimensions for the pooling layer,
 		// returns a correctly configured LayerParameter for a PoolingLayer
 		virtual LayerParameter GetPoolingParam(const int pyramid_level,
-			const int bottom_h, const int bottom_w, const SPPParameter spp_param);
+				const int bottom_h, const int bottom_w, const SPPParameter spp_param);
 
 		int pyramid_height_;
 		int bottom_h_, bottom_w_;
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index e7d129bb..5e327c67 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -8,9 +8,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::Reshape(const int num, const int channels, const int height,
-	const int width) {
+		const int width) {
 	vector<int> shape(4);
 	shape[0] = num;
 	shape[1] = channels;
@@ -19,7 +19,7 @@ void Blob<Dtype>::Reshape(const int num, const int channels, const int height,
 	Reshape(shape);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::Reshape(const vector<int>& shape) {
 	CHECK_LE(shape.size(), kMaxBlobAxes);
 	count_ = 1;
@@ -37,7 +37,7 @@ void Blob<Dtype>::Reshape(const vector<int>& shape) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::Reshape(const BlobShape& shape) {
 	CHECK_LE(shape.dim_size(), kMaxBlobAxes);
 	vector<int> shape_vec(shape.dim_size());
@@ -47,93 +47,95 @@ void Blob<Dtype>::Reshape(const BlobShape& shape) {
 	Reshape(shape_vec);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::ReshapeLike(const Blob<Dtype>& other) {
 	Reshape(other.shape());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Blob<Dtype>::Blob(const int num, const int channels, const int height,
-	const int width)
-	// capacity_ must be initialized before calling Reshape
-	: capacity_(0) {
+		const int width)
+// capacity_ must be initialized before calling Reshape
+:
+		capacity_(0) {
 	Reshape(num, channels, height, width);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Blob<Dtype>::Blob(const vector<int>& shape)
-	// capacity_ must be initialized before calling Reshape
-	: capacity_(0) {
+// capacity_ must be initialized before calling Reshape
+:
+		capacity_(0) {
 	Reshape(shape);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const Dtype* Blob<Dtype>::cpu_data() const {
 	CHECK (data_);
 	return (const Dtype*) data_->cpu_data();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::set_cpu_data(Dtype* data) {
 	CHECK(data);
 	data_->set_cpu_data(data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const Dtype* Blob<Dtype>::gpu_data() const {
 	CHECK (data_);
 	return (const Dtype*) data_->gpu_data();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const Dtype* Blob<Dtype>::gpu_cache_data() const {
 	CHECK (data_);
 	return (const Dtype*) data_->gpu_cache_data();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const Dtype* Blob<Dtype>::cpu_diff() const {
 	CHECK (diff_);
 	return (const Dtype*) diff_->cpu_data();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const Dtype* Blob<Dtype>::gpu_diff() const {
 	CHECK (diff_);
 	return (const Dtype*) diff_->gpu_data();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype* Blob<Dtype>::mutable_cpu_data() {
 	CHECK (data_);
 	return static_cast<Dtype*>(data_->mutable_cpu_data());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype* Blob<Dtype>::mutable_gpu_data() {
 	CHECK (data_);
 	return static_cast<Dtype*>(data_->mutable_gpu_data());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype* Blob<Dtype>::mutable_cpu_diff() {
 	CHECK (diff_);
 	return static_cast<Dtype*>(diff_->mutable_cpu_data());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype* Blob<Dtype>::mutable_gpu_diff() {
 	CHECK (diff_);
 	return static_cast<Dtype*>(diff_->mutable_gpu_data());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::ShareData(const Blob& other) {
 	CHECK_EQ(count_, other.count());
 	data_ = other.data();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::ShareDiff(const Blob& other) {
 	CHECK_EQ(count_, other.count());
 	diff_ = other.diff();
@@ -142,30 +144,30 @@ void Blob<Dtype>::ShareDiff(const Blob& other) {
 // The "update" method is used for parameter blobs in a Net, which are stored
 // as Blob<float> or Blob<double> -- hence we do not define it for
 // Blob<int> or Blob<unsigned int>.
-template<> void Blob<unsigned int>::Update() {
+template <> void Blob<unsigned int>::Update() {
 	NOT_IMPLEMENTED;
 }
-template<> void Blob<int>::Update() {
+template <> void Blob<int>::Update() {
 	NOT_IMPLEMENTED;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::Update() {
 	// We will perform update based on where the data is located.
 	switch (data_->head()) {
 		case SyncedMemory::HEAD_AT_CPU:
 			// perform computation on CPU
 			caffe_axpy < Dtype > (count_, Dtype(-1),
-				static_cast<const Dtype*>(diff_->cpu_data()),
-				static_cast<Dtype*>(data_->mutable_cpu_data()));
+					static_cast<const Dtype*>(diff_->cpu_data()),
+					static_cast<Dtype*>(data_->mutable_cpu_data()));
 			break;
 		case SyncedMemory::HEAD_AT_GPU:
 			case SyncedMemory::SYNCED:
 			#ifndef CPU_ONLY
 			// perform computation on GPU
 			caffe_gpu_axpy < Dtype > (count_, Dtype(-1),
-				static_cast<const Dtype*>(diff_->gpu_data()),
-				static_cast<Dtype*>(data_->mutable_gpu_data()));
+					static_cast<const Dtype*>(diff_->gpu_data()),
+					static_cast<Dtype*>(data_->mutable_gpu_data()));
 #else
 			NO_GPU;
 #endif
@@ -175,17 +177,17 @@ void Blob<Dtype>::Update() {
 	}
 }
 
-template<> unsigned int Blob<unsigned int>::asum_data() const {
+template <> unsigned int Blob<unsigned int>::asum_data() const {
 	NOT_IMPLEMENTED;
 	return 0;
 }
 
-template<> int Blob<int>::asum_data() const {
+template <> int Blob<int>::asum_data() const {
 	NOT_IMPLEMENTED;
 	return 0;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype Blob<Dtype>::asum_data() const {
 	if (!data_) {
 		return 0;
@@ -212,17 +214,17 @@ Dtype Blob<Dtype>::asum_data() const {
 	return 0;
 }
 
-template<> unsigned int Blob<unsigned int>::asum_diff() const {
+template <> unsigned int Blob<unsigned int>::asum_diff() const {
 	NOT_IMPLEMENTED;
 	return 0;
 }
 
-template<> int Blob<int>::asum_diff() const {
+template <> int Blob<int>::asum_diff() const {
 	NOT_IMPLEMENTED;
 	return 0;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype Blob<Dtype>::asum_diff() const {
 	if (!diff_) {
 		return 0;
@@ -249,17 +251,17 @@ Dtype Blob<Dtype>::asum_diff() const {
 	return 0;
 }
 
-template<> unsigned int Blob<unsigned int>::sumsq_data() const {
+template <> unsigned int Blob<unsigned int>::sumsq_data() const {
 	NOT_IMPLEMENTED;
 	return 0;
 }
 
-template<> int Blob<int>::sumsq_data() const {
+template <> int Blob<int>::sumsq_data() const {
 	NOT_IMPLEMENTED;
 	return 0;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype Blob<Dtype>::sumsq_data() const {
 	Dtype sumsq;
 	const Dtype* data;
@@ -288,17 +290,17 @@ Dtype Blob<Dtype>::sumsq_data() const {
 	return sumsq;
 }
 
-template<> unsigned int Blob<unsigned int>::sumsq_diff() const {
+template <> unsigned int Blob<unsigned int>::sumsq_diff() const {
 	NOT_IMPLEMENTED;
 	return 0;
 }
 
-template<> int Blob<int>::sumsq_diff() const {
+template <> int Blob<int>::sumsq_diff() const {
 	NOT_IMPLEMENTED;
 	return 0;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype Blob<Dtype>::sumsq_diff() const {
 	Dtype sumsq;
 	const Dtype* diff;
@@ -327,15 +329,15 @@ Dtype Blob<Dtype>::sumsq_diff() const {
 	return sumsq;
 }
 
-template<> void Blob<unsigned int>::scale_data(unsigned int scale_factor) {
+template <> void Blob<unsigned int>::scale_data(unsigned int scale_factor) {
 	NOT_IMPLEMENTED;
 }
 
-template<> void Blob<int>::scale_data(int scale_factor) {
+template <> void Blob<int>::scale_data(int scale_factor) {
 	NOT_IMPLEMENTED;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::scale_data(Dtype scale_factor) {
 	Dtype* data;
 	if (!data_) {
@@ -362,15 +364,15 @@ void Blob<Dtype>::scale_data(Dtype scale_factor) {
 	}
 }
 
-template<> void Blob<unsigned int>::scale_diff(unsigned int scale_factor) {
+template <> void Blob<unsigned int>::scale_diff(unsigned int scale_factor) {
 	NOT_IMPLEMENTED;
 }
 
-template<> void Blob<int>::scale_diff(int scale_factor) {
+template <> void Blob<int>::scale_diff(int scale_factor) {
 	NOT_IMPLEMENTED;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::scale_diff(Dtype scale_factor) {
 	Dtype* diff;
 	if (!diff_) {
@@ -397,10 +399,10 @@ void Blob<Dtype>::scale_diff(Dtype scale_factor) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 bool Blob<Dtype>::ShapeEquals(const BlobProto& other) {
 	if (other.has_num() || other.has_channels() ||
-		other.has_height() || other.has_width()) {
+			other.has_height() || other.has_width()) {
 		// Using deprecated 4D Blob dimensions --
 		// shape is (num, channels, height, width).
 		// Note: we do not use the normal Blob::num(), Blob::channels(), etc.
@@ -408,10 +410,10 @@ bool Blob<Dtype>::ShapeEquals(const BlobProto& other) {
 		// parameter blobs were indexed from the end of the blob shape (e.g., bias
 		// Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)).
 		return shape_.size() <= 4 &&
-			LegacyShape(-4) == other.num() &&
-			LegacyShape(-3) == other.channels() &&
-			LegacyShape(-2) == other.height() &&
-			LegacyShape(-1) == other.width();
+				LegacyShape(-4) == other.num() &&
+				LegacyShape(-3) == other.channels() &&
+				LegacyShape(-2) == other.height() &&
+				LegacyShape(-1) == other.width();
 	}
 	vector<int> other_shape(other.shape().dim_size());
 	for (int i = 0; i < other.shape().dim_size(); ++i) {
@@ -420,7 +422,7 @@ bool Blob<Dtype>::ShapeEquals(const BlobProto& other) {
 	return shape_ == other_shape;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
 	if (source.count() != count_ || source.shape() != shape_) {
 		if (reshape) {
@@ -433,19 +435,19 @@ void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
 		case Caffe::GPU:
 			if (copy_diff) {
 				caffe_copy(count_, source.gpu_diff(),
-					static_cast<Dtype*>(diff_->mutable_gpu_data()));
+						static_cast<Dtype*>(diff_->mutable_gpu_data()));
 			} else {
 				caffe_copy(count_, source.gpu_data(),
-					static_cast<Dtype*>(data_->mutable_gpu_data()));
+						static_cast<Dtype*>(data_->mutable_gpu_data()));
 			}
 			break;
 		case Caffe::CPU:
 			if (copy_diff) {
 				caffe_copy(count_, source.cpu_diff(),
-					static_cast<Dtype*>(diff_->mutable_cpu_data()));
+						static_cast<Dtype*>(diff_->mutable_cpu_data()));
 			} else {
 				caffe_copy(count_, source.cpu_data(),
-					static_cast<Dtype*>(data_->mutable_cpu_data()));
+						static_cast<Dtype*>(data_->mutable_cpu_data()));
 			}
 			break;
 		default:
@@ -453,12 +455,12 @@ void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::FromProto(const BlobProto& proto, bool reshape) {
 	if (reshape) {
 		vector<int> shape;
 		if (proto.has_num() || proto.has_channels() ||
-			proto.has_height() || proto.has_width()) {
+				proto.has_height() || proto.has_width()) {
 			// Using deprecated 4D Blob dimensions --
 			// shape is (num, channels, height, width).
 			shape.resize(4);
@@ -489,7 +491,7 @@ void Blob<Dtype>::FromProto(const BlobProto& proto, bool reshape) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Blob<Dtype>::ToProto(BlobProto* proto, bool write_diff) const {
 	proto->clear_shape();
 	for (int i = 0; i < shape_.size(); ++i) {
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 22e9059b..2157c96a 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -91,7 +91,7 @@ void* Caffe::RNG::generator() {
 
 Caffe::Caffe()
 {
-        amdDevice.Init();
+	amdDevice.Init();
 	cl_int err = clblasSetup();
 	if (err != CL_SUCCESS) {
 		LOG(ERROR) << "clBLAS setup failed " << err;
@@ -121,10 +121,12 @@ void Caffe::DeviceQuery() {
 class Caffe::RNG::Generator {
 	public:
 		Generator()
-			: rng_(new caffe::rng_t(cluster_seedgen())) {
+		:
+				rng_(new caffe::rng_t(cluster_seedgen())) {
 		}
 		explicit Generator(unsigned int seed)
-			: rng_(new caffe::rng_t(seed)) {
+		:
+				rng_(new caffe::rng_t(seed)) {
 		}
 		caffe::rng_t* rng() {
 			return rng_.get();
@@ -134,11 +136,13 @@ class Caffe::RNG::Generator {
 };
 
 Caffe::RNG::RNG()
-	: generator_(new Generator()) {
+:
+		generator_(new Generator()) {
 }
 
 Caffe::RNG::RNG(unsigned int seed)
-	: generator_(new Generator(seed)) {
+:
+		generator_(new Generator(seed)) {
 }
 
 Caffe::RNG& Caffe::RNG::operator=(const RNG& other) {
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index 892d758d..a041e126 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -10,14 +10,15 @@
 #include "caffe/util/benchmark.hpp"
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
-	Phase phase)
-	: param_(param), phase_(phase) {
+		Phase phase)
+:
+		param_(param), phase_(phase) {
 	// check if we want to use mean_file
 	if (param_.has_mean_file()) {
 		CHECK_EQ(param_.mean_value_size(), 0) <<
-			"Cannot specify mean_file and mean_value at the same time";
+				"Cannot specify mean_file and mean_value at the same time";
 		const string& mean_file = param.mean_file();
 		LOG(INFO) << "Loading mean file from: " << mean_file;
 		BlobProto blob_proto;
@@ -27,16 +28,16 @@ DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
 	// check if we want to use mean_value
 	if (param_.mean_value_size() > 0) {
 		CHECK(param_.has_mean_file() == false) <<
-			"Cannot specify mean_file and mean_value at the same time";
+				"Cannot specify mean_file and mean_value at the same time";
 		for (int c = 0; c < param_.mean_value_size(); ++c) {
 			mean_values_.push_back(param_.mean_value(c));
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
-	Dtype* transformed_data) {
+		Dtype* transformed_data) {
 	const string& data = datum.data();
 	const int datum_channels = datum.channels();
 	const int datum_height = datum.height();
@@ -62,7 +63,8 @@ void DataTransformer<Dtype>::Transform(const Datum& datum,
 	}
 	if (has_mean_values) {
 		CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) <<
-			"Specify either 1 mean_value or as many as channels: " << datum_channels;
+				"Specify either 1 mean_value or as many as channels: "
+				<< datum_channels;
 		if (datum_channels > 1 && mean_values_.size() == 1) {
 			// Replicate the mean_value for simplicity
 			for (int c = 1; c < datum_channels; ++c) {
@@ -102,17 +104,17 @@ void DataTransformer<Dtype>::Transform(const Datum& datum,
 				}
 				if (has_uint8) {
 					datum_element =
-						static_cast<Dtype>(static_cast<uint8_t>(data[data_index]));
+							static_cast<Dtype>(static_cast<uint8_t>(data[data_index]));
 				} else {
 					datum_element = datum.float_data(data_index);
 				}
 				if (has_mean_file) {
 					transformed_data[top_index] =
-						(datum_element - mean[data_index]) * scale;
+							(datum_element - mean[data_index]) * scale;
 				} else {
 					if (has_mean_values) {
 						transformed_data[top_index] =
-							(datum_element - mean_values_[c]) * scale;
+								(datum_element - mean_values_[c]) * scale;
 					} else {
 						transformed_data[top_index] = datum_element * scale;
 					}
@@ -122,14 +124,14 @@ void DataTransformer<Dtype>::Transform(const Datum& datum,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
-	Blob<Dtype>* transformed_blob) {
+		Blob<Dtype>* transformed_blob) {
 
 	// If datum is encoded, decoded and transform the cv::image.
 	if (datum.encoded()) {
 		CHECK(!(param_.force_color() && param_.force_gray()))
-			<< "cannot set both force_color and force_gray";
+				<< "cannot set both force_color and force_gray";
 		cv::Mat cv_img;
 		if (param_.force_color() || param_.force_gray()) {
 			// If force_color then decode in color otherwise decode in gray.
@@ -173,9 +175,9 @@ void DataTransformer<Dtype>::Transform(const Datum& datum,
 	Transform(datum, transformed_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const vector<Datum> & datum_vector,
-	Blob<Dtype>* transformed_blob) {
+		Blob<Dtype>* transformed_blob) {
 	const int datum_num = datum_vector.size();
 	const int num = transformed_blob->num();
 	const int channels = transformed_blob->channels();
@@ -183,8 +185,9 @@ void DataTransformer<Dtype>::Transform(const vector<Datum> & datum_vector,
 	const int width = transformed_blob->width();
 
 	CHECK_GT(datum_num, 0) << "There is no datum to add";
-	CHECK_LE(datum_num, num) <<
-		"The size of datum_vector must be no greater than transformed_blob->num()";
+	CHECK_LE(datum_num, num)
+			<<
+			"The size of datum_vector must be no greater than transformed_blob->num()";
 	Blob < Dtype > uni_blob(1, channels, height, width);
 	for (int item_id = 0; item_id < datum_num; ++item_id) {
 		int offset = transformed_blob->offset(item_id);
@@ -193,9 +196,9 @@ void DataTransformer<Dtype>::Transform(const vector<Datum> & datum_vector,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const vector<cv::Mat> & mat_vector,
-	Blob<Dtype>* transformed_blob) {
+		Blob<Dtype>* transformed_blob) {
 	const int mat_num = mat_vector.size();
 	const int num = transformed_blob->num();
 	const int channels = transformed_blob->channels();
@@ -204,7 +207,7 @@ void DataTransformer<Dtype>::Transform(const vector<cv::Mat> & mat_vector,
 
 	CHECK_GT(mat_num, 0) << "There is no MAT to add";
 	CHECK_EQ(mat_num, num) <<
-		"The size of mat_vector must be equals to transformed_blob->num()";
+			"The size of mat_vector must be equals to transformed_blob->num()";
 	Blob < Dtype > uni_blob(1, channels, height, width);
 	for (int item_id = 0; item_id < mat_num; ++item_id) {
 		int offset = transformed_blob->offset(item_id);
@@ -213,9 +216,9 @@ void DataTransformer<Dtype>::Transform(const vector<cv::Mat> & mat_vector,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img,
-	Blob<Dtype>* transformed_blob) {
+		Blob<Dtype>* transformed_blob) {
 	const int crop_size = param_.crop_size();
 	const int img_channels = cv_img.channels();
 	const int img_height = cv_img.rows;
@@ -252,7 +255,7 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img,
 	}
 	if (has_mean_values) {
 		CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) <<
-			"Specify either 1 mean_value or as many as channels: " << img_channels;
+				"Specify either 1 mean_value or as many as channels: " << img_channels;
 		if (img_channels > 1 && mean_values_.size() == 1) {
 			// Replicate the mean_value for simplicity
 			for (int c = 1; c < img_channels; ++c) {
@@ -301,11 +304,11 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img,
 				if (has_mean_file) {
 					int mean_index = (c * img_height + h_off + h) * img_width + w_off + w;
 					transformed_data[top_index] =
-						(pixel - mean[mean_index]) * scale;
+							(pixel - mean[mean_index]) * scale;
 				} else {
 					if (has_mean_values) {
 						transformed_data[top_index] =
-							(pixel - mean_values_[c]) * scale;
+								(pixel - mean_values_[c]) * scale;
 					} else {
 						transformed_data[top_index] = pixel * scale;
 					}
@@ -315,9 +318,9 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
-	Blob<Dtype>* transformed_blob) {
+		Blob<Dtype>* transformed_blob) {
 	const int crop_size = param_.crop_size();
 	const int input_num = input_blob->num();
 	const int input_channels = input_blob->channels();
@@ -328,10 +331,10 @@ void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
 		// Initialize transformed_blob with the right shape.
 		if (crop_size) {
 			transformed_blob->Reshape(input_num, input_channels,
-				crop_size, crop_size);
+					crop_size, crop_size);
 		} else {
 			transformed_blob->Reshape(input_num, input_channels,
-				input_height, input_width);
+					input_height, input_width);
 		}
 	}
 
@@ -377,13 +380,14 @@ void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
 		for (int n = 0; n < input_num; ++n) {
 			int offset = input_blob->offset(n);
 			caffe_sub(data_mean_.count(), input_data + offset,
-				data_mean_.cpu_data(), input_data + offset);
+					data_mean_.cpu_data(), input_data + offset);
 		}
 	}
 
 	if (has_mean_values) {
 		CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) <<
-			"Specify either 1 mean_value or as many as channels: " << input_channels;
+				"Specify either 1 mean_value or as many as channels: "
+				<< input_channels;
 		if (mean_values_.size() == 1) {
 			caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data);
 		} else {
@@ -391,7 +395,7 @@ void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
 				for (int c = 0; c < input_channels; ++c) {
 					int offset = input_blob->offset(n, c);
 					caffe_add_scalar(input_height * input_width, -(mean_values_[c]),
-						input_data + offset);
+							input_data + offset);
 				}
 			}
 		}
@@ -427,11 +431,11 @@ void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(const Datum& datum) {
 	if (datum.encoded()) {
 		CHECK(!(param_.force_color() && param_.force_gray()))
-			<< "cannot set both force_color and force_gray";
+				<< "cannot set both force_color and force_gray";
 		cv::Mat cv_img;
 		if (param_.force_color() || param_.force_gray()) {
 			// If force_color then decode in color otherwise decode in gray.
@@ -460,9 +464,9 @@ vector<int> DataTransformer<Dtype>::InferBlobShape(const Datum& datum) {
 	return shape;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(
-	const vector<Datum> & datum_vector) {
+		const vector<Datum> & datum_vector) {
 	const int num = datum_vector.size();
 	CHECK_GT(num, 0) << "There is no datum to in the vector";
 	// Use first datum in the vector to InferBlobShape.
@@ -472,7 +476,7 @@ vector<int> DataTransformer<Dtype>::InferBlobShape(
 	return shape;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(const cv::Mat& cv_img) {
 	const int crop_size = param_.crop_size();
 	const int img_channels = cv_img.channels();
@@ -491,9 +495,9 @@ vector<int> DataTransformer<Dtype>::InferBlobShape(const cv::Mat& cv_img) {
 	return shape;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(
-	const vector<cv::Mat> & mat_vector) {
+		const vector<cv::Mat> & mat_vector) {
 	const int num = mat_vector.size();
 	CHECK_GT(num, 0) << "There is no cv_img to in the vector";
 	// Use first cv_img in the vector to InferBlobShape.
@@ -503,10 +507,10 @@ vector<int> DataTransformer<Dtype>::InferBlobShape(
 	return shape;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DataTransformer<Dtype>::InitRand() {
 	const bool needs_rand = param_.mirror() ||
-		(phase_ == TRAIN && param_.crop_size());
+			(phase_ == TRAIN && param_.crop_size());
 	if (needs_rand) {
 		const unsigned int rng_seed = caffe_rng_rand();
 		rng_.reset(new Caffe::RNG(rng_seed));
@@ -515,12 +519,12 @@ void DataTransformer<Dtype>::InitRand() {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 int DataTransformer<Dtype>::Rand(int n) {
 	CHECK (rng_);
 	CHECK_GT(n, 0);
 	caffe::rng_t* rng =
-		static_cast<caffe::rng_t*>(rng_->generator());
+			static_cast<caffe::rng_t*>(rng_->generator());
 	return ((*rng)() % n);
 }
 
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 689f706e..9e53a66a 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -58,7 +58,7 @@ cl_int Device::Init(int deviceId) {
 
 	size_t nameLen;
 	cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64,
-		platformName, &nameLen);
+			platformName, &nameLen);
 	if (res != CL_SUCCESS) {
 		fprintf(stderr, "Err: Failed to Get Platform Info\n");
 		return 0;
@@ -75,13 +75,14 @@ cl_int Device::Init(int deviceId) {
 	} else {
 		pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id));
 		OCL_CHECK(
-			clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, pDevices,
-				&uiNumDevices));
+				clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices,
+						pDevices,
+						&uiNumDevices));
 		if (deviceId == -1) {
 			int i;
 			for (i = 0; i < (int) uiNumDevices; i++) {
 				clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY,
-					sizeof(cl_bool), &unified_memory, NULL);
+						sizeof(cl_bool), &unified_memory, NULL);
 				if (!unified_memory) { //skip iGPU
 					//we pick the first dGPU we found
 					pDevices[0] = pDevices[i];
@@ -108,9 +109,9 @@ cl_int Device::Init(int deviceId) {
 		return 0;
 	}
 	CommandQueue = clCreateCommandQueue(Context, pDevices[0],
-		CL_QUEUE_PROFILING_ENABLE, NULL);
+			CL_QUEUE_PROFILING_ENABLE, NULL);
 	CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0],
-		CL_QUEUE_PROFILING_ENABLE, NULL);
+			CL_QUEUE_PROFILING_ENABLE, NULL);
 	if (NULL == CommandQueue || NULL == CommandQueue_helper) {
 		fprintf(stderr, "Err: Failed to Create Commandqueue\n");
 		return 0;
@@ -122,12 +123,12 @@ cl_int Device::Init(int deviceId) {
 }
 
 void Device::BuildProgram(std::string kernel_dir)
-	{
+		{
 	std::string strSource = "";
 	DIR *ocl_dir;
 	struct dirent *dirp;
 	if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL)
-		{
+			{
 		fprintf(stderr, "Err: Open ocl dir failed!\n");
 	}
 	while ((dirp = readdir(ocl_dir)) != NULL)
@@ -152,18 +153,18 @@ void Device::BuildProgram(std::string kernel_dir)
 	uiArrSourceSize[0] = strlen(pSource);
 	Program = NULL;
 	Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize,
-		NULL);
+			NULL);
 	if (NULL == Program) {
 		fprintf(stderr, "Err: Failed to create program\n");
 	}
 	cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(),
-		NULL, NULL);
+			NULL, NULL);
 	LOG(INFO) << "Build Program";
 	if (CL_SUCCESS != iStatus) {
 		fprintf(stderr, "Err: Failed to build program\n");
 		char szBuildLog[16384];
 		clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG,
-			sizeof(szBuildLog), szBuildLog, NULL);
+				sizeof(szBuildLog), szBuildLog, NULL);
 		std::cout << szBuildLog;
 		clReleaseProgram (Program);
 	}
@@ -198,10 +199,10 @@ cl_int Device::ConvertToString(std::string pFileName, std::string &Str) {
 }
 
 cl_kernel Device::GetKernel(std::string kernel_name)
-	{
+		{
 	std::map<std::string, cl_kernel>::iterator it = Kernels.find(kernel_name);
 	if (it == Kernels.end())
-		{
+			{
 		cl_int _err = 0;
 		cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err);
 		OCL_CHECK(_err);
@@ -214,7 +215,7 @@ void Device::ReleaseKernels()
 {
 	std::map<std::string, cl_kernel>::iterator it;
 	for (it = Kernels.begin(); it != Kernels.end(); it++)
-		{
+			{
 		clReleaseKernel(it->second);
 	}
 }
@@ -224,16 +225,16 @@ void Device::DisplayPlatformInfo() {
 
 	err = clGetPlatformIDs(0, NULL, &numPlatforms);
 	if (err != CL_SUCCESS || numPlatforms <= 0)
-		{
+			{
 		LOG(ERROR) << "Failed to find any OpenCL platform.";
 		return;
 	}
 
 	platformIDs = (cl_platform_id *) malloc(
-		sizeof(cl_platform_id) * numPlatforms);
+			sizeof(cl_platform_id) * numPlatforms);
 	err = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
 	if (err != CL_SUCCESS)
-		{
+			{
 		LOG(ERROR) << "Failed to find any OpenCL platform.";
 		return;
 	}
@@ -247,19 +248,19 @@ void Device::DisplayPlatformInfo() {
 		DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION");
 		DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR");
 		DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS,
-			"CL_PLATFORM_EXTENSIONS");
+				"CL_PLATFORM_EXTENSIONS");
 	}
 
 }
 
 void Device::DisplayInfo(cl_platform_id id, cl_platform_info name,
-	std::string str) {
+		std::string str) {
 	cl_int err;
 	std::size_t paramValueSize;
 
 	err = clGetPlatformInfo(id, name, 0, NULL, &paramValueSize);
 	if (err != CL_SUCCESS)
-		{
+			{
 		LOG(ERROR) << "Failed to find OpenCL platform:" << str;
 		return;
 	}
@@ -267,7 +268,7 @@ void Device::DisplayInfo(cl_platform_id id, cl_platform_info name,
 	char * info = (char *) alloca(sizeof(char) * paramValueSize);
 	err = clGetPlatformInfo(id, name, paramValueSize, info, NULL);
 	if (err != CL_SUCCESS)
-		{
+			{
 		LOG(ERROR) << "Failed to find OpenCL platform:" << str;
 		return;
 	}
@@ -280,10 +281,10 @@ void Device::GetDeviceInfo() {
 	//by default, we select the first platform. can be extended for more platforms
 	//query GPU device for now
 	err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL,
-		&numDevices);
+			&numDevices);
 	// we allow program run if no GPU is found. Just return. No error reported.
 	if (numDevices < 1)
-		{
+			{
 		LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0];
 		LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0];
 		return;
@@ -291,9 +292,9 @@ void Device::GetDeviceInfo() {
 
 	DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices);
 	err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices,
-		DeviceIDs, NULL);
+			DeviceIDs, NULL);
 	if (err != CL_SUCCESS)
-		{
+			{
 		LOG(INFO) << "Failed to find any GPU devices.";
 		return;
 	}
@@ -302,35 +303,35 @@ void Device::GetDeviceInfo() {
 	for (cl_uint i = 0; i < numDevices; i++) {
 		LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i];
 		DisplayDeviceInfo < cl_device_type
-			> (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
+				> (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
 		DisplayDeviceInfo < cl_bool
-			> (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
+				> (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
 		DisplayDeviceInfo < cl_uint
-			> (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
+				> (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
 		DisplayDeviceInfo < cl_bool
-			> (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
+				> (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
 		DisplayDeviceInfo < cl_bool
-			> (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
+				> (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
 		DisplayDeviceInfo < cl_bool
-			> (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
+				> (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
 		DisplayDeviceInfo < cl_uint
-			> (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
+				> (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
 		DisplayDeviceInfo < size_t
-			> (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
+				> (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
 		DisplayDeviceInfo < cl_uint
-			> (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
+				> (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
 		DisplayDeviceInfo<size_t *>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES,
-			"Max work item sizes");
+				"Max work item sizes");
 		DisplayDeviceInfo < cl_command_queue_properties
-			> (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
+				> (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
 		DisplayDeviceInfo < cl_device_exec_capabilities
-			> (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
+				> (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
 		DisplayDeviceInfo < cl_ulong
-			> (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
+				> (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
 		DisplayDeviceInfo < cl_ulong
-			> (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
+				> (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
 		DisplayDeviceInfo < cl_ulong
-			> (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
+				> (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
 	}
 
 }
@@ -345,7 +346,7 @@ void Device::DeviceQuery()
 
 	size_t nameLen;
 	cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64,
-		platformName, &nameLen);
+			platformName, &nameLen);
 	if (res != CL_SUCCESS) {
 		fprintf(stderr, "Err: Failed to Get Platform Info\n");
 		return;
@@ -355,15 +356,15 @@ void Device::DeviceQuery()
 	GetDeviceInfo();
 }
 
-template<typename T>
+template <typename T>
 void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name,
-	std::string str) {
+		std::string str) {
 	cl_int err;
 	std::size_t paramValueSize;
 
 	err = clGetDeviceInfo(id, name, 0, NULL, &paramValueSize);
 	if (err != CL_SUCCESS)
-		{
+			{
 		LOG(ERROR) << "Failed to find OpenCL device info:" << str;
 		return;
 	}
@@ -372,7 +373,7 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name,
 	T * info = (T *) alloca(sizeof(T) * paramValueSize);
 	err = clGetDeviceInfo(id, name, paramValueSize, info, NULL);
 	if (err != CL_SUCCESS)
-		{
+			{
 		LOG(ERROR) << "Failed to find OpenCL device info:" << str;
 		return;
 	}
@@ -382,20 +383,20 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name,
 			{
 			std::string deviceType;
 			appendBitfield < cl_device_type
-				> (
-				*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType);
+					> (
+					*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType);
 
 			appendBitfield < cl_device_type
-				> (
-				*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType);
+					> (
+					*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType);
 
 			appendBitfield < cl_device_type
-				> (
-				*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType);
+					> (
+					*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType);
 
 			appendBitfield < cl_device_type
-				> (
-				*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType);
+					> (
+					*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType);
 
 			LOG(INFO) << "\t " << str << ":\t" << deviceType;
 		}
@@ -404,12 +405,12 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name,
 			{
 			std::string memType;
 			appendBitfield < cl_device_exec_capabilities
-				> (
-				*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType);
+					> (
+					*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType);
 
 			appendBitfield < cl_device_exec_capabilities
-				> (
-				*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType);
+					> (
+					*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType);
 
 			LOG(INFO) << "\t " << str << ":\t" << memType;
 
@@ -419,10 +420,10 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name,
 			{
 			std::string memType;
 			appendBitfield < cl_device_exec_capabilities
-				> (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType);
+					> (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType);
 
 			appendBitfield < cl_device_exec_capabilities
-				> (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType);
+					> (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType);
 
 			LOG(INFO) << "\t " << str << ":\t" << memType;
 		}
@@ -434,13 +435,13 @@ void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name,
 
 }
 
-template<typename T>
+template <typename T>
 void Device::appendBitfield(T info, T value, std::string name, std::string &str)
-	{
-	if (info & value)
 		{
-		if (str.length() > 0)
+	if (info & value)
 			{
+		if (str.length() > 0)
+				{
 			str.append(" | ");
 		}
 		str.append(name);
diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp
index 64f4fa6b..ba302ba8 100644
--- a/src/caffe/internal_thread.cpp
+++ b/src/caffe/internal_thread.cpp
@@ -17,7 +17,7 @@ bool InternalThread::StartInternalThread() {
 	}
 	try {
 		thread_.reset(
-			new boost::thread(&InternalThread::InternalThreadEntry, this));
+				new boost::thread(&InternalThread::InternalThreadEntry, this));
 	} catch (...) {
 		return false;
 	}
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index 4ff6e3d4..a720ee92 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -17,9 +17,9 @@
 namespace caffe {
 
 // Get convolution layer according to engine.
-template<typename Dtype>
+template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetConvolutionLayer(
-	const LayerParameter& param) {
+		const LayerParameter& param) {
 	ConvolutionParameter_Engine engine = param.convolution_param().engine();
 	if (engine == ConvolutionParameter_Engine_DEFAULT) {
 		engine = ConvolutionParameter_Engine_CAFFE;
@@ -41,7 +41,7 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer(
 REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer);
 
 // Get pooling layer according to engine.
-template<typename Dtype>
+template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
 	PoolingParameter_Engine engine = param.pooling_param().engine();
 	if (engine == PoolingParameter_Engine_DEFAULT) {
@@ -56,7 +56,7 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
 	} else if (engine == PoolingParameter_Engine_CUDNN) {
 		PoolingParameter p_param = param.pooling_param();
 		if (p_param.pad() || p_param.pad_h() || p_param.pad_w() ||
-			param.top_size() > 1) {
+				param.top_size() > 1) {
 			LOG(INFO) << "CUDNN does not support padding or multiple tops. "
 			<< "Using Caffe's own pooling layer.";
 			return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
@@ -71,7 +71,7 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
 REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer);
 
 // Get relu layer according to engine.
-template<typename Dtype>
+template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetReLULayer(const LayerParameter& param) {
 	ReLUParameter_Engine engine = param.relu_param().engine();
 	if (engine == ReLUParameter_Engine_DEFAULT) {
@@ -94,7 +94,7 @@ shared_ptr<Layer<Dtype> > GetReLULayer(const LayerParameter& param) {
 REGISTER_LAYER_CREATOR(ReLU, GetReLULayer);
 
 // Get sigmoid layer according to engine.
-template<typename Dtype>
+template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetSigmoidLayer(const LayerParameter& param) {
 	SigmoidParameter_Engine engine = param.sigmoid_param().engine();
 	if (engine == SigmoidParameter_Engine_DEFAULT) {
@@ -117,7 +117,7 @@ shared_ptr<Layer<Dtype> > GetSigmoidLayer(const LayerParameter& param) {
 REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer);
 
 // Get softmax layer according to engine.
-template<typename Dtype>
+template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetSoftmaxLayer(const LayerParameter& param) {
 	SoftmaxParameter_Engine engine = param.softmax_param().engine();
 	if (engine == SoftmaxParameter_Engine_DEFAULT) {
@@ -140,7 +140,7 @@ shared_ptr<Layer<Dtype> > GetSoftmaxLayer(const LayerParameter& param) {
 REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer);
 
 // Get tanh layer according to engine.
-template<typename Dtype>
+template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetTanHLayer(const LayerParameter& param) {
 	TanHParameter_Engine engine = param.tanh_param().engine();
 	if (engine == TanHParameter_Engine_DEFAULT) {
@@ -182,4 +182,4 @@ REGISTER_LAYER_CREATOR(Python, GetPythonLayer);
 // Layers that use their constructor as their default creator should be
 // registered in their corresponding cpp files. Do not register them here.
 }
-  // namespace caffe
+ // namespace caffe
diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index cd99296e..85faa8d3 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -6,25 +6,25 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void AbsValLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
 	CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not "
-		"allow in-place computation.";
+			"allow in-place computation.";
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void AbsValLayer<Dtype>::Forward_cpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	const int count = top[0]->count();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	caffe_abs(count, bottom[0]->cpu_data(), top_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const int count = top[0]->count();
 	const Dtype* top_diff = top[0]->cpu_diff();
 	if (propagate_down[0]) {
@@ -35,17 +35,17 @@ void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int count = top[0]->count();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const int count = top[0]->count();
 	const Dtype* top_diff = top[0]->gpu_diff();
 	if (propagate_down[0]) {
diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
index 82f92e27..a26839d4 100644
--- a/src/caffe/layers/accuracy_layer.cpp
+++ b/src/caffe/layers/accuracy_layer.cpp
@@ -10,39 +10,39 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void AccuracyLayer<Dtype>::LayerSetUp(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	top_k_ = this->layer_param_.accuracy_param().top_k();
 
 	has_ignore_label_ =
-		this->layer_param_.accuracy_param().has_ignore_label();
+			this->layer_param_.accuracy_param().has_ignore_label();
 	if (has_ignore_label_) {
 		ignore_label_ = this->layer_param_.accuracy_param().ignore_label();
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void AccuracyLayer<Dtype>::Reshape(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count())
-		<< "top_k must be less than or equal to the number of classes.";
+			<< "top_k must be less than or equal to the number of classes.";
 	label_axis_ =
-		bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis());
+			bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis());
 	outer_num_ = bottom[0]->count(0, label_axis_);
 	inner_num_ = bottom[0]->count(label_axis_ + 1);
 	CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
-		<< "Number of labels must match number of predictions; "
-		<< "e.g., if label axis == 1 and prediction shape is (N, C, H, W), "
-		<< "label count (number of labels) must be N*H*W, "
-		<< "with integer values in {0, 1, ..., C-1}.";
+			<< "Number of labels must match number of predictions; "
+			<< "e.g., if label axis == 1 and prediction shape is (N, C, H, W), "
+			<< "label count (number of labels) must be N*H*W, "
+			<< "with integer values in {0, 1, ..., C-1}.";
 	vector<int> top_shape(0);  // Accuracy is a scalar; 0 axes.
 	top[0]->Reshape(top_shape);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	Dtype accuracy = 0;
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	const Dtype* bottom_label = bottom[1]->cpu_data();
@@ -54,7 +54,7 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	for (int i = 0; i < outer_num_; ++i) {
 		for (int j = 0; j < inner_num_; ++j) {
 			const int label_value =
-				static_cast<int>(bottom_label[i * inner_num_ + j]);
+					static_cast<int>(bottom_label[i * inner_num_ + j]);
 			if (has_ignore_label_ && label_value == ignore_label_) {
 				continue;
 			}
@@ -64,11 +64,11 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 			std::vector < std::pair<Dtype, int> > bottom_data_vector;
 			for (int k = 0; k < num_labels; ++k) {
 				bottom_data_vector.push_back(std::make_pair(
-					bottom_data[i * dim + k * inner_num_ + j], k));
+						bottom_data[i * dim + k * inner_num_ + j], k));
 			}
 			std::partial_sort(
-				bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
-				bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
+					bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
+					bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
 			// check if true label is in top k predictions
 			for (int k = 0; k < top_k_; k++) {
 				if (bottom_data_vector[k].second == label_value) {
diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp
index 87cc706e..235e8371 100644
--- a/src/caffe/layers/argmax_layer.cpp
+++ b/src/caffe/layers/argmax_layer.cpp
@@ -8,19 +8,19 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ArgMaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	out_max_val_ = this->layer_param_.argmax_param().out_max_val();
 	top_k_ = this->layer_param_.argmax_param().top_k();
 	CHECK_GE(top_k_, 1) << " top k must not be less than 1.";
 	CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num())
-		<< "top_k must be less than or equal to the number of classes.";
+			<< "top_k must be less than or equal to the number of classes.";
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ArgMaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	if (out_max_val_) {
 		// Produces max_ind and max_val
 		top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1);
@@ -30,9 +30,9 @@ void ArgMaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ArgMaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	int num = bottom[0]->num();
@@ -41,11 +41,11 @@ void ArgMaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 		std::vector < std::pair<Dtype, int> > bottom_data_vector;
 		for (int j = 0; j < dim; ++j) {
 			bottom_data_vector.push_back(
-				std::make_pair(bottom_data[i * dim + j], j));
+					std::make_pair(bottom_data[i * dim + j], j));
 		}
 		std::partial_sort(
-			bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
-			bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
+				bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
+				bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
 		for (int j = 0; j < top_k_; ++j) {
 			top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second;
 		}
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 97c9afd3..cefa8a66 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -16,26 +16,27 @@ template <typename Dtype> cl_mem BaseConvolutionLayer<Dtype>::subTopMem = clCrea
 template <typename Dtype> cl_mem BaseConvolutionLayer<Dtype>::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer<Dtype>::trans_mem_size, NULL, NULL);
 #endif
 
-template<typename Dtype>
-void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size)
-	{
+template <typename Dtype>
+void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) {
 	if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) {
 		ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size;
 		clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem);
 		ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context,
-			CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size, NULL,
-			NULL);
+				CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size,
+				NULL,
+				NULL);
 	}
 	if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) {
 		ConvolutionLayer < Dtype > ::trans_mem_size = trans_size;
 		clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem);
 		ConvolutionLayer < Dtype > ::transMem = clCreateBuffer(amdDevice.Context,
-			CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, NULL,
-			NULL);
+				CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size,
+				NULL,
+				NULL);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::ocl_setup() {
 	M_ = num_output_ / group_;
 	K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_;
@@ -47,31 +48,31 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
 #endif
 }
 
-template<typename Dtype>
+template <typename Dtype>
 BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer() {
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-		<< "corresponding to (num, channels, height, width)";
+			<< "corresponding to (num, channels, height, width)";
 	// Configure the kernel size, padding, stride, and inputs.
 	ConvolutionParameter conv_param = this->layer_param_.convolution_param();
 	CHECK(!conv_param.has_kernel_size() !=
-		!(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-		<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+			!(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+			<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
 	CHECK(conv_param.has_kernel_size() ||
-		(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-		<< "For non-square filters both kernel_h and kernel_w are required.";
+			(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+			<< "For non-square filters both kernel_h and kernel_w are required.";
 	CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
-		&& conv_param.has_pad_w())
-		|| (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
-		<< "pad is pad OR pad_h and pad_w are required.";
+			&& conv_param.has_pad_w())
+			|| (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+			<< "pad is pad OR pad_h and pad_w are required.";
 	CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
-		&& conv_param.has_stride_w())
-		|| (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
-		<< "Stride is stride OR stride_h and stride_w are required.";
+			&& conv_param.has_stride_w())
+			|| (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+			<< "Stride is stride OR stride_h and stride_w are required.";
 	if (conv_param.has_kernel_size()) {
 		kernel_h_ = kernel_w_ = conv_param.kernel_size();
 	} else {
@@ -95,7 +96,7 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	// Special case: im2col is the identity for 1x1 convolution with stride 1
 	// and no padding, so flag for skipping the buffer and transformation.
 	is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1
-		&& stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0;
+			&& stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0;
 	// Configure output channels and groups.
 	channels_ = bottom[0]->channels();
 	num_output_ = this->layer_param_.convolution_param().num_output();
@@ -103,7 +104,7 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	group_ = this->layer_param_.convolution_param().group();
 	CHECK_EQ(channels_ % group_, 0);
 	CHECK_EQ(num_output_ % group_, 0)
-		<< "Number of output should be multiples of group.";
+			<< "Number of output should be multiples of group.";
 	if (reverse_dimensions()) {
 		conv_out_channels_ = channels_;
 		conv_in_channels_ = num_output_;
@@ -127,16 +128,16 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 		// Initialize and fill the weights:
 		// output channels x input channels per-group x kernel height x kernel width
 		this->blobs_[0].reset(new Blob<Dtype>(
-			conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_));
+				conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_));
 		shared_ptr < Filler<Dtype> > weight_filler(GetFiller < Dtype > (
-			this->layer_param_.convolution_param().weight_filler()));
+				this->layer_param_.convolution_param().weight_filler()));
 		weight_filler->Fill(this->blobs_[0].get());
 		// If necessary, initialize and fill the biases.
 		if (bias_term_) {
 			vector<int> bias_shape(1, num_output_);
 			this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
 			shared_ptr < Filler<Dtype> > bias_filler(GetFiller < Dtype > (
-				this->layer_param_.convolution_param().bias_filler()));
+					this->layer_param_.convolution_param().bias_filler()));
 			bias_filler->Fill(this->blobs_[1].get());
 		}
 	}
@@ -144,25 +145,25 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	this->param_propagate_down_.resize(this->blobs_.size(), true);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-		<< "corresponding to (num, channels, height, width)";
+			<< "corresponding to (num, channels, height, width)";
 	num_ = bottom[0]->num();
 	height_ = bottom[0]->height();
 	width_ = bottom[0]->width();
 	CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
-		" convolution kernel.";
+			" convolution kernel.";
 	// TODO: generalize to handle inputs of different shapes.
 	for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
 		CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
 		CHECK_EQ(channels_, bottom[bottom_id]->channels())
-			<< "Inputs must have same channels.";
+				<< "Inputs must have same channels.";
 		CHECK_EQ(height_, bottom[bottom_id]->height())
-			<< "Inputs must have same height.";
+				<< "Inputs must have same height.";
 		CHECK_EQ(width_, bottom[bottom_id]->width())
-			<< "Inputs must have same width.";
+				<< "Inputs must have same width.";
 	}
 	// Shape the tops.
 	compute_output_shape();
@@ -195,15 +196,15 @@ void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 		vector<int> bias_multiplier_shape(1, height_out_ * width_out_);
 		bias_multiplier_.Reshape(bias_multiplier_shape);
 		caffe_set(bias_multiplier_.count(), Dtype(1),
-			bias_multiplier_.mutable_cpu_data());
+				bias_multiplier_.mutable_cpu_data());
 	}
 	//initializa OpenCL kernels and cl_mem objects
 	ocl_setup();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
-	const Dtype* weights, Dtype* output, bool skip_im2col) {
+		const Dtype* weights, Dtype* output, bool skip_im2col) {
 	const Dtype* col_buff = input;
 	if (!is_1x1_) {
 		if (!skip_im2col) {
@@ -213,41 +214,42 @@ void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
 	}
 	for (int g = 0; g < group_; ++g) {
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, conv_out_channels_ /
-			group_, conv_out_spatial_dim_, kernel_dim_ / group_,
-			(Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
-			(Dtype) 0., output + output_offset_ * g);
+				group_, conv_out_spatial_dim_, kernel_dim_ / group_,
+				(Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
+				(Dtype) 0., output + output_offset_ * g);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
-	const Dtype* bias) {
-	caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
-		height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(),
-		(Dtype) 1., output);
+		const Dtype* bias) {
+	caffe_cpu_gemm < Dtype
+			> (CblasNoTrans, CblasNoTrans, num_output_,
+					height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(),
+					(Dtype) 1., output);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_cpu_gemm(const Dtype* output,
-	const Dtype* weights, Dtype* input) {
+		const Dtype* weights, Dtype* input) {
 	Dtype* col_buff = col_buffer_.mutable_cpu_data();
 	if (is_1x1_) {
 		col_buff = input;
 	}
 	for (int g = 0; g < group_; ++g) {
 		caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, kernel_dim_ / group_,
-			conv_out_spatial_dim_, conv_out_channels_ / group_,
-			(Dtype) 1., weights + weight_offset_ * g, output + output_offset_ * g,
-			(Dtype) 0., col_buff + col_offset_ * g);
+				conv_out_spatial_dim_, conv_out_channels_ / group_,
+				(Dtype) 1., weights + weight_offset_ * g, output + output_offset_ * g,
+				(Dtype) 0., col_buff + col_offset_ * g);
 	}
 	if (!is_1x1_) {
 		conv_col2im_cpu(col_buff, input);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
-	const Dtype* output, Dtype* weights) {
+		const Dtype* output, Dtype* weights) {
 	const Dtype* col_buff = input;
 	if (!is_1x1_) {
 		conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
@@ -255,26 +257,26 @@ void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
 	}
 	for (int g = 0; g < group_; ++g) {
 		caffe_cpu_gemm < Dtype
-			> (CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
-				kernel_dim_ / group_, conv_out_spatial_dim_,
-				(Dtype) 1., output + output_offset_ * g, col_buff + col_offset_ * g,
-				(Dtype) 1., weights + weight_offset_ * g);
+				> (CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
+						kernel_dim_ / group_, conv_out_spatial_dim_,
+						(Dtype) 1., output + output_offset_ * g, col_buff + col_offset_ * g,
+						(Dtype) 1., weights + weight_offset_ * g);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_cpu_bias(Dtype* bias,
-	const Dtype* input) {
+		const Dtype* input) {
 	caffe_cpu_gemv < Dtype
-		> (CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
-			input, bias_multiplier_.cpu_data(), 1., bias);
+			> (CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
+					input, bias_multiplier_.cpu_data(), 1., bias);
 }
 
 #ifndef CPU_ONLY
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
-	const Dtype* weights, Dtype* output, bool skip_im2col) {
+		const Dtype* weights, Dtype* output, bool skip_im2col) {
 	const Dtype* col_buff = input;
 	if (!is_1x1_) {
 		if (!skip_im2col) {
@@ -285,17 +287,17 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
 
 	for (int g = 0; g < group_; ++g) {
 		caffe_gpu_gemm < Dtype
-			> (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
-				conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_
-					/ group_,
-				(Dtype) 1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
-				(Dtype) 0., output, top_offset_ + output_offset_ * g);
+				> (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
+						conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_
+								/ group_,
+						(Dtype) 1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
+						(Dtype) 0., output, top_offset_ + output_offset_ * g);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
-	const Dtype* weight, Dtype* output, bool skip_im2col) {
+		const Dtype* weight, Dtype* output, bool skip_im2col) {
 	cl_command_queue Queue;
 	const Dtype* col_buff = input;
 	if (!is_1x1_) {
@@ -305,15 +307,15 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
 		col_buff = col_buffer_.gpu_data();
 	} else {
 		caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff,
-			(Dtype*) transMem);
+				(Dtype*) transMem);
 	}
 #ifdef multiQ
 	for (int g = 0; g < group_; ++g) {
 		if(g == 0) Queue = amdDevice.CommandQueue;
 		else Queue = amdDevice.CommandQueue_helper;
 		caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-			(Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
-			(Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
+				(Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
+				(Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
 	}
 	if(group_ == 2) {
 		clFinish(amdDevice.CommandQueue);
@@ -323,63 +325,63 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
 	Queue = amdDevice.CommandQueue;
 	for (int g = 0; g < group_; ++g) {
 		caffe_gpu_gemm < Dtype
-			> (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-				(Dtype) 1., weight, weight_offset_ * g, (Dtype*) transMem, col_offset_
-					* g,
-				(Dtype) 0., (Dtype*) subTopMem, top_offset_opt * g);
+				> (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+						(Dtype) 1., weight, weight_offset_ * g, (Dtype*) transMem, col_offset_
+								* g,
+						(Dtype) 0., (Dtype*) subTopMem, top_offset_opt * g);
 	}
 #endif
 	transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_,
-		opt_num2);
+			opt_num2);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
-	const Dtype* bias) {
+		const Dtype* bias) {
 	caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
-		height_out_ * width_out_, 1, (Dtype) 1., bias, 0,
-		reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-		(Dtype) 1., output, top_offset_);
+			height_out_ * width_out_, 1, (Dtype) 1., bias, 0,
+			reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
+			(Dtype) 1., output, top_offset_);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias_opt(Dtype* output,
-	const Dtype* bias) {
+		const Dtype* bias) {
 	for (int z = 0; z < opt_num2; z++)
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
-			N_, 1, (Dtype) 1., bias, 0,
-			reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-			(Dtype) 1., output, top_offset_ + num_output_ * N_ * z);
+				N_, 1, (Dtype) 1., bias, 0,
+				reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
+				(Dtype) 1., output, top_offset_ + num_output_ * N_ * z);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
-	const Dtype* weights, Dtype* input) {
+		const Dtype* weights, Dtype* input) {
 	Dtype* col_buff = col_buffer_.mutable_gpu_data();
 	if (is_1x1_) {
 		col_buff = input;
 	}
 	for (int g = 0; g < group_; ++g) {
 		caffe_gpu_gemm < Dtype
-			> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
-				/ group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
-				(Dtype) 1., weights, weight_offset_ * g,
-				output, top_offset_ + output_offset_ * g,
-				(Dtype) 0., col_buff, col_offset_ * g);
+				> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
+						/ group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
+						(Dtype) 1., weights, weight_offset_ * g,
+						output, top_offset_ + output_offset_ * g,
+						(Dtype) 0., col_buff, col_offset_ * g);
 	}
 	if (!is_1x1_) {
 		conv_col2im_gpu(col_buff, input);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
-	const Dtype* weights, Dtype* input) {
+		const Dtype* weights, Dtype* input) {
 	cl_command_queue Queue;
 	if (is_1x1_) {
 		caffe_gpu_memcpy(
-			height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input,
-			(Dtype*) transMem);
+				height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input,
+				(Dtype*) transMem);
 	}
 	for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
@@ -389,10 +391,10 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 		Queue = amdDevice.CommandQueue;
 #endif
 		caffe_gpu_gemm < Dtype
-			> (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_,
-				(Dtype) 1., weights, weight_offset_ * g,
-				(Dtype*) subTopMem, top_offset_opt * g,
-				(Dtype) 0., (Dtype*) transMem, col_offset_ * g);
+				> (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_,
+						(Dtype) 1., weights, weight_offset_ * g,
+						(Dtype*) subTopMem, top_offset_opt * g,
+						(Dtype) 0., (Dtype*) transMem, col_offset_ * g);
 	}
 #ifdef multiQ
 	if(group_ ==2) {
@@ -405,14 +407,14 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 		conv_col2im_gpu_opt(input);
 	} else {
 		caffe_gpu_memcpy(
-			height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype),
-			(Dtype*) transMem, input);
+				height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype),
+				(Dtype*) transMem, input);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
-	const Dtype* output, Dtype* weights) {
+		const Dtype* output, Dtype* weights) {
 	const Dtype* col_buff = input;
 	if (!is_1x1_) {
 		conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
@@ -420,25 +422,25 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
 	}
 	for (int g = 0; g < group_; ++g) {
 		caffe_gpu_gemm < Dtype
-			> (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_
-				/ group_, kernel_dim_ / group_, conv_out_spatial_dim_,
-				(Dtype) 1., output, top_offset_,
-				(Dtype*) col_buff, col_offset_ * g, (Dtype) 1.,
-				(Dtype*) weights, weight_offset_ * g);
+				> (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_
+						/ group_, kernel_dim_ / group_, conv_out_spatial_dim_,
+						(Dtype) 1., output, top_offset_,
+						(Dtype*) col_buff, col_offset_ * g, (Dtype) 1.,
+						(Dtype*) weights, weight_offset_ * g);
 	}
 }
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
-	const Dtype* output, Dtype* weights) {
+		const Dtype* output, Dtype* weights) {
 	cl_command_queue Queue;
 	if (!is_1x1_) {
 		conv_im2col_gpu_opt(input);
 	} else {
 		caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input,
-			(Dtype*) transMem);
+				(Dtype*) transMem);
 	}
 	opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
-		opt_num2);
+			opt_num2);
 
 	for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
@@ -448,10 +450,10 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
 		Queue = amdDevice.CommandQueue;
 #endif
 		caffe_gpu_gemm < Dtype
-			> (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
-				(Dtype) 1., (Dtype*) subTopMem, top_offset_opt * g,
-				(Dtype*) transMem, col_offset_ * g, (Dtype) 1.,
-				(Dtype*) weights, weight_offset_ * g);
+				> (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
+						(Dtype) 1., (Dtype*) subTopMem, top_offset_opt * g,
+						(Dtype*) transMem, col_offset_ * g, (Dtype) 1.,
+						(Dtype*) weights, weight_offset_ * g);
 #ifdef multiQ
 		if(group_ == 2) {
 			clFinish(amdDevice.CommandQueue);
@@ -461,14 +463,14 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
-	const Dtype* input) {
+		const Dtype* input) {
 	caffe_gpu_gemv < Dtype
-		> (CblasNoTrans, num_output_, N_,
-			(Dtype) 1., input, top_offset_, N_,
-			reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1,
-			bias, (size_t) 0, 1);
+			> (CblasNoTrans, num_output_, N_,
+					(Dtype) 1., input, top_offset_, N_,
+					reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1,
+					bias, (size_t) 0, 1);
 }
 
 #endif  // !CPU_ONLY
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index f9a80979..b0c0ebf2 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -7,30 +7,31 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 BaseDataLayer<Dtype>::BaseDataLayer(const LayerParameter& param)
-	: Layer<Dtype>(param),
-		transform_param_(param.transform_param()) {
+:
+		Layer<Dtype>(param),
+				transform_param_(param.transform_param()) {
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BaseDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	if (top.size() == 1) {
 		output_labels_ = false;
 	} else {
 		output_labels_ = true;
 	}
 	data_transformer_.reset(
-		new DataTransformer<Dtype>(transform_param_, this->phase_));
+			new DataTransformer<Dtype>(transform_param_, this->phase_));
 	data_transformer_->InitRand();
 	// The subclasses should setup the size of bottom and top
 	DataLayerSetUp(bottom, top);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::LayerSetUp(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	BaseDataLayer < Dtype > ::LayerSetUp(bottom, top);
 	// Now, start the prefetch thread. Before calling prefetch, we make two
 	// cpu_data calls so that the prefetch thread does not accidentally make
@@ -45,20 +46,20 @@ void BasePrefetchingDataLayer<Dtype>::LayerSetUp(
 	DLOG(INFO) << "Prefetch initialized.";
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::CreatePrefetchThread() {
 	this->data_transformer_->InitRand();
 	CHECK(StartInternalThread()) << "Thread execution failed";
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::JoinPrefetchThread() {
 	CHECK(WaitForInternalThreadToExit()) << "Thread joining failed";
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	// First, join the thread
 	JoinPrefetchThread();
 
@@ -67,43 +68,44 @@ void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
 	top[0]->ReshapeLike(prefetch_data_);
 	// Copy the data
 	caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
-		top[0]->mutable_cpu_data());
+			top[0]->mutable_cpu_data());
 	DLOG(INFO) << "Prefetch copied";
 	if (this->output_labels_) {
 		// Reshape to loaded labels.
 		top[1]->ReshapeLike(prefetch_label_);
 		// Copy the labels.
 		caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
-			top[1]->mutable_cpu_data());
+				top[1]->mutable_cpu_data());
 	}
 	// Start a new prefetch thread
 	DLOG(INFO) << "CreatePrefetchThread";
 	CreatePrefetchThread();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
-	const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top) {
 
 	JoinPrefetchThread();
 	DLOG(INFO) << "Thread joined";
 
 	top[0]->ReshapeLike(this->prefetch_data_);
 	OCL_CHECK(
-		clEnqueueWriteBuffer(amdDevice.CommandQueue,
-			(cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0,
-			sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0,
-			NULL, NULL));
+			clEnqueueWriteBuffer(amdDevice.CommandQueue,
+					(cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0,
+					sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0,
+					NULL, NULL));
 	DLOG(INFO) << "Prefetch copied";
 	if (this->output_labels_) {
 		// Reshape to loaded labels.
 		top[1]->ReshapeLike(prefetch_label_);
 		OCL_CHECK(
-			clEnqueueWriteBuffer(amdDevice.CommandQueue,
-				(cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0,
-				sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), 0,
-				NULL, NULL));
+				clEnqueueWriteBuffer(amdDevice.CommandQueue,
+						(cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0,
+						sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(),
+						0,
+						NULL, NULL));
 	}
 
 #ifdef Track_data_transfer
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index 8f72f41b..11b78a15 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -8,24 +8,24 @@ namespace caffe {
 
 const float kBNLL_THRESHOLD = 50.;
 
-template<typename Dtype>
+template <typename Dtype>
 void BNLLLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const int count = bottom[0]->count();
 	for (int i = 0; i < count; ++i) {
 		top_data[i] =
-			bottom_data[i] > 0 ?
-														bottom_data[i] + log(1. + exp(-bottom_data[i])) :
-														log(1. + exp(bottom_data[i]));
+				bottom_data[i] > 0 ?
+															bottom_data[i] + log(1. + exp(-bottom_data[i])) :
+															log(1. + exp(bottom_data[i]));
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* bottom_data = bottom[0]->cpu_data();
 		const Dtype* top_diff = top[0]->cpu_diff();
@@ -39,9 +39,9 @@ void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const int count = bottom[0]->count();
@@ -49,10 +49,10 @@ void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	BNLLForward(count, bottom_data, top_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* bottom_data = bottom[0]->gpu_data();
 		const Dtype* top_diff = top[0]->gpu_diff();
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index b885d9e6..7d55ef40 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -6,17 +6,17 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ConcatLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const ConcatParameter& concat_param = this->layer_param_.concat_param();
 	CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim()))
-		<< "Either axis or concat_dim should be specified; not both.";
+			<< "Either axis or concat_dim should be specified; not both.";
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int num_axes = bottom[0]->num_axes();
 	const ConcatParameter& concat_param = this->layer_param_.concat_param();
 	if (concat_param.has_concat_dim()) {
@@ -24,8 +24,8 @@ void ConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 		// Don't allow negative indexing for concat_dim, a uint32 -- almost
 		// certainly unintended.
 		CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 "
-			<< "produced negative result; concat_dim must satisfy "
-			<< "0 <= concat_dim < " << kMaxBlobAxes;
+				<< "produced negative result; concat_dim must satisfy "
+				<< "0 <= concat_dim < " << kMaxBlobAxes;
 		CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range.";
 	} else {
 		concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis());
@@ -37,13 +37,13 @@ void ConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	int bottom_count_sum = bottom[0]->count();
 	for (int i = 1; i < bottom.size(); ++i) {
 		CHECK_EQ(num_axes, bottom[i]->num_axes())
-			<< "All inputs must have the same #axes.";
+				<< "All inputs must have the same #axes.";
 		for (int j = 0; j < num_axes; ++j) {
 			if (j == concat_axis_) {
 				continue;
 			}
 			CHECK_EQ(top_shape[j], bottom[i]->shape(j))
-				<< "All inputs must have the same shape, except at concat_axis.";
+					<< "All inputs must have the same shape, except at concat_axis.";
 		}
 		bottom_count_sum += bottom[i]->count();
 		top_shape[concat_axis_] += bottom[i]->shape(concat_axis_);
@@ -52,9 +52,9 @@ void ConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	CHECK_EQ(bottom_count_sum, top[0]->count());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	int offset_concat_axis = 0;
 	const int top_concat_axis = top[0]->shape(concat_axis_);
@@ -63,17 +63,17 @@ void ConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
 		for (int n = 0; n < num_concats_; ++n) {
 			caffe_copy(bottom_concat_axis * concat_input_size_,
-				bottom_data + n * bottom_concat_axis * concat_input_size_,
-				top_data + (n * top_concat_axis + offset_concat_axis)
-					* concat_input_size_);
+					bottom_data + n * bottom_concat_axis * concat_input_size_,
+					top_data + (n * top_concat_axis + offset_concat_axis)
+							* concat_input_size_);
 		}
 		offset_concat_axis += bottom_concat_axis;
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* top_diff = top[0]->cpu_diff();
 	int offset_concat_axis = 0;
 	const int top_concat_axis = top[0]->shape(concat_axis_);
@@ -85,16 +85,16 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
 		for (int n = 0; n < num_concats_; ++n) {
 			caffe_copy(bottom_concat_axis * concat_input_size_, top_diff +
-				(n * top_concat_axis + offset_concat_axis) * concat_input_size_,
-				bottom_diff + n * bottom_concat_axis * concat_input_size_);
+					(n * top_concat_axis + offset_concat_axis) * concat_input_size_,
+					bottom_diff + n * bottom_concat_axis * concat_input_size_);
 		}
 		offset_concat_axis += bottom_concat_axis;
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	if (bottom.size() == 1) {
 		return;
 	}
@@ -108,14 +108,14 @@ void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
 		const int nthreads = bottom_concat_size * num_concats_;
 		Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_,
-			top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
+				top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
 		offset_concat_axis += bottom_concat_axis;
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (bottom.size() == 1) {
 		return;
 	}
@@ -130,7 +130,7 @@ void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 			const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
 			const int nthreads = bottom_concat_size * num_concats_;
 			Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_,
-				top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
+					top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
 		}
 		offset_concat_axis += bottom_concat_axis;
 	}
diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index 9c3f38d5..6a91fdfd 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -8,9 +8,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::LayerSetUp(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::LayerSetUp(bottom, top);
 	CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
 	CHECK_EQ(bottom[0]->height(), 1);
@@ -29,24 +29,24 @@ void ContrastiveLossLayer<Dtype>::LayerSetUp(
 		summer_vec_.mutable_cpu_data()[i] = Dtype(1);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_cpu(
-	const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top) {
 	int count = bottom[0]->count();
 	caffe_sub(
-		count,
-		bottom[0]->cpu_data(),  // a
-		bottom[1]->cpu_data(),  // b
-		diff_.mutable_cpu_data());  // a_i-b_i
+			count,
+			bottom[0]->cpu_data(),  // a
+			bottom[1]->cpu_data(),  // b
+			diff_.mutable_cpu_data());  // a_i-b_i
 	const int channels = bottom[0]->channels();
 	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
 	bool legacy_version =
-		this->layer_param_.contrastive_loss_param().legacy_version();
+			this->layer_param_.contrastive_loss_param().legacy_version();
 	Dtype loss(0.0);
 	for (int i = 0; i < bottom[0]->num(); ++i) {
 		dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
-			diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels));
+				diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels));
 		if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
 			loss += dist_sq_.cpu_data()[i];
 		} else {  // dissimilar pairs
@@ -62,28 +62,28 @@ void ContrastiveLossLayer<Dtype>::Forward_cpu(
 	top[0]->mutable_cpu_data()[0] = loss;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
 	bool legacy_version =
-		this->layer_param_.contrastive_loss_param().legacy_version();
+			this->layer_param_.contrastive_loss_param().legacy_version();
 	for (int i = 0; i < 2; ++i) {
 		if (propagate_down[i]) {
 			const Dtype sign = (i == 0) ? 1 : -1;
 			const Dtype alpha = sign * top[0]->cpu_diff()[0] /
-				static_cast<Dtype>(bottom[i]->num());
+					static_cast<Dtype>(bottom[i]->num());
 			int num = bottom[i]->num();
 			int channels = bottom[i]->channels();
 			for (int j = 0; j < num; ++j) {
 				Dtype* bout = bottom[i]->mutable_cpu_diff();
 				if (static_cast<int>(bottom[2]->cpu_data()[j])) {  // similar pairs
 					caffe_cpu_axpby(
-						channels,
-						alpha,
-						diff_.cpu_data() + (j * channels),
-						Dtype(0.0),
-						bout + (j * channels));
+							channels,
+							alpha,
+							diff_.cpu_data() + (j * channels),
+							Dtype(0.0),
+							bout + (j * channels));
 				} else {  // dissimilar pairs
 					Dtype mdist(0.0);
 					Dtype beta(0.0);
@@ -97,11 +97,11 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 					}
 					if (mdist > Dtype(0.0)) {
 						caffe_cpu_axpby(
-							channels,
-							beta,
-							diff_.cpu_data() + (j * channels),
-							Dtype(0.0),
-							bout + (j * channels));
+								channels,
+								beta,
+								diff_.cpu_data() + (j * channels),
+								Dtype(0.0),
+								bout + (j * channels));
 					} else {
 						caffe_set(channels, Dtype(0), bout + (j * channels));
 					}
@@ -111,32 +111,32 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_gpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	const int count = bottom[0]->count();
 	caffe_gpu_sub(
-		count,
-		bottom[0]->gpu_data(),  // a
-		bottom[1]->gpu_data(),  // b
-		diff_.mutable_gpu_data());  // a_i-b_i
+			count,
+			bottom[0]->gpu_data(),  // a
+			bottom[1]->gpu_data(),  // b
+			diff_.mutable_gpu_data());  // a_i-b_i
 	caffe_gpu_powx(
-		count,
-		diff_.mutable_gpu_data(),  // a_i-b_i
-		Dtype(2),
-		diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
+			count,
+			diff_.mutable_gpu_data(),  // a_i-b_i
+			Dtype(2),
+			diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
 	caffe_gpu_gemv(
-		CblasNoTrans,
-		bottom[0]->num(),
-		bottom[0]->channels(),
-		Dtype(1.0),
-		diff_sq_.gpu_data(),  // (a_i-b_i)^2
-		summer_vec_.gpu_data(),
-		Dtype(0.0),
-		dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
+			CblasNoTrans,
+			bottom[0]->num(),
+			bottom[0]->channels(),
+			Dtype(1.0),
+			diff_sq_.gpu_data(),  // (a_i-b_i)^2
+			summer_vec_.gpu_data(),
+			Dtype(0.0),
+			dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
 	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
 	bool legacy_version =
-		this->layer_param_.contrastive_loss_param().legacy_version();
+			this->layer_param_.contrastive_loss_param().legacy_version();
 	Dtype loss(0.0);
 	for (int i = 0; i < bottom[0]->num(); ++i) {
 		if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
@@ -154,25 +154,25 @@ void ContrastiveLossLayer<Dtype>::Forward_gpu(
 	top[0]->mutable_cpu_data()[0] = loss;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	for (int i = 0; i < 2; ++i) {
 		if (propagate_down[i]) {
 			const int count = bottom[0]->count();
 			const int channels = bottom[0]->channels();
 			Dtype margin = this->layer_param_.contrastive_loss_param().margin();
 			const bool legacy_version =
-				this->layer_param_.contrastive_loss_param().legacy_version();
+					this->layer_param_.contrastive_loss_param().legacy_version();
 			const Dtype sign = (i == 0) ? 1 : -1;
 			const Dtype alpha = sign * top[0]->cpu_diff()[0] /
-				static_cast<Dtype>(bottom[0]->num());
+					static_cast<Dtype>(bottom[0]->num());
 			// NOLINT_NEXT_LINE(whitespace/operators)
 			CLLBackward(count, channels, margin, legacy_version, alpha,
-				bottom[2]->gpu_data(),  // pair similarity 0 or 1
-				diff_.gpu_data(),  // the cached eltwise difference between a and b
-				dist_sq_.gpu_data(),  // the cached square distance between a and b
-				bottom[i]->mutable_gpu_diff());
+					bottom[2]->gpu_data(),  // pair similarity 0 or 1
+					diff_.gpu_data(),  // the cached eltwise difference between a and b
+					dist_sq_.gpu_data(),  // the cached square distance between a and b
+					bottom[i]->mutable_gpu_diff());
 		}
 	}
 }
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index d5ffdb9f..bbe07f37 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -7,24 +7,24 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::compute_output_shape() {
 	this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
-		/ this->stride_h_ + 1;
+			/ this->stride_h_ + 1;
 	this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
-		/ this->stride_w_ + 1;
+			/ this->stride_w_ + 1;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* weight = this->blobs_[0]->cpu_data();
 	for (int i = 0; i < bottom.size(); ++i) {
 		const Dtype* bottom_data = bottom[i]->cpu_data();
 		Dtype* top_data = top[i]->mutable_cpu_data();
 		for (int n = 0; n < this->num_; ++n) {
 			this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-				top_data + top[i]->offset(n));
+					top_data + top[i]->offset(n));
 			if (this->bias_term_) {
 				const Dtype* bias = this->blobs_[1]->cpu_data();
 				this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
@@ -35,9 +35,9 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	// CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* weight = this->blobs_[0]->cpu_data();
 	Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
 	for (int i = 0; i < top.size(); ++i) {
@@ -56,12 +56,12 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 				// gradient w.r.t. weight. Note that we will accumulate diffs.
 				if (this->param_propagate_down_[0]) {
 					this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n),
-						top_diff + top[i]->offset(n), weight_diff);
+							top_diff + top[i]->offset(n), weight_diff);
 				}
 				// gradient w.r.t. bottom data, if necessary.
 				if (propagate_down[i]) {
 					this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight,
-						bottom_diff + bottom[i]->offset(n));
+							bottom_diff + bottom[i]->offset(n));
 				}
 			}
 		}
@@ -69,28 +69,28 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	if (use_packing_scheme && global_packing_N > 1)
 		Forward_gpu_opt2(bottom, top);
 	else
 		Forward_gpu_org(bottom, top);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (use_packing_scheme && global_packing_N > 1)
 		Backward_gpu_opt2(top, propagate_down, bottom);
 	else
 		Backward_gpu_org(top, propagate_down, bottom);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_opt2(
-	const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* weight = this->blobs_[0]->gpu_data();
 	for (int i = 0; i < bottom.size(); ++i) {
 		const Dtype* bottom_data = bottom[i]->gpu_data();
@@ -101,14 +101,14 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(
 		this->weight_offset_ = this->M_ * this->K_;
 		for (int n = 0; n < this->num_; n += this->opt_num2) {
 			this->opt_num2 =
-				this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2;
+					this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2;
 			//intermediate variables to pass offset
 			this->top_offset_opt = this->M_ * this->N_ * this->opt_num2;
 			this->top_offset_ = top[i]->offset(n);
 			this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
 			this->bottom_offset_ = bottom[i]->offset(n);
 			this->forward_gpu_gemm_opt(bottom_data, weight,
-				top_data);
+					top_data);
 			if (this->bias_term_) {
 				const Dtype* bias = this->blobs_[1]->gpu_data();
 				this->forward_gpu_bias_opt(top_data, bias);
@@ -121,10 +121,10 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(
 
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_org(
-	const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom,
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* weight = this->blobs_[0]->gpu_data();
 	for (int i = 0; i < bottom.size(); ++i) {
 		const Dtype* bottom_data = bottom[i]->gpu_data();
@@ -136,7 +136,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(
 			this->bottom_offset_ = bottom[i]->offset(n);
 			this->top_offset_ = top[i]->offset(n);
 			this->forward_gpu_gemm(bottom_data, weight,
-				top_data);
+					top_data);
 
 			if (this->bias_term_) {
 				const Dtype* bias = this->blobs_[1]->gpu_data();
@@ -149,9 +149,9 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(
 	//CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* weight = this->blobs_[0]->gpu_data();
 	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
 	for (int i = 0; i < top.size(); ++i) {
@@ -173,7 +173,9 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
 			this->opt_num2 = global_packing_N;
 			for (int n = 0; n < this->num_; n += this->opt_num2) {
 				this->opt_num2 =
-					this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2;
+						this->opt_num2 > (this->num_ - n) ?
+																								(this->num_ - n) :
+																								this->opt_num2;
 				this->top_offset_ = top[i]->offset(n);
 				this->bottom_offset_ = bottom[i]->offset(n);
 				this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
@@ -181,21 +183,21 @@ void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
 				// gradient w.r.t. weight. Note that we will accumulate diffs.
 				if (this->param_propagate_down_[0]) {
 					this->weight_gpu_gemm_opt(bottom_data,
-						top_diff, weight_diff);
+							top_diff, weight_diff);
 				}
 				// gradient w.r.t. bottom data, if necessary.
 				if (propagate_down[i]) {
 					this->backward_gpu_gemm_opt(top_diff, weight,
-						bottom_diff);
+							bottom_diff);
 				}
 			}
 		}
 	}
 
 }
-template<typename Dtype>
+template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* weight = this->blobs_[0]->gpu_data();
 	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
 	for (int i = 0; i < top.size(); ++i) {
@@ -220,12 +222,12 @@ void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
 				// gradient w.r.t. weight. Note that we will accumulate diffs.
 				if (this->param_propagate_down_[0]) {
 					this->weight_gpu_gemm(bottom_data,
-						top_diff, weight_diff);
+							top_diff, weight_diff);
 				}
 				// gradient w.r.t. bottom data, if necessary.
 				if (propagate_down[i]) {
 					this->backward_gpu_gemm(top_diff, weight,
-						bottom_diff);
+							bottom_diff);
 				}
 			}
 		}
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index bff8b10c..e9ee5221 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -16,14 +16,14 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 DataLayer<Dtype>::~DataLayer<Dtype>() {
 	this->JoinPrefetchThread();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	// Initialize DB
 	db_.reset(db::GetDB(this->layer_param_.data_param().backend()));
 	db_->Open(this->layer_param_.data_param().source(), db::READ);
@@ -32,7 +32,7 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	// Check if we should randomly skip a few data points
 	if (this->layer_param_.data_param().rand_skip()) {
 		unsigned int skip = caffe_rng_rand() %
-			this->layer_param_.data_param().rand_skip();
+				this->layer_param_.data_param().rand_skip();
 		LOG(INFO) << "Skipping first " << skip << " data points.";
 		while (skip-- > 0) {
 			cursor_->Next();
@@ -51,8 +51,8 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	this->prefetch_data_.set_data_layer();
 
 	LOG(INFO) << "output data size: " << top[0]->num() << ","
-		<< top[0]->channels() << "," << top[0]->height() << ","
-		<< top[0]->width();
+			<< top[0]->channels() << "," << top[0]->height() << ","
+			<< top[0]->width();
 	// label
 	if (this->output_labels_) {
 		vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
@@ -63,7 +63,7 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 }
 
 // This function is used to create a thread that prefetches the data.
-template<typename Dtype>
+template <typename Dtype>
 void DataLayer<Dtype>::InternalThreadEntry() {
 	CPUTimer batch_timer;
 	batch_timer.Start();
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index aa61a755..402a787e 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -8,24 +8,24 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void DeconvolutionLayer<Dtype>::compute_output_shape() {
 	this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_
-		- 2 * this->pad_h_;
+			- 2 * this->pad_h_;
 	this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_
-		- 2 * this->pad_w_;
+			- 2 * this->pad_w_;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* weight = this->blobs_[0]->cpu_data();
 	for (int i = 0; i < bottom.size(); ++i) {
 		const Dtype* bottom_data = bottom[i]->cpu_data();
 		Dtype* top_data = top[i]->mutable_cpu_data();
 		for (int n = 0; n < this->num_; ++n) {
 			this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-				top_data + top[i]->offset(n));
+					top_data + top[i]->offset(n));
 			if (this->bias_term_) {
 				const Dtype* bias = this->blobs_[1]->cpu_data();
 				this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
@@ -34,9 +34,9 @@ void DeconvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* weight = this->blobs_[0]->cpu_data();
 	Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
 	for (int i = 0; i < top.size(); ++i) {
@@ -55,23 +55,23 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 				// Gradient w.r.t. weight. Note that we will accumulate diffs.
 				if (this->param_propagate_down_[0]) {
 					this->weight_cpu_gemm(top_diff + top[i]->offset(n),
-						bottom_data + bottom[i]->offset(n), weight_diff);
+							bottom_data + bottom[i]->offset(n), weight_diff);
 				}
 				// Gradient w.r.t. bottom data, if necessary, reusing the column buffer
 				// we might have just computed above.
 				if (propagate_down[i]) {
 					this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight,
-						bottom_diff + bottom[i]->offset(n),
-						this->param_propagate_down_[0]);
+							bottom_diff + bottom[i]->offset(n),
+							this->param_propagate_down_[0]);
 				}
 			}
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* weight = this->blobs_[0]->gpu_data();
 	for (int i = 0; i < bottom.size(); ++i) {
 		const Dtype* bottom_data = bottom[i]->gpu_data();
@@ -88,9 +88,9 @@ void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* weight = this->blobs_[0]->gpu_data();
 	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
 	for (int i = 0; i < top.size(); ++i) {
@@ -113,12 +113,12 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 				// gradient w.r.t. weight. Note that we will accumulate diffs.
 				if (this->param_propagate_down_[0]) {
 					this->weight_gpu_gemm(top_diff + top[i]->offset(n),
-						bottom_data + bottom[i]->offset(n), weight_diff);
+							bottom_data + bottom[i]->offset(n), weight_diff);
 				}
 				// gradient w.r.t. bottom data, if necessary.
 				if (propagate_down[i]) {
 					this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-						bottom_diff + bottom[i]->offset(n));
+							bottom_diff + bottom[i]->offset(n));
 				}
 			}
 		}
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index ae045c5c..c84c8622 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -10,19 +10,18 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void DropoutLayer<Dtype>::ocl_setup(int bottom_count) {
 	MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		bottom_count * sizeof(int), NULL, NULL);
+			bottom_count * sizeof(int), NULL, NULL);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 DropoutLayer<Dtype>::~DropoutLayer() {
 	OCL_CHECK (clReleaseMemObject(MaskMem) );
-	}
-template<typename Dtype>
+	}template <typename Dtype>
 void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
 	threshold_ = this->layer_param_.dropout_param().dropout_ratio();
 	DCHECK(threshold_ > 0.);
@@ -32,18 +31,18 @@ void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	ocl_setup(bottom[0]->count());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DropoutLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	NeuronLayer < Dtype > ::Reshape(bottom, top);
 	// Set up the cache for random number generation
 	rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-		bottom[0]->height(), bottom[0]->width());
+			bottom[0]->height(), bottom[0]->width());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	unsigned int* mask = rand_vec_.mutable_cpu_data();
@@ -59,10 +58,10 @@ void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* top_diff = top[0]->cpu_diff();
 		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
@@ -78,9 +77,9 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const int count = bottom[0]->count();
@@ -95,26 +94,26 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #else
 		caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1.,
-			threshold_);
+				threshold_);
 		DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_,
-			top_data);
+				top_data);
 #endif
 	} else {
 		caffe_gpu_copy(count, bottom_data, top_data);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* top_diff = top[0]->gpu_diff();
 		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
 		if (this->phase_ == TRAIN) {
 			const int count = bottom[0]->count();
 			DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_,
-				(Dtype) scale_, bottom_diff);
+					(Dtype) scale_, bottom_diff);
 		} else {
 			caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
 		}
diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp
index 8a3fe17e..a5225ea6 100644
--- a/src/caffe/layers/dummy_data_layer.cpp
+++ b/src/caffe/layers/dummy_data_layer.cpp
@@ -6,39 +6,39 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void DummyDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int num_top = top.size();
 	const DummyDataParameter& param = this->layer_param_.dummy_data_param();
 	const int num_data_filler = param.data_filler_size();
 	CHECK(num_data_filler == 0 || num_data_filler == 1 ||
-		num_data_filler == num_top)
-		<< "Number of data fillers must be 0, 1 or equal to the number of tops: "
-		<< num_top << "; you specified " << num_data_filler << " data fillers.";
+			num_data_filler == num_top)
+			<< "Number of data fillers must be 0, 1 or equal to the number of tops: "
+			<< num_top << "; you specified " << num_data_filler << " data fillers.";
 
 	const bool legacy_dims = param.num_size() || param.channels_size() ||
-		param.height_size() || param.width_size();
+			param.height_size() || param.width_size();
 	if (legacy_dims) {
 		CHECK_EQ(0, param.shape_size())
-			<< "Both shape and legacy fields were specified";
+				<< "Both shape and legacy fields were specified";
 		// Using deprecated 4D output dim specifiers.
 		CHECK(param.num_size() == 1 || param.num_size() == num_top)
-			<< "Must specify 'num' once, or once per top blob "
-			<< "(" << num_top << "); specified " << param.num_size() << ".";
+				<< "Must specify 'num' once, or once per top blob "
+				<< "(" << num_top << "); specified " << param.num_size() << ".";
 		CHECK(param.channels_size() == 1 || param.channels_size() == num_top)
-			<< "Must specify 'channels' once, or once per top blob "
-			<< "(" << num_top << "); specified " << param.channels_size() << ".";
+				<< "Must specify 'channels' once, or once per top blob "
+				<< "(" << num_top << "); specified " << param.channels_size() << ".";
 		CHECK(param.height_size() == 1 || param.height_size() == num_top)
-			<< "Must specify 'height' once, or once per top blob "
-			<< "(" << num_top << "); specified " << param.height_size() << ".";
+				<< "Must specify 'height' once, or once per top blob "
+				<< "(" << num_top << "); specified " << param.height_size() << ".";
 		CHECK(param.width_size() == 1 || param.width_size() == num_top)
-			<< "Must specify 'width' once, or once per top blob "
-			<< "(" << num_top << "); specified " << param.width_size() << ".";
+				<< "Must specify 'width' once, or once per top blob "
+				<< "(" << num_top << "); specified " << param.width_size() << ".";
 	} else {
 		CHECK(param.shape_size() == 1 || param.shape_size() == num_top)
-			<< "Must specify 'shape' once, or once per top blob "
-			<< "(" << num_top << "); specified " << param.shape_size() << ".";
+				<< "Must specify 'shape' once, or once per top blob "
+				<< "(" << num_top << "); specified " << param.shape_size() << ".";
 	}
 	// refill_[i] tells Forward i whether or not to actually refill top Blob i.
 	// If refill_[i] is false, Forward does nothing for Blob i. We use this to
@@ -71,18 +71,18 @@ void DummyDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 			// Refill on each iteration iff not using a constant filler,
 			// but use the inverse of this rule for the first run.
 			refill_[i] =
-				(strcmp(param.data_filler(i).type().c_str(), "constant") == 0);
+					(strcmp(param.data_filler(i).type().c_str(), "constant") == 0);
 		}
 	}
 	for (int i = 0; i < num_top; ++i) {
 		if (legacy_dims) {
 			const int num = (param.num_size() == 1) ? param.num(0) : param.num(i);
 			const int channels =
-				(param.channels_size() == 1) ? param.channels(0) : param.channels(i);
+					(param.channels_size() == 1) ? param.channels(0) : param.channels(i);
 			const int height =
-				(param.height_size() == 1) ? param.height(0) : param.height(i);
+					(param.height_size() == 1) ? param.height(0) : param.height(i);
 			const int width =
-				(param.width_size() == 1) ? param.width(0) : param.width(i);
+					(param.width_size() == 1) ? param.width(0) : param.width(i);
 			top[i]->Reshape(num, channels, height, width);
 		} else {
 			const int shape_index = (param.shape_size() == 1) ? 0 : i;
@@ -98,9 +98,9 @@ void DummyDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void DummyDataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	for (int i = 0; i < top.size(); ++i) {
 		const int filler_id = (fillers_.size() > 1) ? i : 0;
 		if (refill_[filler_id]) {
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index 45126d44..e7b97b0d 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -7,16 +7,16 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void EltwiseLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK(this->layer_param().eltwise_param().coeff_size() == 0
-		|| this->layer_param().eltwise_param().coeff_size() == bottom.size()) <<
-		"Eltwise Layer takes one coefficient per bottom blob.";
+			|| this->layer_param().eltwise_param().coeff_size() == bottom.size()) <<
+			"Eltwise Layer takes one coefficient per bottom blob.";
 	CHECK(!(this->layer_param().eltwise_param().operation()
-		== EltwiseParameter_EltwiseOp_PROD
-		&& this->layer_param().eltwise_param().coeff_size())) <<
-		"Eltwise layer only takes coefficients for summation.";
+			== EltwiseParameter_EltwiseOp_PROD
+			&& this->layer_param().eltwise_param().coeff_size())) <<
+			"Eltwise layer only takes coefficients for summation.";
 	op_ = this->layer_param_.eltwise_param().operation();
 	// Blob-wise coefficients for the elementwise operation.
 	coeffs_ = vector < Dtype > (bottom.size(), 1);
@@ -28,23 +28,23 @@ void EltwiseLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EltwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	for (int i = 1; i < bottom.size(); ++i) {
 		CHECK(bottom[i]->shape() == bottom[0]->shape());
 	}
 	top[0]->ReshapeLike(*bottom[0]);
 	// If max operation, we will initialize the vector index part.
 	if (this->layer_param_.eltwise_param().operation() ==
-		EltwiseParameter_EltwiseOp_MAX && top.size() == 1) {
+			EltwiseParameter_EltwiseOp_MAX && top.size() == 1) {
 		max_idx_.Reshape(bottom[0]->shape());
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EltwiseLayer<Dtype>::Forward_cpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	int* mask = NULL;
 	const Dtype* bottom_data_a = NULL;
 	const Dtype* bottom_data_b = NULL;
@@ -97,9 +97,9 @@ void EltwiseLayer<Dtype>::Forward_cpu(
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const int* mask = NULL;
 	const int count = top[0]->count();
 	const Dtype* top_data = top[0]->cpu_data();
@@ -121,7 +121,7 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 								initialized = true;
 							} else {
 								caffe_mul(count, bottom[j]->cpu_data(), bottom_diff,
-									bottom_diff);
+										bottom_diff);
 							}
 						}
 					} else {
@@ -153,16 +153,16 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	int* mask = NULL;
 	const int count = top[0]->count();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	switch (op_) {
 		case EltwiseParameter_EltwiseOp_PROD:
 			caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
-				top_data);
+					top_data);
 			for (int i = 2; i < bottom.size(); ++i) {
 				caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data);
 			}
@@ -178,11 +178,11 @@ void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 			mask = max_idx_.mutable_gpu_data();
 			// NOLINT_NEXT_LINE(whitespace/operators)
 			MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0,
-				top_data, mask);
+					top_data, mask);
 			for (int i = 2; i < bottom.size(); ++i) {
 				// NOLINT_NEXT_LINE(whitespace/operators)
 				MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data,
-					mask);
+						mask);
 			}
 			break;
 		default:
@@ -190,9 +190,9 @@ void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const int* mask = NULL;
 	const int count = top[0]->count();
 	const Dtype* top_data = top[0]->gpu_data();
@@ -214,7 +214,7 @@ void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 								initialized = true;
 							} else {
 								caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
-									bottom_diff);
+										bottom_diff);
 							}
 						}
 					} else {
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index d5abc23f..56dc48ec 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -7,74 +7,74 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Reshape(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::Reshape(bottom, top);
 	CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
-		<< "Inputs must have the same dimension.";
+			<< "Inputs must have the same dimension.";
 	diff_.ReshapeLike(*bottom[0]);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	int count = bottom[0]->count();
 	caffe_sub(
-		count,
-		bottom[0]->cpu_data(),
-		bottom[1]->cpu_data(),
-		diff_.mutable_cpu_data());
+			count,
+			bottom[0]->cpu_data(),
+			bottom[1]->cpu_data(),
+			diff_.mutable_cpu_data());
 	Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data());
 	Dtype loss = dot / bottom[0]->num() / Dtype(2);
 	top[0]->mutable_cpu_data()[0] = loss;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	for (int i = 0; i < 2; ++i) {
 		if (propagate_down[i]) {
 			const Dtype sign = (i == 0) ? 1 : -1;
 			const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
 			caffe_cpu_axpby(
-				bottom[i]->count(),              // count
-				alpha,                              // alpha
-				diff_.cpu_data(),                   // a
-				Dtype(0),                           // beta
-				bottom[i]->mutable_cpu_diff());  // b
+					bottom[i]->count(),              // count
+					alpha,                              // alpha
+					diff_.cpu_data(),                   // a
+					Dtype(0),                           // beta
+					bottom[i]->mutable_cpu_diff());  // b
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	int count = bottom[0]->count();
 	caffe_gpu_sub(
-		count,
-		bottom[0]->gpu_data(),
-		bottom[1]->gpu_data(),
-		diff_.mutable_gpu_data());
+			count,
+			bottom[0]->gpu_data(),
+			bottom[1]->gpu_data(),
+			diff_.mutable_gpu_data());
 	Dtype dot;
 	caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
 	Dtype loss = dot / bottom[0]->num() / Dtype(2);
 	top[0]->mutable_cpu_data()[0] = loss;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	for (int i = 0; i < 2; ++i) {
 		if (propagate_down[i]) {
 			const Dtype sign = (i == 0) ? 1 : -1;
 			const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
 			caffe_gpu_axpby(
-				bottom[i]->count(),              // count
-				alpha,                              // alpha
-				diff_.gpu_data(),                   // a
-				Dtype(0),                           // beta
-				bottom[i]->mutable_gpu_diff());  // b
+					bottom[i]->count(),              // count
+					alpha,                              // alpha
+					diff_.gpu_data(),                   // a
+					Dtype(0),                           // beta
+					bottom[i]->mutable_gpu_diff());  // b
 		}
 	}
 }
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index 8451b133..bf783786 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -7,9 +7,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ExpLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
 	const Dtype base = this->layer_param_.exp_param().base();
 	if (base != Dtype(-1)) {
@@ -19,18 +19,18 @@ void ExpLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	// Otherwise, calculate its log explicitly.
 	const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
 	CHECK(!isnan(log_base))
-		<< "NaN result: log(base) = log(" << base << ") = " << log_base;
+			<< "NaN result: log(base) = log(" << base << ") = " << log_base;
 	CHECK(!isinf(log_base))
-		<< "Inf result: log(base) = log(" << base << ") = " << log_base;
+			<< "Inf result: log(base) = log(" << base << ") = " << log_base;
 	const Dtype input_scale = this->layer_param_.exp_param().scale();
 	const Dtype input_shift = this->layer_param_.exp_param().shift();
 	inner_scale_ = log_base * input_scale;
 	outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ExpLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int count = bottom[0]->count();
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
@@ -45,9 +45,9 @@ void ExpLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -61,9 +61,9 @@ void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int count = bottom[0]->count();
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
@@ -78,9 +78,9 @@ void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index 9fa26c80..f7096a09 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -7,26 +7,26 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void FilterLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_EQ(top.size(), bottom.size() - 1);
 	first_reshape_ = true;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void FilterLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	// bottom[0...k-1] are the blobs to filter
 	// bottom[last] is the "selector_blob"
 	int selector_index = bottom.size() - 1;
 	for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) {
 		CHECK_EQ(bottom[selector_index]->shape(i), 1)
-			<< "Selector blob dimensions must be singletons (1), except the first";
+				<< "Selector blob dimensions must be singletons (1), except the first";
 	}
 	for (int i = 0; i < bottom.size() - 1; ++i) {
 		CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) <<
-			"Each bottom should have the same 0th dimension as the selector blob";
+				"Each bottom should have the same 0th dimension as the selector blob";
 	}
 
 	const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data();
@@ -59,9 +59,9 @@ void FilterLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void FilterLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	int new_tops_num = indices_to_forward_.size();
 	// forward all filtered items for all bottoms but the Selector (bottom[last])
 	for (int t = 0; t < top.size(); ++t) {
@@ -72,17 +72,17 @@ void FilterLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 			int data_offset_top = n * dim;
 			int data_offset_bottom = indices_to_forward_[n] * bottom[t]->count(1);
 			caffe_copy(dim, bottom_data + data_offset_bottom,
-				top_data + data_offset_top);
+					top_data + data_offset_top);
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[bottom.size() - 1]) {
 		LOG(FATAL) << this->type()
-			<< "Layer cannot backpropagate to filter index inputs";
+				<< "Layer cannot backpropagate to filter index inputs";
 	}
 	for (int i = 0; i < top.size(); i++) {
 		// bottom[last] is the selector and never needs backpropagation
@@ -99,17 +99,17 @@ void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 					// we already visited all items that were been forwarded, so
 					// just set to zero remaining ones
 					caffe_set(dim, Dtype(0),
-						bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+							bottom[i]->mutable_cpu_diff() + data_offset_bottom);
 				} else {
 					batch_offset = indices_to_forward_[next_to_backward_offset];
 					if (n != batch_offset) {  // this data was not been forwarded
 						caffe_set(dim, Dtype(0),
-							bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+								bottom[i]->mutable_cpu_diff() + data_offset_bottom);
 					} else {  // this data was been forwarded
 						data_offset_top = next_to_backward_offset * dim;
 						next_to_backward_offset++;  // point to next forwarded item index
 						caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top,
-							bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+								bottom[i]->mutable_cpu_diff() + data_offset_bottom);
 					}
 				}
 			}
@@ -117,9 +117,9 @@ void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	int new_tops_num = indices_to_forward_.size();
 	// forward all filtered items for all bottoms but the Selector (bottom[last])
 	for (int t = 0; t < top.size(); ++t) {
@@ -130,17 +130,17 @@ void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 			int data_offset_top = n * dim;
 			int data_offset_bottom = indices_to_forward_[n] * dim;
 			caffe_copy(dim, bottom_data + data_offset_bottom,
-				top_data + data_offset_top);
+					top_data + data_offset_top);
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[bottom.size() - 1]) {
 		LOG(FATAL) << this->type()
-			<< "Layer cannot backpropagate to filter index inputs";
+				<< "Layer cannot backpropagate to filter index inputs";
 	}
 	for (int i = 0; i < top.size(); ++i) {
 		// bottom[last] is the selector and never needs backpropagation
@@ -157,18 +157,18 @@ void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 					// just set to zero remaining ones
 					data_offset_bottom = n * dim;
 					caffe_gpu_set(dim, Dtype(0),
-						bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+							bottom[i]->mutable_gpu_diff() + data_offset_bottom);
 				} else {
 					batch_offset = indices_to_forward_[next_to_backward_offset];
 					data_offset_bottom = n * dim;
 					if (n != batch_offset) {  // this data was not been forwarded
 						caffe_gpu_set(dim, Dtype(0),
-							bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+								bottom[i]->mutable_gpu_diff() + data_offset_bottom);
 					} else {  // this data was been forwarded
 						data_offset_top = next_to_backward_offset * dim;
 						++next_to_backward_offset;  // point to next forwarded item index
 						caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top,
-							bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+								bottom[i]->mutable_gpu_diff() + data_offset_bottom);
 					}
 				}
 			}
diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp
index 4aaad3a4..e79e9406 100644
--- a/src/caffe/layers/flatten_layer.cpp
+++ b/src/caffe/layers/flatten_layer.cpp
@@ -6,13 +6,13 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void FlattenLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int start_axis = bottom[0]->CanonicalAxisIndex(
-		this->layer_param_.flatten_param().axis());
+			this->layer_param_.flatten_param().axis());
 	const int end_axis = bottom[0]->CanonicalAxisIndex(
-		this->layer_param_.flatten_param().end_axis());
+			this->layer_param_.flatten_param().end_axis());
 	vector<int> top_shape;
 	for (int i = 0; i < start_axis; ++i) {
 		top_shape.push_back(bottom[0]->shape(i));
@@ -26,15 +26,15 @@ void FlattenLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	CHECK_EQ(top[0]->count(), bottom[0]->count());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void FlattenLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	top[0]->ShareData(*bottom[0]);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void FlattenLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	bottom[0]->ShareDiff(*top[0]);
 }
 
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 377755b9..6f67dc06 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -20,12 +20,12 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 HDF5DataLayer<Dtype>::~HDF5DataLayer<Dtype>() {
 }
 
 // Load data and label from HDF5 filename into the class property blobs.
-template<typename Dtype>
+template <typename Dtype>
 void HDF5DataLayer<Dtype>::LoadHDF5FileData(const char* filename) {
 	DLOG(INFO) << "Loading HDF5 file: " << filename;
 	hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
@@ -42,7 +42,7 @@ void HDF5DataLayer<Dtype>::LoadHDF5FileData(const char* filename) {
 	for (int i = 0; i < top_size; ++i) {
 		hdf_blobs_[i] = shared_ptr < Blob<Dtype> > (new Blob<Dtype>());
 		hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(),
-			MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get());
+				MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get());
 	}
 
 	herr_t status = H5Fclose(file_id);
@@ -64,18 +64,18 @@ void HDF5DataLayer<Dtype>::LoadHDF5FileData(const char* filename) {
 	if (this->layer_param_.hdf5_data_param().shuffle()) {
 		std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
 		DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0)
-			<< " rows (shuffled)";
+				<< " rows (shuffled)";
 	} else {
 		DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows";
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5DataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	// Refuse transformation parameters since HDF5 is totally generic.
 	CHECK(!this->layer_param_.has_transform_param()) <<
-		this->type() << " does not transform data.";
+			this->type() << " does not transform data.";
 	// Read the source to parse the filenames.
 	const string& source = this->layer_param_.hdf5_data_param().source();
 	LOG(INFO) << "Loading list of HDF5 filenames from: " << source;
@@ -94,7 +94,7 @@ void HDF5DataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	current_file_ = 0;
 	LOG(INFO) << "Number of HDF5 files: " << num_files_;
 	CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in "
-		<< source;
+			<< source;
 
 	file_permutation_.clear();
 	file_permutation_.resize(num_files_);
@@ -126,9 +126,9 @@ void HDF5DataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
 	for (int i = 0; i < batch_size; ++i, ++current_row_) {
 		if (current_row_ == hdf_blobs_[0]->shape(0)) {
@@ -138,12 +138,12 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 					current_file_ = 0;
 					if (this->layer_param_.hdf5_data_param().shuffle()) {
 						std::random_shuffle(file_permutation_.begin(),
-							file_permutation_.end());
+								file_permutation_.end());
 					}
 					DLOG(INFO) << "Looping around to first file.";
 				}
 				LoadHDF5FileData(
-					hdf_filenames_[file_permutation_[current_file_]].c_str());
+						hdf_filenames_[file_permutation_[current_file_]].c_str());
 			}
 			current_row_ = 0;
 			if (this->layer_param_.hdf5_data_param().shuffle())
@@ -152,15 +152,15 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 		for (int j = 0; j < this->layer_param_.top_size(); ++j) {
 			int data_dim = top[j]->count() / top[j]->shape(0);
 			caffe_copy(data_dim,
-				&hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-					* data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
+					&hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
+							* data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
 	for (int i = 0; i < batch_size; ++i, ++current_row_) {
 		if (current_row_ == hdf_blobs_[0]->shape(0)) {
@@ -170,12 +170,12 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 					current_file_ = 0;
 					if (this->layer_param_.hdf5_data_param().shuffle()) {
 						std::random_shuffle(file_permutation_.begin(),
-							file_permutation_.end());
+								file_permutation_.end());
 					}
 					DLOG(INFO) << "Looping around to first file.";
 				}
 				LoadHDF5FileData(
-					hdf_filenames_[file_permutation_[current_file_]].c_str());
+						hdf_filenames_[file_permutation_[current_file_]].c_str());
 			}
 			current_row_ = 0;
 			if (this->layer_param_.hdf5_data_param().shuffle())
@@ -184,11 +184,12 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		for (int j = 0; j < this->layer_param_.top_size(); ++j) {
 			int data_dim = top[j]->count() / top[j]->shape(0);
 			OCL_CHECK(
-				clEnqueueWriteBuffer(amdDevice.CommandQueue,
-					(cl_mem) top[j]->mutable_gpu_data(), CL_TRUE,
-					i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim,
-					&hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim],
-					0, NULL, NULL));
+					clEnqueueWriteBuffer(amdDevice.CommandQueue,
+							(cl_mem) top[j]->mutable_gpu_data(), CL_TRUE,
+							i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim,
+							&hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
+									* data_dim],
+							0, NULL, NULL));
 			//caffe_copy(data_dim,
 			//    &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
 			//      * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index cbb8a6fe..baad0dea 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -11,17 +11,17 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5OutputLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	file_name_ = this->layer_param_.hdf5_output_param().file_name();
 	file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT,
-		H5P_DEFAULT);
+			H5P_DEFAULT);
 	CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_;
 	file_opened_ = true;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 HDF5OutputLayer<Dtype>::~HDF5OutputLayer<Dtype>() {
 	if (file_opened_) {
 		herr_t status = H5Fclose(file_id_);
@@ -29,74 +29,76 @@ HDF5OutputLayer<Dtype>::~HDF5OutputLayer<Dtype>() {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5OutputLayer<Dtype>::SaveBlobs() {
 	// TODO: no limit on the number of blobs
 	LOG(INFO) << "Saving HDF5 file " << file_name_;
 	CHECK_EQ(data_blob_.num(), label_blob_.num()) <<
-		"data blob and label blob must have the same batch size";
+			"data blob and label blob must have the same batch size";
 	hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_);
 	hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_);
 	LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows";
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_GE(bottom.size(), 2);
 	CHECK_EQ(bottom[0]->num(), bottom[1]->num());
 	data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-		bottom[0]->height(), bottom[0]->width());
+			bottom[0]->height(), bottom[0]->width());
 	label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
-		bottom[1]->height(), bottom[1]->width());
+			bottom[1]->height(), bottom[1]->width());
 	const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
 	const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
 
 	for (int i = 0; i < bottom[0]->num(); ++i) {
 		caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim],
-			&data_blob_.mutable_cpu_data()[i * data_datum_dim]);
+				&data_blob_.mutable_cpu_data()[i * data_datum_dim]);
 		caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim],
-			&label_blob_.mutable_cpu_data()[i * label_datum_dim]);
+				&label_blob_.mutable_cpu_data()[i * label_datum_dim]);
 	}
 	SaveBlobs();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	return;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_GE(bottom.size(), 2);
 	CHECK_EQ(bottom[0]->num(), bottom[1]->num());
 	data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-		bottom[0]->height(), bottom[0]->width());
+			bottom[0]->height(), bottom[0]->width());
 	label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
-		bottom[1]->height(), bottom[1]->width());
+			bottom[1]->height(), bottom[1]->width());
 	const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
 	const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
 
 	for (int i = 0; i < bottom[0]->num(); ++i) {
 		OCL_CHECK(
-			clEnqueueReadBuffer(amdDevice.CommandQueue,
-				(cl_mem) bottom[0]->gpu_data(), CL_TRUE,
-				i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim,
-				&data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL));
+				clEnqueueReadBuffer(amdDevice.CommandQueue,
+						(cl_mem) bottom[0]->gpu_data(), CL_TRUE,
+						i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim,
+						&data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL));
 		OCL_CHECK(
-			clEnqueueReadBuffer(amdDevice.CommandQueue,
-				(cl_mem) bottom[1]->gpu_data(), CL_TRUE,
-				i * label_datum_dim * sizeof(Dtype), sizeof(Dtype) * label_datum_dim,
-				&label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, NULL));
+				clEnqueueReadBuffer(amdDevice.CommandQueue,
+						(cl_mem) bottom[1]->gpu_data(), CL_TRUE,
+						i * label_datum_dim * sizeof(Dtype),
+						sizeof(Dtype) * label_datum_dim,
+						&label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL,
+						NULL));
 	}
 	SaveBlobs();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	return;
 }
 
diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp
index e01e1d6a..d415bd64 100644
--- a/src/caffe/layers/hinge_loss_layer.cpp
+++ b/src/caffe/layers/hinge_loss_layer.cpp
@@ -10,9 +10,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void HingeLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
 	const Dtype* label = bottom[1]->cpu_data();
@@ -27,7 +27,7 @@ void HingeLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	for (int i = 0; i < num; ++i) {
 		for (int j = 0; j < dim; ++j) {
 			bottom_diff[i * dim + j] = std::max(
-				Dtype(0), 1 + bottom_diff[i * dim + j]);
+					Dtype(0), 1 + bottom_diff[i * dim + j]);
 		}
 	}
 	Dtype* loss = top[0]->mutable_cpu_data();
@@ -43,12 +43,12 @@ void HingeLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void HingeLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[1]) {
 		LOG(FATAL) << this->type()
-			<< " Layer cannot backpropagate to label inputs.";
+				<< " Layer cannot backpropagate to label inputs.";
 	}
 	if (propagate_down[0]) {
 		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index b29e47e2..a8ddc7fe 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -7,24 +7,24 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void Im2colLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	ConvolutionParameter conv_param = this->layer_param_.convolution_param();
 	CHECK(!conv_param.has_kernel_size() !=
-		!(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-		<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+			!(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+			<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
 	CHECK(conv_param.has_kernel_size() ||
-		(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-		<< "For non-square filters both kernel_h and kernel_w are required.";
+			(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+			<< "For non-square filters both kernel_h and kernel_w are required.";
 	CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
-		&& conv_param.has_pad_w())
-		|| (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
-		<< "pad is pad OR pad_h and pad_w are required.";
+			&& conv_param.has_pad_w())
+			|| (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+			<< "pad is pad OR pad_h and pad_w are required.";
 	CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
-		&& conv_param.has_stride_w())
-		|| (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
-		<< "Stride is stride OR stride_h and stride_w are required.";
+			&& conv_param.has_stride_w())
+			|| (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+			<< "Stride is stride OR stride_h and stride_w are required.";
 	if (conv_param.has_kernel_size()) {
 		kernel_h_ = kernel_w_ = conv_param.kernel_size();
 	} else {
@@ -47,65 +47,65 @@ void Im2colLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Im2colLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-		<< "corresponding to (num, channels, height, width)";
+			<< "corresponding to (num, channels, height, width)";
 	channels_ = bottom[0]->channels();
 	height_ = bottom[0]->height();
 	width_ = bottom[0]->width();
 	top[0]->Reshape(
-		bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
-		(height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
-		(width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
+			bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
+			(height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
+			(width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	for (int n = 0; n < bottom[0]->num(); ++n) {
 		im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-			width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-			stride_h_, stride_w_, top_data + top[0]->offset(n));
+				width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+				stride_h_, stride_w_, top_data + top[0]->offset(n));
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* top_diff = top[0]->cpu_diff();
 	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
 	for (int n = 0; n < top[0]->num(); ++n) {
 		col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-			kernel_h_, kernel_w_, pad_h_, pad_w_,
-			stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
+				kernel_h_, kernel_w_, pad_h_, pad_w_,
+				stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	for (int n = 0; n < bottom[0]->num(); ++n) {
 		im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_,
-			width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-			stride_h_, stride_w_, top_data, top[0]->offset(n));
+				width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
+				stride_h_, stride_w_, top_data, top[0]->offset(n));
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* top_diff = top[0]->gpu_diff();
 	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
 	for (int n = 0; n < top[0]->num(); ++n) {
 		col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_,
-			kernel_h_, kernel_w_, pad_h_, pad_w_,
-			stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n));
+				kernel_h_, kernel_w_, pad_h_, pad_w_,
+				stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n));
 	}
 }
 
diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp
index 846bcc34..24ac8ffc 100644
--- a/src/caffe/layers/image_data_layer.cpp
+++ b/src/caffe/layers/image_data_layer.cpp
@@ -15,22 +15,22 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 ImageDataLayer<Dtype>::~ImageDataLayer<Dtype>() {
 	this->JoinPrefetchThread();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int new_height = this->layer_param_.image_data_param().new_height();
 	const int new_width = this->layer_param_.image_data_param().new_width();
 	const bool is_color = this->layer_param_.image_data_param().is_color();
 	string root_folder = this->layer_param_.image_data_param().root_folder();
 
 	CHECK((new_height == 0 && new_width == 0) ||
-		(new_height > 0 && new_width > 0)) << "Current implementation requires "
-		"new_height and new_width to be set at the same time.";
+			(new_height > 0 && new_width > 0)) << "Current implementation requires "
+			"new_height and new_width to be set at the same time.";
 	// Read the file with filenames and labels
 	const string& source = this->layer_param_.image_data_param().source();
 	LOG(INFO) << "Opening file " << source;
@@ -54,14 +54,14 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	// Check if we would need to randomly skip a few data points
 	if (this->layer_param_.image_data_param().rand_skip()) {
 		unsigned int skip = caffe_rng_rand() %
-			this->layer_param_.image_data_param().rand_skip();
+				this->layer_param_.image_data_param().rand_skip();
 		LOG(INFO) << "Skipping first " << skip << " data points.";
 		CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
 		lines_id_ = skip;
 	}
 	// Read an image, and use it to initialize the top blob.
 	cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-		new_height, new_width, is_color);
+			new_height, new_width, is_color);
 	// Use data_transformer to infer the expected blob shape from a cv_image.
 	vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
 	this->transformed_data_.Reshape(top_shape);
@@ -72,23 +72,23 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	top[0]->ReshapeLike(this->prefetch_data_);
 
 	LOG(INFO) << "output data size: " << top[0]->num() << ","
-		<< top[0]->channels() << "," << top[0]->height() << ","
-		<< top[0]->width();
+			<< top[0]->channels() << "," << top[0]->height() << ","
+			<< top[0]->width();
 	// label
 	vector<int> label_shape(1, batch_size);
 	top[1]->Reshape(label_shape);
 	this->prefetch_label_.Reshape(label_shape);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ImageDataLayer<Dtype>::ShuffleImages() {
 	caffe::rng_t* prefetch_rng =
-		static_cast<caffe::rng_t*>(prefetch_rng_->generator());
+			static_cast<caffe::rng_t*>(prefetch_rng_->generator());
 	shuffle(lines_.begin(), lines_.end(), prefetch_rng);
 }
 
 // This function is used to create a thread that prefetches the data.
-template<typename Dtype>
+template <typename Dtype>
 void ImageDataLayer<Dtype>::InternalThreadEntry() {
 	CPUTimer batch_timer;
 	batch_timer.Start();
@@ -107,7 +107,7 @@ void ImageDataLayer<Dtype>::InternalThreadEntry() {
 	// Reshape according to the first image of each batch
 	// on single input batches allows for inputs of varying dimension.
 	cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-		new_height, new_width, is_color);
+			new_height, new_width, is_color);
 	// Use data_transformer to infer the expected blob shape from a cv_img.
 	vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
 	this->transformed_data_.Reshape(top_shape);
@@ -125,7 +125,7 @@ void ImageDataLayer<Dtype>::InternalThreadEntry() {
 		timer.Start();
 		CHECK_GT(lines_size, lines_id_);
 		cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-			new_height, new_width, is_color);
+				new_height, new_width, is_color);
 		CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
 		read_time += timer.MicroSeconds();
 		timer.Start();
diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp
index e5294a7e..21414224 100644
--- a/src/caffe/layers/infogain_loss_layer.cpp
+++ b/src/caffe/layers/infogain_loss_layer.cpp
@@ -10,23 +10,23 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void InfogainLossLayer<Dtype>::LayerSetUp(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::LayerSetUp(bottom, top);
 	if (bottom.size() < 3) {
 		CHECK(this->layer_param_.infogain_loss_param().has_source())
-			<< "Infogain matrix source must be specified.";
+				<< "Infogain matrix source must be specified.";
 		BlobProto blob_proto;
 		ReadProtoFromBinaryFile(
-			this->layer_param_.infogain_loss_param().source(), &blob_proto);
+				this->layer_param_.infogain_loss_param().source(), &blob_proto);
 		infogain_.FromProto(blob_proto);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void InfogainLossLayer<Dtype>::Reshape(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::Reshape(bottom, top);
 	Blob < Dtype > *infogain = NULL;
 	if (bottom.size() < 3) {
@@ -45,9 +45,9 @@ void InfogainLossLayer<Dtype>::Reshape(
 	CHECK_EQ(infogain->width(), dim);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	const Dtype* bottom_label = bottom[1]->cpu_data();
 	const Dtype* infogain_mat = NULL;
@@ -69,17 +69,17 @@ void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void InfogainLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[1]) {
 		LOG(FATAL) << this->type()
-			<< " Layer cannot backpropagate to label inputs.";
+				<< " Layer cannot backpropagate to label inputs.";
 	}
 	if (propagate_down.size() > 2 && propagate_down[2]) {
 		LOG(FATAL) << this->type()
-			<< " Layer cannot backpropagate to infogain inputs.";
+				<< " Layer cannot backpropagate to infogain inputs.";
 	}
 	if (propagate_down[0]) {
 		const Dtype* bottom_data = bottom[0]->cpu_data();
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index e563aa21..3beca42f 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -9,14 +9,14 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int num_output = this->layer_param_.inner_product_param().num_output();
 	bias_term_ = this->layer_param_.inner_product_param().bias_term();
 	N_ = num_output;
 	const int axis = bottom[0]->CanonicalAxisIndex(
-		this->layer_param_.inner_product_param().axis());
+			this->layer_param_.inner_product_param().axis());
 	// Dimensions starting from "axis" are "flattened" into a single
 	// length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
 	// and axis == 1, N inner products with dimension CHW are performed.
@@ -37,29 +37,29 @@ void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 		this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
 		// fill the weights
 		shared_ptr < Filler<Dtype> > weight_filler(GetFiller < Dtype > (
-			this->layer_param_.inner_product_param().weight_filler()));
+				this->layer_param_.inner_product_param().weight_filler()));
 		weight_filler->Fill(this->blobs_[0].get());
 		// If necessary, intiialize and fill the bias term
 		if (bias_term_) {
 			vector<int> bias_shape(1, N_);
 			this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
 			shared_ptr < Filler<Dtype> > bias_filler(GetFiller < Dtype > (
-				this->layer_param_.inner_product_param().bias_filler()));
+					this->layer_param_.inner_product_param().bias_filler()));
 			bias_filler->Fill(this->blobs_[1].get());
 		}
 	}  // parameter initialization
 	this->param_propagate_down_.resize(this->blobs_.size(), true);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void InnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	// Figure out the dimensions
 	const int axis = bottom[0]->CanonicalAxisIndex(
-		this->layer_param_.inner_product_param().axis());
+			this->layer_param_.inner_product_param().axis());
 	const int new_K = bottom[0]->count(axis);
 	CHECK_EQ(K_, new_K)
-		<< "Input size incompatible with inner product parameters.";
+			<< "Input size incompatible with inner product parameters.";
 	// The first "axis" dimensions are independent inner products; the total
 	// number of these is M_, the product over these dimensions.
 	M_ = bottom[0]->count(0, axis);
@@ -77,92 +77,92 @@ void InnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void InnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const Dtype* weight = this->blobs_[0]->cpu_data();
 	caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1.,
-		bottom_data, weight, (Dtype) 0., top_data);
+			bottom_data, weight, (Dtype) 0., top_data);
 	if (bias_term_) {
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1.,
-			bias_multiplier_.cpu_data(),
-			this->blobs_[1]->cpu_data(), (Dtype) 1., top_data);
+				bias_multiplier_.cpu_data(),
+				this->blobs_[1]->cpu_data(), (Dtype) 1., top_data);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (this->param_propagate_down_[0]) {
 		const Dtype* top_diff = top[0]->cpu_diff();
 		const Dtype* bottom_data = bottom[0]->cpu_data();
 		// Gradient with respect to weight
 		caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1.,
-			top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff());
+				top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff());
 	}
 	if (bias_term_ && this->param_propagate_down_[1]) {
 		const Dtype* top_diff = top[0]->cpu_diff();
 		// Gradient with respect to bias
 		caffe_cpu_gemv < Dtype > (CblasTrans, M_, N_, (Dtype) 1., top_diff,
-			bias_multiplier_.cpu_data(), (Dtype) 1.,
-			this->blobs_[1]->mutable_cpu_diff());
+				bias_multiplier_.cpu_data(), (Dtype) 1.,
+				this->blobs_[1]->mutable_cpu_diff());
 	}
 	if (propagate_down[0]) {
 		const Dtype* top_diff = top[0]->cpu_diff();
 		// Gradient with respect to bottom data
 		caffe_cpu_gemm < Dtype
-			> (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1.,
-				top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0.,
-				bottom[0]->mutable_cpu_diff());
+				> (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1.,
+						top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0.,
+						bottom[0]->mutable_cpu_diff());
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const Dtype* weight = this->blobs_[0]->gpu_data();
 	caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1.,
-		bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0);
+			bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0);
 	if (bias_term_) {
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1.,
-			bias_multiplier_.gpu_data(), 0,
-			this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0);
+				bias_multiplier_.gpu_data(), 0,
+				this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (this->param_propagate_down_[0]) {
 		const Dtype* top_diff = top[0]->gpu_diff();
 		const Dtype* bottom_data = bottom[0]->gpu_data();
 		// Gradient with respect to weight
 		caffe_gpu_gemm < Dtype
-			> (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1.,
-				top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0);
+				> (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1.,
+						top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0);
 	}
 	if (bias_term_ && this->param_propagate_down_[1]) {
 		const Dtype* top_diff = top[0]->gpu_diff();
 		// Gradient with respect to bias
 		caffe_gpu_gemv < Dtype
-			> (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff,
-				(size_t) 0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()),
-				(size_t) 0, (Dtype) 0., 1,
-				this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1);
+				> (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff,
+						(size_t) 0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()),
+						(size_t) 0, (Dtype) 0., 1,
+						this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1);
 	}
 	if (propagate_down[0]) {
 		const Dtype* top_diff = top[0]->gpu_diff();
 		// Gradient with respect to bottom data
 		caffe_gpu_gemm < Dtype
-			> (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1.,
-				top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0.,
-				bottom[0]->mutable_gpu_diff(), 0);
+				> (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1.,
+						top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0.,
+						bottom[0]->mutable_gpu_diff(), 0);
 	}
 }
 
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index e388dfef..60b08d99 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -7,9 +7,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void LogLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
 	const Dtype base = this->layer_param_.log_param().base();
 	if (base != Dtype(-1)) {
@@ -19,22 +19,22 @@ void LogLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	// Otherwise, calculate its log explicitly.
 	const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
 	CHECK(!isnan(log_base))
-		<< "NaN result: log(base) = log(" << base << ") = " << log_base;
+			<< "NaN result: log(base) = log(" << base << ") = " << log_base;
 	CHECK(!isinf(log_base))
-		<< "Inf result: log(base) = log(" << base << ") = " << log_base;
+			<< "Inf result: log(base) = log(" << base << ") = " << log_base;
 	base_scale_ = Dtype(1) / log_base;
 	CHECK(!isnan(base_scale_))
-		<< "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
+			<< "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
 	CHECK(!isinf(base_scale_))
-		<< "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
+			<< "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
 	input_scale_ = this->layer_param_.log_param().scale();
 	input_shift_ = this->layer_param_.log_param().shift();
 	backward_num_scale_ = input_scale_ / log_base;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LogLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int count = bottom[0]->count();
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
@@ -55,9 +55,9 @@ void LogLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -79,9 +79,9 @@ void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	caffe_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int count = bottom[0]->count();
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
@@ -102,9 +102,9 @@ void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp
index 503014f5..f5da913a 100644
--- a/src/caffe/layers/loss_layer.cpp
+++ b/src/caffe/layers/loss_layer.cpp
@@ -10,20 +10,20 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void LossLayer<Dtype>::LayerSetUp(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	// LossLayers have a non-zero (1) loss by default.
 	if (this->layer_param_.loss_weight_size() == 0) {
 		this->layer_param_.add_loss_weight(Dtype(1));
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LossLayer<Dtype>::Reshape(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	CHECK_EQ(bottom[0]->num(), bottom[1]->num())
-		<< "The data and label should have the same number.";
+			<< "The data and label should have the same number.";
 	vector<int> loss_shape(0);  // Loss layers output a scalar; 0 axes.
 	top[0]->Reshape(loss_shape);
 }
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 58f835b6..2dfcd645 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -8,9 +8,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	size_ = this->layer_param_.lrn_param().local_size();
 	CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size";
 	pre_pad_ = (size_ - 1) / 2;
@@ -18,7 +18,7 @@ void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	beta_ = this->layer_param_.lrn_param().beta();
 	k_ = this->layer_param_.lrn_param().k();
 	if (this->layer_param_.lrn_param().norm_region() ==
-		LRNParameter_NormRegion_WITHIN_CHANNEL) {
+			LRNParameter_NormRegion_WITHIN_CHANNEL) {
 		// Set up split_layer_ to use inputs in the numerator and denominator.
 		split_top_vec_.clear();
 		split_top_vec_.push_back(&product_input_);
@@ -40,7 +40,7 @@ void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 		pool_top_vec_.push_back(&pool_output_);
 		LayerParameter pool_param;
 		pool_param.mutable_pooling_param()->set_pool(
-			PoolingParameter_PoolMethod_AVE);
+				PoolingParameter_PoolMethod_AVE);
 		pool_param.mutable_pooling_param()->set_pad(pre_pad_);
 		pool_param.mutable_pooling_param()->set_kernel_size(size_);
 		pool_layer_.reset(new PoolingLayer<Dtype>(pool_param));
@@ -68,33 +68,33 @@ void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  num_ = bottom[0]->num();
-  channels_ = bottom[0]->channels();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
-  switch (this->layer_param_.lrn_param().norm_region()) {
-  case LRNParameter_NormRegion_ACROSS_CHANNELS:
-    top[0]->Reshape(num_, channels_, height_, width_);
-    scale_.Reshape(num_, channels_, height_, width_);
-    break;
-  case LRNParameter_NormRegion_WITHIN_CHANNEL:
-    split_layer_->Reshape(bottom, split_top_vec_);
-    square_layer_->Reshape(square_bottom_vec_, square_top_vec_);
-    pool_layer_->Reshape(square_top_vec_, pool_top_vec_);
-    power_layer_->Reshape(pool_top_vec_, power_top_vec_);
-    product_layer_->Reshape(product_bottom_vec_, top);
-    break;
-  }
+		const vector<Blob<Dtype>*>& top) {
+	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+			<< "corresponding to (num, channels, height, width)";
+	num_ = bottom[0]->num();
+	channels_ = bottom[0]->channels();
+	height_ = bottom[0]->height();
+	width_ = bottom[0]->width();
+	switch (this->layer_param_.lrn_param().norm_region()) {
+		case LRNParameter_NormRegion_ACROSS_CHANNELS:
+			top[0]->Reshape(num_, channels_, height_, width_);
+			scale_.Reshape(num_, channels_, height_, width_);
+			break;
+		case LRNParameter_NormRegion_WITHIN_CHANNEL:
+			split_layer_->Reshape(bottom, split_top_vec_);
+			square_layer_->Reshape(square_bottom_vec_, square_top_vec_);
+			pool_layer_->Reshape(square_top_vec_, pool_top_vec_);
+			power_layer_->Reshape(pool_top_vec_, power_top_vec_);
+			product_layer_->Reshape(product_bottom_vec_, top);
+			break;
+	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	switch (this->layer_param_.lrn_param().norm_region()) {
 		case LRNParameter_NormRegion_ACROSS_CHANNELS:
 			CrossChannelForward_cpu(bottom, top);
@@ -107,9 +107,9 @@ void LRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_cpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	Dtype* scale_data = scale_.mutable_cpu_data();
@@ -125,27 +125,27 @@ void LRNLayer<Dtype>::CrossChannelForward_cpu(
 	for (int n = 0; n < num_; ++n) {
 		// compute the padded square
 		caffe_sqr(channels_ * height_ * width_,
-			bottom_data + bottom[0]->offset(n),
-			padded_square_data + padded_square.offset(0, pre_pad_));
+				bottom_data + bottom[0]->offset(n),
+				padded_square_data + padded_square.offset(0, pre_pad_));
 		// Create the first channel scale
 		for (int c = 0; c < size_; ++c) {
 			caffe_axpy < Dtype > (height_ * width_, alpha_over_size,
-				padded_square_data + padded_square.offset(0, c),
-				scale_data + scale_.offset(n, 0));
+					padded_square_data + padded_square.offset(0, c),
+					scale_data + scale_.offset(n, 0));
 		}
 		for (int c = 1; c < channels_; ++c) {
 			// copy previous scale
 			caffe_copy < Dtype > (height_ * width_,
-				scale_data + scale_.offset(n, c - 1),
-				scale_data + scale_.offset(n, c));
+					scale_data + scale_.offset(n, c - 1),
+					scale_data + scale_.offset(n, c));
 			// add head
 			caffe_axpy < Dtype > (height_ * width_, alpha_over_size,
-				padded_square_data + padded_square.offset(0, c + size_ - 1),
-				scale_data + scale_.offset(n, c));
+					padded_square_data + padded_square.offset(0, c + size_ - 1),
+					scale_data + scale_.offset(n, c));
 			// subtract tail
 			caffe_axpy < Dtype > (height_ * width_, -alpha_over_size,
-				padded_square_data + padded_square.offset(0, c - 1),
-				scale_data + scale_.offset(n, c));
+					padded_square_data + padded_square.offset(0, c - 1),
+					scale_data + scale_.offset(n, c));
 		}
 	}
 
@@ -154,9 +154,9 @@ void LRNLayer<Dtype>::CrossChannelForward_cpu(
 	caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::WithinChannelForward(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	split_layer_->Forward(bottom, split_top_vec_);
 	square_layer_->Forward(square_bottom_vec_, square_top_vec_);
 	pool_layer_->Forward(square_top_vec_, pool_top_vec_);
@@ -164,9 +164,9 @@ void LRNLayer<Dtype>::WithinChannelForward(
 	product_layer_->Forward(product_bottom_vec_, top);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	switch (this->layer_param_.lrn_param().norm_region()) {
 		case LRNParameter_NormRegion_ACROSS_CHANNELS:
 			CrossChannelBackward_cpu(top, propagate_down, bottom);
@@ -179,10 +179,10 @@ void LRNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelBackward_cpu(
-	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* top_diff = top[0]->cpu_diff();
 	const Dtype* top_data = top[0]->cpu_data();
 	const Dtype* bottom_data = bottom[0]->cpu_data();
@@ -206,82 +206,82 @@ void LRNLayer<Dtype>::CrossChannelBackward_cpu(
 		int block_offset = scale_.offset(n);
 		// first, compute diff_i * y_i / s_i
 		caffe_mul < Dtype > (channels_ * height_ * width_,
-			top_diff + block_offset, top_data + block_offset,
-			padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
+				top_diff + block_offset, top_data + block_offset,
+				padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
 		caffe_div < Dtype > (channels_ * height_ * width_,
-			padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),
-			scale_data + block_offset,
-			padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
+				padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),
+				scale_data + block_offset,
+				padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
 		// Now, compute the accumulated ratios and the bottom diff
 		caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);
 		for (int c = 0; c < size_ - 1; ++c) {
 			caffe_axpy < Dtype > (height_ * width_, 1.,
-				padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
+					padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
 		}
 		for (int c = 0; c < channels_; ++c) {
 			caffe_axpy < Dtype > (height_ * width_, 1.,
-				padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),
-				accum_ratio_data);
+					padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),
+					accum_ratio_data);
 			// compute bottom diff
 			caffe_mul < Dtype > (height_ * width_,
-				bottom_data + top[0]->offset(n, c),
-				accum_ratio_data, accum_ratio_times_bottom);
+					bottom_data + top[0]->offset(n, c),
+					accum_ratio_data, accum_ratio_times_bottom);
 			caffe_axpy < Dtype > (height_ * width_, -cache_ratio_value,
-				accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));
+					accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));
 			caffe_axpy < Dtype > (height_ * width_, -1.,
-				padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
+					padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::WithinChannelBackward(
-	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		vector<bool> product_propagate_down(2, true);
 		product_layer_->Backward(top, product_propagate_down, product_bottom_vec_);
 		power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_);
 		pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_);
 		square_layer_->Backward(square_top_vec_, propagate_down,
-			square_bottom_vec_);
+				square_bottom_vec_);
 		split_layer_->Backward(split_top_vec_, propagate_down, bottom);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_gpu(
-    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, compute scale
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  Dtype* scale_data = scale_.mutable_gpu_data();
-  // We will launch one kernel for each pixel location, and have the kernel
-  // go through all the channels.
-  int n_threads = num_ * height_ * width_;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_,
-      alpha_ / size_, k_, scale_data);
-  n_threads = bottom[0]->count();
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data);
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+	// First, compute scale
+	const Dtype* bottom_data = bottom[0]->gpu_data();
+	Dtype* top_data = top[0]->mutable_gpu_data();
+	Dtype* scale_data = scale_.mutable_gpu_data();
+	// We will launch one kernel for each pixel location, and have the kernel
+	// go through all the channels.
+	int n_threads = num_ * height_ * width_;
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_,
+			alpha_ / size_, k_, scale_data);
+	n_threads = bottom[0]->count();
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelBackward_gpu(
-    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  int n_threads = num_ * height_ * width_;
-  // NOLINT_NEXT_LINE(whitespace/operators)
-  LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
-      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
-      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
-      bottom[0]->mutable_gpu_diff());
+		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
+	int n_threads = num_ * height_ * width_;
+	// NOLINT_NEXT_LINE(whitespace/operators)
+	LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
+			scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
+			size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+			bottom[0]->mutable_gpu_diff());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	switch (this->layer_param_.lrn_param().norm_region()) {
 		case LRNParameter_NormRegion_ACROSS_CHANNELS:
 			CrossChannelForward_gpu(bottom, top);
@@ -294,9 +294,9 @@ void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	switch (this->layer_param_.lrn_param().norm_region()) {
 		case LRNParameter_NormRegion_ACROSS_CHANNELS:
 			CrossChannelBackward_gpu(top, propagate_down, bottom);
diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp
index 2cd04f93..e3b12908 100644
--- a/src/caffe/layers/memory_data_layer.cpp
+++ b/src/caffe/layers/memory_data_layer.cpp
@@ -8,17 +8,17 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void MemoryDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	batch_size_ = this->layer_param_.memory_data_param().batch_size();
 	channels_ = this->layer_param_.memory_data_param().channels();
 	height_ = this->layer_param_.memory_data_param().height();
 	width_ = this->layer_param_.memory_data_param().width();
 	size_ = channels_ * height_ * width_;
 	CHECK_GT(batch_size_ * size_, 0) <<
-		"batch_size, channels, height, and width must be specified and"
-			" positive in memory_data_param";
+			"batch_size, channels, height, and width must be specified and"
+					" positive in memory_data_param";
 	vector<int> label_shape(1, batch_size_);
 	top[0]->Reshape(batch_size_, channels_, height_, width_);
 	top[1]->Reshape(label_shape);
@@ -30,14 +30,14 @@ void MemoryDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	added_label_.cpu_data();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MemoryDataLayer<Dtype>::AddDatumVector(const vector<Datum>& datum_vector) {
 	CHECK(!has_new_data_) <<
-		"Can't add data until current data has been consumed.";
+			"Can't add data until current data has been consumed.";
 	size_t num = datum_vector.size();
 	CHECK_GT(num, 0) << "There is no datum to add.";
 	CHECK_EQ(num % batch_size_, 0) <<
-		"The added data must be a multiple of the batch size.";
+			"The added data must be a multiple of the batch size.";
 	added_data_.Reshape(num, channels_, height_, width_);
 	added_label_.Reshape(num, 1, 1, 1);
 	// Apply data transformations (mirror, scale, crop...)
@@ -53,15 +53,15 @@ void MemoryDataLayer<Dtype>::AddDatumVector(const vector<Datum>& datum_vector) {
 	has_new_data_ = true;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MemoryDataLayer<Dtype>::AddMatVector(const vector<cv::Mat>& mat_vector,
-	const vector<int>& labels) {
+		const vector<int>& labels) {
 	size_t num = mat_vector.size();
 	CHECK(!has_new_data_) <<
-		"Can't add mat until current data has been consumed.";
+			"Can't add mat until current data has been consumed.";
 	CHECK_GT(num, 0) << "There is no mat to add";
 	CHECK_EQ(num % batch_size_, 0) <<
-		"The added data must be a multiple of the batch size.";
+			"The added data must be a multiple of the batch size.";
 	added_data_.Reshape(num, channels_, height_, width_);
 	added_label_.Reshape(num, 1, 1, 1);
 	// Apply data transformations (mirror, scale, crop...)
@@ -77,7 +77,7 @@ void MemoryDataLayer<Dtype>::AddMatVector(const vector<cv::Mat>& mat_vector,
 	has_new_data_ = true;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MemoryDataLayer<Dtype>::Reset(Dtype* data, Dtype* labels, int n) {
 	CHECK(data);
 	CHECK(labels);
@@ -93,18 +93,18 @@ void MemoryDataLayer<Dtype>::Reset(Dtype* data, Dtype* labels, int n) {
 	pos_ = 0;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MemoryDataLayer<Dtype>::set_batch_size(int new_size) {
 	CHECK(!has_new_data_) <<
-		"Can't change batch_size until current data has been consumed.";
+			"Can't change batch_size until current data has been consumed.";
 	batch_size_ = new_size;
 	added_data_.Reshape(batch_size_, channels_, height_, width_);
 	added_label_.Reshape(batch_size_, 1, 1, 1);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MemoryDataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset";
 	top[0]->Reshape(batch_size_, channels_, height_, width_);
 	top[1]->Reshape(batch_size_, 1, 1, 1);
diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp
index 5e57cf85..358ed891 100644
--- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp
+++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp
@@ -10,18 +10,18 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Reshape(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::Reshape(bottom, top);
 	CHECK_EQ(bottom[1]->channels(), 1);
 	CHECK_EQ(bottom[1]->height(), 1);
 	CHECK_EQ(bottom[1]->width(), 1);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Forward_cpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	const Dtype* bottom_label = bottom[1]->cpu_data();
 	int num = bottom[0]->num();
@@ -30,19 +30,19 @@ void MultinomialLogisticLossLayer<Dtype>::Forward_cpu(
 	for (int i = 0; i < num; ++i) {
 		int label = static_cast<int>(bottom_label[i]);
 		Dtype prob = std::max(
-			bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
+				bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
 		loss -= log(prob);
 	}
 	top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Backward_cpu(
-	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[1]) {
 		LOG(FATAL) << this->type()
-			<< " Layer cannot backpropagate to label inputs.";
+				<< " Layer cannot backpropagate to label inputs.";
 	}
 	if (propagate_down[0]) {
 		const Dtype* bottom_data = bottom[0]->cpu_data();
@@ -55,7 +55,7 @@ void MultinomialLogisticLossLayer<Dtype>::Backward_cpu(
 		for (int i = 0; i < num; ++i) {
 			int label = static_cast<int>(bottom_label[i]);
 			Dtype prob = std::max(
-				bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
+					bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
 			bottom_diff[i * dim + label] = scale / prob;
 		}
 	}
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index 0bd4e989..0a6613d7 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -7,27 +7,27 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void MVNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
-		bottom[0]->height(), bottom[0]->width());
+			bottom[0]->height(), bottom[0]->width());
 	mean_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-		1, 1);
+			1, 1);
 	variance_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-		1, 1);
+			1, 1);
 	temp_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-		bottom[0]->height(), bottom[0]->width());
+			bottom[0]->height(), bottom[0]->width());
 	sum_multiplier_.Reshape(1, 1,
-		bottom[0]->height(), bottom[0]->width());
+			bottom[0]->height(), bottom[0]->width());
 	Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
 	caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
 	eps_ = this->layer_param_.mvn_param().eps();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MVNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	int num;
@@ -41,56 +41,56 @@ void MVNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	if (this->layer_param_.mvn_param().normalize_variance()) {
 		// put the squares of bottom into temp_
 		caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
-			temp_.mutable_cpu_data());
+				temp_.mutable_cpu_data());
 
 		// computes variance using var(X) = E(X^2) - (EX)^2
 		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
-			sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
+				sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
 		caffe_cpu_gemv < Dtype
-			> (CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(),
-				sum_multiplier_.cpu_data(), 0.,
-				variance_.mutable_cpu_data());  // E(X^2)
+				> (CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(),
+						sum_multiplier_.cpu_data(), 0.,
+						variance_.mutable_cpu_data());  // E(X^2)
 		caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
-			temp_.mutable_cpu_data());  // (EX)^2
+				temp_.mutable_cpu_data());  // (EX)^2
 		caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
-			variance_.mutable_cpu_data());  // variance
+				variance_.mutable_cpu_data());  // variance
 
 		// do mean and variance normalization
 		// subtract mean
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-			mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-			temp_.mutable_cpu_data());
+				mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+				temp_.mutable_cpu_data());
 
 		caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
 
 		// normalize variance
 		caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
-			variance_.mutable_cpu_data());
+				variance_.mutable_cpu_data());
 
 		caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
 
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-			variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-			temp_.mutable_cpu_data());
+				variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+				temp_.mutable_cpu_data());
 
 		caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
 	} else {
 		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
-			sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
+				sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
 
 		// subtract mean
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-			mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-			temp_.mutable_cpu_data());
+				mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+				temp_.mutable_cpu_data());
 
 		caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* top_diff = top[0]->cpu_diff();
 	const Dtype* top_data = top[0]->cpu_data();
 	const Dtype* bottom_data = bottom[0]->cpu_data();
@@ -107,27 +107,27 @@ void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	if (this->layer_param_.mvn_param().normalize_variance()) {
 		caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
 		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff,
-			sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
+				sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-			mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-			bottom_diff);
+				mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+				bottom_diff);
 		caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
 
 		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff,
-			sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
+				sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-			mean_.cpu_data(), sum_multiplier_.cpu_data(), 1.,
-			bottom_diff);
+				mean_.cpu_data(), sum_multiplier_.cpu_data(), 1.,
+				bottom_diff);
 
 		caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
-			bottom_diff);
+				bottom_diff);
 
 		// put the squares of bottom into temp_
 		caffe_powx(temp_.count(), bottom_data, Dtype(2),
-			temp_.mutable_cpu_data());
+				temp_.mutable_cpu_data());
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-			variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-			temp_.mutable_cpu_data());
+				variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+				temp_.mutable_cpu_data());
 
 		caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
 	} else {
@@ -135,9 +135,9 @@ void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	int num;
@@ -151,55 +151,55 @@ void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	if (this->layer_param_.mvn_param().normalize_variance()) {
 		// put the squares of bottom into temp_
 		caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
-			temp_.mutable_gpu_data());
+				temp_.mutable_gpu_data());
 
 		// computes variance using var(X) = E(X^2) - (EX)^2
 		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
-			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
+				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
 		caffe_gpu_gemv < Dtype
-			> (CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
-				sum_multiplier_.gpu_data(), 0.,
-				variance_.mutable_gpu_data());  // E(X^2)
+				> (CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
+						sum_multiplier_.gpu_data(), 0.,
+						variance_.mutable_gpu_data());  // E(X^2)
 		caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
-			temp_.mutable_gpu_data());  // (EX)^2
+				temp_.mutable_gpu_data());  // (EX)^2
 		caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
-			variance_.mutable_gpu_data());  // variance
+				variance_.mutable_gpu_data());  // variance
 
 		// do mean and variance normalization
 		// subtract mean
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-			mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-			temp_.mutable_gpu_data());
+				mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+				temp_.mutable_gpu_data());
 
 		caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
 
 		// normalize variance
 		caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
-			variance_.mutable_gpu_data());
+				variance_.mutable_gpu_data());
 
 		caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
 
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-			variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-			temp_.mutable_gpu_data());
+				variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+				temp_.mutable_gpu_data());
 
 		caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
 	} else {
 		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
-			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
+				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
 
 		// subtract mean
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-			mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-			temp_.mutable_gpu_data());
+				mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+				temp_.mutable_gpu_data());
 
 		caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* top_diff = top[0]->gpu_diff();
 	const Dtype* top_data = top[0]->gpu_data();
 	const Dtype* bottom_data = bottom[0]->gpu_data();
@@ -216,36 +216,36 @@ void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 	if (this->layer_param_.mvn_param().normalize_variance()) {
 		caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
 		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff,
-			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-			mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-			bottom_diff);
+				mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+				bottom_diff);
 		caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
 
 		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff,
-			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-			mean_.gpu_data(), sum_multiplier_.gpu_data(), 1.,
-			bottom_diff);
+				mean_.gpu_data(), sum_multiplier_.gpu_data(), 1.,
+				bottom_diff);
 
 		caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
-			bottom_diff);
+				bottom_diff);
 
 		// put the squares of bottom into temp_
 		caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2),
-			temp_.mutable_gpu_data());
+				temp_.mutable_gpu_data());
 
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-			variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-			temp_.mutable_gpu_data());
+				variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+				temp_.mutable_gpu_data());
 
 		caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
 	} else {
 		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, top_diff,
-			sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
 		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-			mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-			temp_.mutable_gpu_data());
+				mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+				temp_.mutable_gpu_data());
 		caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff);
 	}
 }
diff --git a/src/caffe/layers/neuron_layer.cpp b/src/caffe/layers/neuron_layer.cpp
index 2a0a2088..a9edeffd 100644
--- a/src/caffe/layers/neuron_layer.cpp
+++ b/src/caffe/layers/neuron_layer.cpp
@@ -5,9 +5,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void NeuronLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	top[0]->ReshapeLike(*bottom[0]);
 }
 
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index d66a24f6..92c71582 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -13,30 +13,30 @@ namespace caffe {
 using std::min;
 using std::max;
 
-template<typename Dtype>
+template <typename Dtype>
 void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	PoolingParameter pool_param = this->layer_param_.pooling_param();
 	if (pool_param.global_pooling()) {
 		CHECK(!(pool_param.has_kernel_size() ||
-			pool_param.has_kernel_h() || pool_param.has_kernel_w()))
-			<< "With Global_pooling: true Filter size cannot specified";
+				pool_param.has_kernel_h() || pool_param.has_kernel_w()))
+				<< "With Global_pooling: true Filter size cannot specified";
 	} else {
 		CHECK(!pool_param.has_kernel_size() !=
-			!(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
-			<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+				!(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+				<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
 		CHECK(pool_param.has_kernel_size() ||
-			(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
-			<< "For non-square filters both kernel_h and kernel_w are required.";
+				(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+				<< "For non-square filters both kernel_h and kernel_w are required.";
 	}
 	CHECK((!pool_param.has_pad() && pool_param.has_pad_h()
-		&& pool_param.has_pad_w())
-		|| (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
-		<< "pad is pad OR pad_h and pad_w are required.";
+			&& pool_param.has_pad_w())
+			|| (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
+			<< "pad is pad OR pad_h and pad_w are required.";
 	CHECK((!pool_param.has_stride() && pool_param.has_stride_h()
-		&& pool_param.has_stride_w())
-		|| (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
-		<< "Stride is stride OR stride_h and stride_w are required.";
+			&& pool_param.has_stride_w())
+			|| (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
+			<< "Stride is stride OR stride_h and stride_w are required.";
 	global_pooling_ = pool_param.global_pooling();
 	if (global_pooling_) {
 		kernel_h_ = bottom[0]->height();
@@ -65,24 +65,24 @@ void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 	if (global_pooling_) {
 		CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
-			<< "With Global_pooling: true; only pad = 0 and stride = 1";
+				<< "With Global_pooling: true; only pad = 0 and stride = 1";
 	}
 	if (pad_h_ != 0 || pad_w_ != 0) {
 		CHECK(this->layer_param_.pooling_param().pool()
-			== PoolingParameter_PoolMethod_AVE
-			|| this->layer_param_.pooling_param().pool()
-				== PoolingParameter_PoolMethod_MAX)
-			<< "Padding implemented only for average and max pooling.";
+				== PoolingParameter_PoolMethod_AVE
+				|| this->layer_param_.pooling_param().pool()
+						== PoolingParameter_PoolMethod_MAX)
+				<< "Padding implemented only for average and max pooling.";
 		CHECK_LT(pad_h_, kernel_h_);
 		CHECK_LT(pad_w_, kernel_w_);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-		<< "corresponding to (num, channels, height, width)";
+			<< "corresponding to (num, channels, height, width)";
 	channels_ = bottom[0]->channels();
 	height_ = bottom[0]->height();
 	width_ = bottom[0]->width();
@@ -91,9 +91,9 @@ void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 		kernel_w_ = bottom[0]->width();
 	}
 	pooled_height_ = static_cast<int>(ceil(static_cast<float>(
-		height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
+			height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
 	pooled_width_ = static_cast<int>(ceil(static_cast<float>(
-		width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
+			width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
 	if (pad_h_ || pad_w_) {
 		// If we have padding, ensure that the last pooling starts strictly
 		// inside the image (instead of at the padding); otherwise clip the last.
@@ -107,29 +107,29 @@ void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 		CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_);
 	}
 	top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_,
-		pooled_width_);
+			pooled_width_);
 	if (top.size() > 1) {
 		top[1]->ReshapeLike(*top[0]);
 	}
 	// If max pooling, we will initialize the vector index part.
 	if (this->layer_param_.pooling_param().pool() ==
-		PoolingParameter_PoolMethod_MAX && top.size() == 1) {
+			PoolingParameter_PoolMethod_MAX && top.size() == 1) {
 		max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-			pooled_width_);
+				pooled_width_);
 	}
 	// If stochastic pooling, we will initialize the random index part.
 	if (this->layer_param_.pooling_param().pool() ==
-		PoolingParameter_PoolMethod_STOCHASTIC) {
+			PoolingParameter_PoolMethod_STOCHASTIC) {
 		rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-			pooled_width_);
+				pooled_width_);
 	}
 }
 
 // TODO(Yangqing): Is there a faster way to do pooling in the channel-first
 // case?
-template<typename Dtype>
+template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const int top_count = top[0]->count();
@@ -209,7 +209,7 @@ void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 							for (int h = hstart; h < hend; ++h) {
 								for (int w = wstart; w < wend; ++w) {
 									top_data[ph * pooled_width_ + pw] +=
-										bottom_data[h * width_ + w];
+											bottom_data[h * width_ + w];
 								}
 							}
 							top_data[ph * pooled_width_ + pw] /= pool_size;
@@ -229,9 +229,9 @@ void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -258,7 +258,7 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 						for (int pw = 0; pw < pooled_width_; ++pw) {
 							const int index = ph * pooled_width_ + pw;
 							const int bottom_index =
-								use_top_mask ? top_mask[index] : mask[index];
+									use_top_mask ? top_mask[index] : mask[index];
 							bottom_diff[bottom_index] += top_diff[index];
 						}
 					}
@@ -290,7 +290,7 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 							for (int h = hstart; h < hend; ++h) {
 								for (int w = wstart; w < wend; ++w) {
 									bottom_diff[h * width_ + w] +=
-										top_diff[ph * pooled_width_ + pw] / pool_size;
+											top_diff[ph * pooled_width_ + pw] / pool_size;
 								}
 							}
 						}
@@ -309,9 +309,9 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	//Forward_cpu(bottom, top);
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
@@ -329,31 +329,31 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 			}
 			// NOLINT_NEXT_LINE(whitespace/operators)
 			MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_,
-				height_, width_, pooled_height_, pooled_width_, kernel_h_,
-				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
-				mask, top_mask);
+					height_, width_, pooled_height_, pooled_width_, kernel_h_,
+					kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
+					mask, top_mask);
 			break;
 		case PoolingParameter_PoolMethod_AVE:
 			// NOLINT_NEXT_LINE(whitespace/operators)
 			AvePoolForward(count, bottom_data, bottom[0]->num(), channels_,
-				height_, width_, pooled_height_, pooled_width_, kernel_h_,
-				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
+					height_, width_, pooled_height_, pooled_width_, kernel_h_,
+					kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
 			break;
 		case PoolingParameter_PoolMethod_STOCHASTIC:
 			if (this->phase_ == TRAIN) {
 				// We need to create the random index as well.
 				caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
-					rand_idx_.mutable_gpu_data());
+						rand_idx_.mutable_gpu_data());
 				// NOLINT_NEXT_LINE(whitespace/operators)
 				StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_,
-					height_, width_, pooled_height_, pooled_width_, kernel_h_,
-					kernel_w_, stride_h_, stride_w_,
-					rand_idx_.mutable_gpu_data(), top_data);
+						height_, width_, pooled_height_, pooled_width_, kernel_h_,
+						kernel_w_, stride_h_, stride_w_,
+						rand_idx_.mutable_gpu_data(), top_data);
 			} else {
 				// NOLINT_NEXT_LINE(whitespace/operators)
 				StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_,
-					height_, width_, pooled_height_, pooled_width_, kernel_h_,
-					kernel_w_, stride_h_, stride_w_, top_data);
+						height_, width_, pooled_height_, pooled_width_, kernel_h_,
+						kernel_w_, stride_h_, stride_w_, top_data);
 			}
 			break;
 		default:
@@ -361,9 +361,9 @@ void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	//Backward_cpu(top, propagate_down, bottom);
 	if (!propagate_down[0]) {
 		return;
@@ -385,22 +385,22 @@ void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 			}
 			// NOLINT_NEXT_LINE(whitespace/operators)
 			MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_,
-				height_, width_, pooled_height_, pooled_width_,
-				kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
-				bottom_diff);
+					height_, width_, pooled_height_, pooled_width_,
+					kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
+					bottom_diff);
 			break;
 		case PoolingParameter_PoolMethod_AVE:
 			// NOLINT_NEXT_LINE(whitespace/operators)
 			AvePoolBackward(count, top_diff, top[0]->num(), channels_,
-				height_, width_, pooled_height_, pooled_width_, kernel_h_,
-				kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
+					height_, width_, pooled_height_, pooled_width_, kernel_h_,
+					kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
 			break;
 		case PoolingParameter_PoolMethod_STOCHASTIC:
 			// NOLINT_NEXT_LINE(whitespace/operators)
 			StoPoolBackward(count, rand_idx_.gpu_data(), top_diff,
-				top[0]->num(), channels_, height_, width_, pooled_height_,
-				pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
-				bottom_diff);
+					top[0]->num(), channels_, height_, width_, pooled_height_,
+					pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
+					bottom_diff);
 			break;
 		default:
 			LOG(FATAL) << "Unknown pooling method.";
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index e4a3e456..93ef9e1f 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -9,9 +9,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void PowerLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
 	power_ = this->layer_param_.power_param().power();
 	scale_ = this->layer_param_.power_param().scale();
@@ -20,9 +20,9 @@ void PowerLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 }
 
 // Compute y = (shift + scale * x)^power
-template<typename Dtype>
+template <typename Dtype>
 void PowerLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const int count = bottom[0]->count();
 	// Special case where we can ignore the input: scale or power is 0.
@@ -44,10 +44,10 @@ void PowerLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
 		const int count = bottom[0]->count();
@@ -63,7 +63,7 @@ void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 				//     -> dy/dx = 2 * scale * (shift + scale * x)
 				//              = diff_scale * shift + diff_scale * scale * x
 				caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data,
-					Dtype(0), bottom_diff);
+						Dtype(0), bottom_diff);
 				if (shift_ != Dtype(0)) {
 					caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff);
 				}
@@ -96,9 +96,9 @@ void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const int count = bottom[0]->count();
 	// Special case where we can ignore the input: scale or power is 0.
@@ -120,9 +120,9 @@ void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
 		const int count = bottom[0]->count();
@@ -138,7 +138,7 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 				//     -> dy/dx = 2 * scale * (shift + scale * x)
 				//              = diff_scale * shift + diff_scale * scale * x
 				caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
-					Dtype(0), bottom_diff);
+						Dtype(0), bottom_diff);
 				if (shift_ != Dtype(0)) {
 					caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
 				}
diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp
index 5332a178..cbf7f064 100644
--- a/src/caffe/layers/prelu_layer.cpp
+++ b/src/caffe/layers/prelu_layer.cpp
@@ -7,11 +7,11 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_GE(bottom[0]->num_axes(), 2)
-		<< "Number of axes of bottom blob must be >=2.";
+			<< "Number of axes of bottom blob must be >=2.";
 	PReLUParameter prelu_param = this->layer_param().prelu_param();
 	int channels = bottom[0]->channels();
 	channel_shared_ = prelu_param.channel_shared();
@@ -37,10 +37,10 @@ void PReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 	if (channel_shared_) {
 		CHECK_EQ(this->blobs_[0]->count(), 1)
-			<< "Negative slope size is inconsistent with prototxt config";
+				<< "Negative slope size is inconsistent with prototxt config";
 	} else {
 		CHECK_EQ(this->blobs_[0]->count(), channels)
-			<< "Negative slope size is inconsistent with prototxt config";
+				<< "Negative slope size is inconsistent with prototxt config";
 	}
 
 	// Propagate gradients to the parameters (as directed by backward pass).
@@ -50,11 +50,11 @@ void PReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_GE(bottom[0]->num_axes(), 2)
-		<< "Number of axes of bottom blob must be >=2.";
+			<< "Number of axes of bottom blob must be >=2.";
 	top[0]->ReshapeLike(*bottom[0]);
 	if (bottom[0] == top[0]) {
 		// For in-place computation
@@ -62,9 +62,9 @@ void PReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const int count = bottom[0]->count();
@@ -83,14 +83,14 @@ void PReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	for (int i = 0; i < count; ++i) {
 		int c = (i / dim) % channels / div_factor;
 		top_data[i] = std::max(bottom_data[i], Dtype(0))
-			+ slope_data[c] * std::min(bottom_data[i], Dtype(0));
+				+ slope_data[c] * std::min(bottom_data[i], Dtype(0));
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	const Dtype* slope_data = this->blobs_[0]->cpu_data();
 	const Dtype* top_diff = top[0]->cpu_diff();
@@ -124,14 +124,14 @@ void PReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 		for (int i = 0; i < count; ++i) {
 			int c = (i / dim) % channels / div_factor;
 			bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
-				+ slope_data[c] * (bottom_data[i] <= 0));
+					+ slope_data[c] * (bottom_data[i] <= 0));
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const int count = bottom[0]->count();
@@ -144,12 +144,12 @@ void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
 	}
 	PReLUForward(count, channels, dim, bottom_data, top_data, slope_data,
-		div_factor);
+			div_factor);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	const Dtype* top_diff = top[0]->gpu_diff();
 	const int count = bottom[0]->count();
@@ -172,18 +172,18 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 			// compute element-wise diff
 			// NOLINT_NEXT_LINE(whitespace/operators)
 			PReLUParamBackward(
-				cdim, top_diff, top[0]->offset(n),
-				bottom_data, bottom[0]->offset(n),
-				backward_buff_.mutable_gpu_diff());
+					cdim, top_diff, top[0]->offset(n),
+					bottom_data, bottom[0]->offset(n),
+					backward_buff_.mutable_gpu_diff());
 			if (channel_shared_) {
 				Dtype d;
 				caffe_gpu_dot < Dtype > (channels * dim, backward_buff_.gpu_diff(),
-					multiplier_.gpu_data(), &d);
+						multiplier_.gpu_data(), &d);
 				dsum += d;
 			} else {
 				caffe_gpu_gemv < Dtype > (CblasNoTrans, channels, dim, 1.,
-					backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
-					slope_diff);
+						backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
+						slope_diff);
 			}
 		}
 		if (channel_shared_) {
@@ -197,8 +197,8 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 		int div_factor = channel_shared_ ? channels : 1;
 		// NOLINT_NEXT_LINE(whitespace/operators)
 		PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff,
-			slope_data,
-			div_factor);
+				slope_data,
+				div_factor);
 	}
 }
 
diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp
index 32ea4bc0..ddf70e46 100644
--- a/src/caffe/layers/reduction_layer.cpp
+++ b/src/caffe/layers/reduction_layer.cpp
@@ -8,29 +8,29 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ReductionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	op_ = this->layer_param_.reduction_param().operation();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReductionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	axis_ = bottom[0]->CanonicalAxisIndex(
-		this->layer_param_.reduction_param().axis());
+			this->layer_param_.reduction_param().axis());
 	// In the output, we'll keep all axes up to the reduction axis, but
 	// throw away any after that.
 	// Note: currently reducing along non-tail axes is not supported; otherwise,
 	// we'd need to also copy any axes following an "end_axis".
 	vector<int> top_shape(bottom[0]->shape().begin(),
-		bottom[0]->shape().begin() + axis_);
+			bottom[0]->shape().begin() + axis_);
 	top[0]->Reshape(top_shape);
 	num_ = bottom[0]->count(0, axis_);
 	dim_ = bottom[0]->count(axis_);
 	CHECK_EQ(num_, top[0]->count());
 	if (op_ == ReductionParameter_ReductionOp_SUM ||
-		op_ == ReductionParameter_ReductionOp_MEAN) {
+			op_ == ReductionParameter_ReductionOp_MEAN) {
 		vector<int> sum_mult_shape(1, dim_);
 		sum_multiplier_.Reshape(sum_mult_shape);
 		caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data());
@@ -41,9 +41,9 @@ void ReductionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReductionLayer<Dtype>::Forward_cpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	const Dtype* mult_data = NULL;
 	if (sum_multiplier_.count() > 0) {
@@ -64,7 +64,7 @@ void ReductionLayer<Dtype>::Forward_cpu(
 				break;
 			default:
 				LOG(FATAL) << "Unknown reduction op: "
-					<< ReductionParameter_ReductionOp_Name(op_);
+						<< ReductionParameter_ReductionOp_Name(op_);
 		}
 		bottom_data += dim_;
 		++top_data;
@@ -76,9 +76,9 @@ void ReductionLayer<Dtype>::Forward_cpu(
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -96,7 +96,7 @@ void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 			break;
 		default:
 			LOG(FATAL) << "Unknown reduction op: "
-				<< ReductionParameter_ReductionOp_Name(op_);
+					<< ReductionParameter_ReductionOp_Name(op_);
 	}
 	const Dtype* top_diff = top[0]->cpu_diff();
 	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
@@ -116,7 +116,7 @@ void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 				break;
 			default:
 				LOG(FATAL) << "Unknown reduction op: "
-					<< ReductionParameter_ReductionOp_Name(op_);
+						<< ReductionParameter_ReductionOp_Name(op_);
 		}
 		bottom_data += dim_;
 		bottom_diff += dim_;
@@ -124,9 +124,9 @@ void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	const Dtype* mult_data = NULL;
 	if (sum_multiplier_.count() > 0) {
@@ -147,7 +147,7 @@ void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 				break;
 			default:
 				LOG(FATAL) << "Unknown reduction op: "
-					<< ReductionParameter_ReductionOp_Name(op_);
+						<< ReductionParameter_ReductionOp_Name(op_);
 		}
 		bottom_data += dim_;
 		++top_data;
@@ -159,9 +159,9 @@ void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -179,7 +179,7 @@ void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 			break;
 		default:
 			LOG(FATAL) << "Unknown reduction op: "
-				<< ReductionParameter_ReductionOp_Name(op_);
+					<< ReductionParameter_ReductionOp_Name(op_);
 	}
 	const Dtype* top_diff = top[0]->cpu_diff();
 	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
@@ -199,7 +199,7 @@ void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 				break;
 			default:
 				LOG(FATAL) << "Unknown reduction op: "
-					<< ReductionParameter_ReductionOp_Name(op_);
+						<< ReductionParameter_ReductionOp_Name(op_);
 		}
 		bottom_data += dim_;
 		bottom_diff += dim_;
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index 7f3b2729..334dc244 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -5,23 +5,23 @@
 #include "caffe/vision_layers.hpp"
 
 namespace caffe {
-template<typename Dtype>
+template <typename Dtype>
 void ReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const int count = bottom[0]->count();
 	Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
 	for (int i = 0; i < count; ++i) {
 		top_data[i] = std::max(bottom_data[i], Dtype(0))
-			+ negative_slope * std::min(bottom_data[i], Dtype(0));
+				+ negative_slope * std::min(bottom_data[i], Dtype(0));
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* bottom_data = bottom[0]->cpu_data();
 		const Dtype* top_diff = top[0]->cpu_diff();
@@ -30,14 +30,14 @@ void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 		Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
 		for (int i = 0; i < count; ++i) {
 			bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
-				+ negative_slope * (bottom_data[i] <= 0));
+					+ negative_slope * (bottom_data[i] <= 0));
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const int count = bottom[0]->count();
@@ -45,10 +45,10 @@ void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	ReLUForward(count, bottom_data, top_data, negative_slope);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* bottom_data = bottom[0]->gpu_data();
 		const Dtype* top_diff = top[0]->gpu_diff();
diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp
index 8dbbbcb0..094e61ef 100644
--- a/src/caffe/layers/reshape_layer.cpp
+++ b/src/caffe/layers/reshape_layer.cpp
@@ -5,9 +5,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ReshapeLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	inferred_axis_ = -1;
 	copy_axes_.clear();
 	const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
@@ -19,7 +19,7 @@ void ReshapeLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 			copy_axes_.push_back(i);
 		} else if (top_dim == -1) {
 			CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple "
-				<< "-1 dims; at most a single (1) value of -1 may be specified";
+					<< "-1 dims; at most a single (1) value of -1 may be specified";
 			inferred_axis_ = i;
 		} else {
 			constant_count_ *= top_dim;
@@ -27,22 +27,22 @@ void ReshapeLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ReshapeLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int input_start_axis = this->layer_param_.reshape_param().axis();
 	const int start_axis =
-		(input_start_axis >= 0) ? input_start_axis :
-															bottom[0]->num_axes() + input_start_axis + 1;
+			(input_start_axis >= 0) ? input_start_axis :
+																bottom[0]->num_axes() + input_start_axis + 1;
 	CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range";
 	CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis
-		<< " out of range for " << bottom[0]->num_axes() << "-D input blob";
+			<< " out of range for " << bottom[0]->num_axes() << "-D input blob";
 	const int num_axes = this->layer_param_.reshape_param().num_axes();
 	CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all";
 	const int end_axis =
-		(num_axes == -1) ? bottom[0]->num_axes() : (start_axis + num_axes);
+			(num_axes == -1) ? bottom[0]->num_axes() : (start_axis + num_axes);
 	CHECK_LE(end_axis, bottom[0]->num_axes())
-		<< "end_axis = axis + num_axes is out of range";
+			<< "end_axis = axis + num_axes is out of range";
 	const int num_axes_replaced = end_axis - start_axis;
 	const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced;
 	const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
@@ -62,10 +62,10 @@ void ReshapeLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	for (int i = 0; i < copy_axes_.size(); ++i) {
 		const int copy_axis_index = copy_axes_[i];
 		CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index)
-			<< "new shape contains a 0, but there was no corresponding bottom axis "
-			<< "to copy";
+				<< "new shape contains a 0, but there was no corresponding bottom axis "
+				<< "to copy";
 		top_shape[start_axis + copy_axis_index] =
-			bottom[0]->shape(start_axis + copy_axis_index);
+				bottom[0]->shape(start_axis + copy_axis_index);
 	}
 	if (inferred_axis_ >= 0) {
 		// A -1 dim was specified; infer the correct dimension by computing the
@@ -78,14 +78,14 @@ void ReshapeLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 			explicit_count *= top_shape[start_axis + copy_axis_index];
 		}
 		CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count ("
-			<< bottom[0]->count() << ") must be divisible by the product of "
-			<< "the specified dimensions (" << explicit_count << ")";
+				<< bottom[0]->count() << ") must be divisible by the product of "
+				<< "the specified dimensions (" << explicit_count << ")";
 		const int inferred_dim = bottom[0]->count() / explicit_count;
 		top_shape[start_axis + inferred_axis_] = inferred_dim;
 	}
 	top[0]->Reshape(top_shape);
 	CHECK_EQ(top[0]->count(), bottom[0]->count())
-		<< "output count must match input count";
+			<< "output count must match input count";
 	top[0]->ShareData(*bottom[0]);
 	top[0]->ShareDiff(*bottom[0]);
 }
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
index a5be48e7..2a6d99e2 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -8,9 +8,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::LayerSetUp(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::LayerSetUp(bottom, top);
 	sigmoid_bottom_vec_.clear();
 	sigmoid_bottom_vec_.push_back(bottom[0]);
@@ -19,18 +19,18 @@ void SigmoidCrossEntropyLossLayer<Dtype>::LayerSetUp(
 	sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Reshape(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::Reshape(bottom, top);
 	CHECK_EQ(bottom[0]->count(), bottom[1]->count()) <<
-		"SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count.";
+			"SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count.";
 	sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	// The forward pass computes the sigmoid outputs.
 	sigmoid_bottom_vec_[0] = bottom[0];
 	sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
@@ -43,18 +43,18 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
 	Dtype loss = 0;
 	for (int i = 0; i < count; ++i) {
 		loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
-			log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
+				log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
 	}
 	top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
-	const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[1]) {
 		LOG(FATAL) << this->type()
-			<< " Layer cannot backpropagate to label inputs.";
+				<< " Layer cannot backpropagate to label inputs.";
 	}
 	if (propagate_down[0]) {
 		// First, compute the diff
@@ -70,13 +70,13 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
-	const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<Blob<Dtype>*>& top,
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[1]) {
 		LOG(FATAL) << this->type()
-			<< " Layer cannot backpropagate to label inputs.";
+				<< " Layer cannot backpropagate to label inputs.";
 	}
 	if (propagate_down[0]) {
 		// First, compute the diff
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index 4095ccdb..833e1ced 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -8,14 +8,14 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 inline Dtype sigmoid(Dtype x) {
 	return 1. / (1. + exp(-x));
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const int count = bottom[0]->count();
@@ -24,10 +24,10 @@ void SigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* top_data = top[0]->cpu_data();
 		const Dtype* top_diff = top[0]->cpu_diff();
@@ -40,9 +40,9 @@ void SigmoidLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const int count = bottom[0]->count();
@@ -50,9 +50,9 @@ void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	SigmoidForward(count, bottom_data, top_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* top_data = top[0]->gpu_data();
 		const Dtype* top_diff = top[0]->gpu_diff();
diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp
index 05929a70..502d0aab 100644
--- a/src/caffe/layers/silence_layer.cpp
+++ b/src/caffe/layers/silence_layer.cpp
@@ -6,30 +6,30 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void SilenceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	for (int i = 0; i < bottom.size(); ++i) {
 		if (propagate_down[i]) {
 			caffe_set(bottom[i]->count(), Dtype(0),
-				bottom[i]->mutable_cpu_data());
+					bottom[i]->mutable_cpu_data());
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	// Do nothing.
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	for (int i = 0; i < bottom.size(); ++i) {
 		if (propagate_down[i]) {
 			caffe_gpu_set(bottom[i]->count(), Dtype(0),
-				bottom[i]->mutable_gpu_data());
+					bottom[i]->mutable_gpu_data());
 		}
 	}
 }
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index 7b327527..a005ceba 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -7,21 +7,21 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void SliceLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const SliceParameter& slice_param = this->layer_param_.slice_param();
 	CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim()))
-		<< "Either axis or slice_dim should be specified; not both.";
+			<< "Either axis or slice_dim should be specified; not both.";
 	slice_point_.clear();
 	std::copy(slice_param.slice_point().begin(),
-		slice_param.slice_point().end(),
-		std::back_inserter(slice_point_));
+			slice_param.slice_point().end(),
+			std::back_inserter(slice_point_));
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SliceLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const int num_axes = bottom[0]->num_axes();
 	const SliceParameter& slice_param = this->layer_param_.slice_param();
 	if (slice_param.has_slice_dim()) {
@@ -29,8 +29,8 @@ void SliceLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 		// Don't allow negative indexing for slice_dim, a uint32 -- almost
 		// certainly unintended.
 		CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 "
-			<< "produced negative result; slice_dim must satisfy "
-			<< "0 <= slice_dim < " << kMaxBlobAxes;
+				<< "produced negative result; slice_dim must satisfy "
+				<< "0 <= slice_dim < " << kMaxBlobAxes;
 		CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range.";
 	} else {
 		slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis());
@@ -58,8 +58,8 @@ void SliceLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 		}
 	} else {
 		CHECK_EQ(bottom_slice_axis % top.size(), 0)
-			<< "Number of top blobs (" << top.size() << ") should evenly "
-			<< "divide input slice axis (" << bottom_slice_axis << ")";
+				<< "Number of top blobs (" << top.size() << ") should evenly "
+				<< "divide input slice axis (" << bottom_slice_axis << ")";
 		top_shape[slice_axis_] = bottom_slice_axis / top.size();
 		for (int i = 0; i < top.size(); ++i) {
 			top[i]->Reshape(top_shape);
@@ -69,9 +69,9 @@ void SliceLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	CHECK_EQ(count, bottom[0]->count());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SliceLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	int offset_slice_axis = 0;
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
@@ -81,17 +81,17 @@ void SliceLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 		for (int n = 0; n < num_slices_; ++n) {
 			const int top_offset = n * top_slice_axis * slice_size_;
 			const int bottom_offset =
-				(n * bottom_slice_axis + offset_slice_axis) * slice_size_;
+					(n * bottom_slice_axis + offset_slice_axis) * slice_size_;
 			caffe_copy(top_slice_axis * slice_size_,
-				bottom_data + bottom_offset, top_data + top_offset);
+					bottom_data + bottom_offset, top_data + top_offset);
 		}
 		offset_slice_axis += top_slice_axis;
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -104,22 +104,22 @@ void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 		for (int n = 0; n < num_slices_; ++n) {
 			const int top_offset = n * top_slice_axis * slice_size_;
 			const int bottom_offset =
-				(n * bottom_slice_axis + offset_slice_axis) * slice_size_;
+					(n * bottom_slice_axis + offset_slice_axis) * slice_size_;
 			caffe_copy(top_slice_axis * slice_size_,
-				top_diff + top_offset, bottom_diff + bottom_offset);
+					top_diff + top_offset, bottom_diff + bottom_offset);
 		}
 		offset_slice_axis += top_slice_axis;
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 24d1e4b8..d4cab577 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -7,11 +7,11 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	softmax_axis_ =
-		bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
+			bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
 	top[0]->ReshapeLike(*bottom[0]);
 	vector<int> mult_dims(1, bottom[0]->shape(softmax_axis_));
 	sum_multiplier_.Reshape(mult_dims);
@@ -24,13 +24,13 @@ void SoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	scale_.Reshape(scale_dims);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 SoftmaxLayer<Dtype>::~SoftmaxLayer() {
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	Dtype* scale_data = scale_.mutable_cpu_data();
@@ -45,17 +45,17 @@ void SoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 		for (int j = 0; j < channels; j++) {
 			for (int k = 0; k < inner_num_; k++) {
 				scale_data[k] = std::max(scale_data[k],
-					bottom_data[i * dim + j * inner_num_ + k]);
+						bottom_data[i * dim + j * inner_num_ + k]);
 			}
 		}
 		// subtraction
 		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, channels, inner_num_,
-			1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
+				1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
 		// exponentiation
 		caffe_exp < Dtype > (dim, top_data, top_data);
 		// sum after exp
 		caffe_cpu_gemv < Dtype > (CblasTrans, channels, inner_num_, 1.,
-			top_data, sum_multiplier_.cpu_data(), 0., scale_data);
+				top_data, sum_multiplier_.cpu_data(), 0., scale_data);
 		// division
 		for (int j = 0; j < channels; j++) {
 			caffe_div(inner_num_, top_data, scale_data, top_data);
@@ -64,10 +64,10 @@ void SoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* top_diff = top[0]->cpu_diff();
 	const Dtype* top_data = top[0]->cpu_data();
 	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
@@ -79,21 +79,22 @@ void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 		// compute dot(top_diff, top_data) and subtract them from the bottom diff
 		for (int k = 0; k < inner_num_; ++k) {
 			scale_data[k] = caffe_cpu_strided_dot < Dtype > (channels,
-				bottom_diff + i * dim + k, inner_num_,
-				top_data + i * dim + k, inner_num_);
+					bottom_diff + i * dim + k, inner_num_,
+					top_data + i * dim + k, inner_num_);
 		}
 		// subtraction
 		caffe_cpu_gemm < Dtype
-			> (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1,
-				-1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim);
+				> (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1,
+						-1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff
+								+ i * dim);
 	}
 	// elementwise multiplication
 	caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	Dtype* scale_data = scale_.mutable_gpu_data();
@@ -107,27 +108,27 @@ void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	// NOLINT_NEXT_LINE(whitespace/operators)
 
 	kernel_channel_max < Dtype > (outer_num_, channels, inner_num_, top_data,
-		scale_data);
+			scale_data);
 	// subtract
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_,
-		scale_data, top_data);
+			scale_data, top_data);
 	// exponentiate
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_exp < Dtype > (count, top_data, top_data);
 	// sum after exp
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_channel_sum < Dtype > (outer_num_, channels, inner_num_, top_data,
-		scale_data);
+			scale_data);
 	// divide
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_channel_div < Dtype > (count, outer_num_, channels, inner_num_,
-		scale_data, top_data);
+			scale_data, top_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	const Dtype* top_diff = top[0]->gpu_diff();
 	const Dtype* top_data = top[0]->gpu_data();
 	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
@@ -139,10 +140,10 @@ void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 	// NOLINT_NEXT_LINE(whitespace/operators)
 
 	kernel_channel_dot < Dtype > (outer_num_, channels, inner_num_,
-		top_diff, top_data, scale_data);
+			top_diff, top_data, scale_data);
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_,
-		scale_data, bottom_diff);
+			scale_data, bottom_diff);
 	// elementwise multiplication
 	caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff);
 
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index b998c2f6..58872a72 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -9,9 +9,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::LayerSetUp(bottom, top);
 	LayerParameter softmax_param(this->layer_param_);
 	softmax_param.set_type("Softmax");
@@ -23,7 +23,7 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
 	softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);
 
 	has_ignore_label_ =
-		this->layer_param_.loss_param().has_ignore_label();
+			this->layer_param_.loss_param().has_ignore_label();
 	if (has_ignore_label_) {
 		ignore_label_ = this->layer_param_.loss_param().ignore_label();
 	}
@@ -32,40 +32,40 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
 	ocl_setup();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::ocl_setup() {
 	d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
-		sizeof(Dtype), NULL, NULL);
+			sizeof(Dtype), NULL, NULL);
 
 }
 
-template<typename Dtype>
+template <typename Dtype>
 SoftmaxWithLossLayer<Dtype>::~SoftmaxWithLossLayer() {
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Reshape(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	LossLayer < Dtype > ::Reshape(bottom, top);
 	softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);
 	softmax_axis_ =
-		bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
+			bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
 	outer_num_ = bottom[0]->count(0, softmax_axis_);
 	inner_num_ = bottom[0]->count(softmax_axis_ + 1);
 	CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
-		<< "Number of labels must match number of predictions; "
-		<< "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), "
-		<< "label count (number of labels) must be N*H*W, "
-		<< "with integer values in {0, 1, ..., C-1}.";
+			<< "Number of labels must match number of predictions; "
+			<< "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), "
+			<< "label count (number of labels) must be N*H*W, "
+			<< "with integer values in {0, 1, ..., C-1}.";
 	if (top.size() >= 2) {
 		// softmax output
 		top[1]->ReshapeLike(*bottom[0]);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	// The forward pass computes the softmax prob values.
 	softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
 	const Dtype* prob_data = prob_.cpu_data();
@@ -82,7 +82,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
 			DCHECK_GE(label_value, 0);
 			DCHECK_LT(label_value, prob_.shape(softmax_axis_));
 			loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j],
-				Dtype(FLT_MIN)));
+					Dtype(FLT_MIN)));
 			++count;
 		}
 	}
@@ -96,12 +96,12 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[1]) {
 		LOG(FATAL) << this->type()
-			<< " Layer cannot backpropagate to label inputs.";
+				<< " Layer cannot backpropagate to label inputs.";
 	}
 	if (propagate_down[0]) {
 		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
@@ -133,9 +133,9 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
 	softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
 	const Dtype* prob_data = prob_.gpu_data();
 	const Dtype* label = bottom[1]->gpu_data();
@@ -150,7 +150,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
 	Dtype* counts = prob_.mutable_gpu_diff();
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	SoftmaxLossForwardGPU < Dtype > (nthreads, prob_data, label, loss_data,
-		outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
+			outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
 	Dtype loss;
 	caffe_gpu_asum(nthreads, loss_data, &loss);
 	if (normalize_) {
@@ -167,12 +167,12 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[1]) {
 		LOG(FATAL) << this->type()
-			<< " Layer cannot backpropagate to label inputs.";
+				<< " Layer cannot backpropagate to label inputs.";
 	}
 	if (propagate_down[0]) {
 		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
@@ -188,7 +188,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 		Dtype* counts = prob_.mutable_gpu_diff();
 		// NOLINT_NEXT_LINE(whitespace/operators)
 		SoftmaxLossBackwardGPU < Dtype > (nthreads, top_data, label, bottom_diff,
-			outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
+				outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
 		const Dtype loss_weight = top[0]->cpu_diff()[0];
 		if (normalize_) {
 			Dtype count;
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 0ad8179a..8b19d293 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -6,9 +6,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void SplitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	count_ = bottom[0]->count();
 	for (int i = 0; i < top.size(); ++i) {
 		// Do not allow in-place computation in the SplitLayer.  Instead, share data
@@ -17,25 +17,25 @@ void SplitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 		// blob of the first split output with the input, but this seems to cause
 		// some strange effects in practice...)
 		CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not "
-			"allow in-place computation.";
+				"allow in-place computation.";
 		top[i]->ReshapeLike(*bottom[0]);
 		CHECK_EQ(count_, top[i]->count());
 	}
 	gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float",
-		NULL);
+			NULL);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SplitLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	for (int i = 0; i < top.size(); ++i) {
 		top[i]->ShareData(*bottom[0]);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -44,7 +44,7 @@ void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 		return;
 	}
 	caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(),
-		bottom[0]->mutable_cpu_diff());
+			bottom[0]->mutable_cpu_diff());
 	// Add remaining top blob diffs.
 	for (int i = 2; i < top.size(); ++i) {
 		const Dtype* top_diff = top[i]->cpu_diff();
@@ -53,17 +53,17 @@ void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	for (int i = 0; i < top.size(); ++i) {
 		top[i]->ShareData(*bottom[0]);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -72,7 +72,7 @@ void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 		return;
 	}
 	caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
-		bottom[0]->mutable_gpu_diff());
+			bottom[0]->mutable_gpu_diff());
 	// Add remaining top blob diffs.
 	for (int i = 2; i < top.size(); ++i) {
 		const Dtype* top_diff = top[i]->gpu_diff();
diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp
index bfc7778c..4c630fb7 100644
--- a/src/caffe/layers/spp_layer.cpp
+++ b/src/caffe/layers/spp_layer.cpp
@@ -13,9 +13,9 @@ namespace caffe {
 using std::min;
 using std::max;
 
-template<typename Dtype>
+template <typename Dtype>
 LayerParameter SPPLayer<Dtype>::GetPoolingParam(const int pyramid_level,
-	const int bottom_h, const int bottom_w, const SPPParameter spp_param) {
+		const int bottom_h, const int bottom_w, const SPPParameter spp_param) {
 	LayerParameter pooling_param;
 	int num_bins = pow(2, pyramid_level);
 
@@ -44,15 +44,15 @@ LayerParameter SPPLayer<Dtype>::GetPoolingParam(const int pyramid_level,
 	switch (spp_param.pool()) {
 		case SPPParameter_PoolMethod_MAX:
 			pooling_param.mutable_pooling_param()->set_pool(
-				PoolingParameter_PoolMethod_MAX);
+					PoolingParameter_PoolMethod_MAX);
 			break;
 		case SPPParameter_PoolMethod_AVE:
 			pooling_param.mutable_pooling_param()->set_pool(
-				PoolingParameter_PoolMethod_AVE);
+					PoolingParameter_PoolMethod_AVE);
 			break;
 		case SPPParameter_PoolMethod_STOCHASTIC:
 			pooling_param.mutable_pooling_param()->set_pool(
-				PoolingParameter_PoolMethod_STOCHASTIC);
+					PoolingParameter_PoolMethod_STOCHASTIC);
 			break;
 		default:
 			LOG(FATAL) << "Unknown pooling method.";
@@ -61,9 +61,9 @@ LayerParameter SPPLayer<Dtype>::GetPoolingParam(const int pyramid_level,
 	return pooling_param;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SPPLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	SPPParameter spp_param = this->layer_param_.spp_param();
 
 	bottom_h_ = bottom[0]->height();
@@ -104,10 +104,10 @@ void SPPLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 
 		// pooling layer setup
 		LayerParameter pooling_param = GetPoolingParam(
-			i, bottom_h_, bottom_w_, spp_param);
+				i, bottom_h_, bottom_w_, spp_param);
 
 		pooling_layers_.push_back(shared_ptr < PoolingLayer<Dtype> > (
-			new PoolingLayer<Dtype>(pooling_param)));
+				new PoolingLayer<Dtype>(pooling_param)));
 		pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
 
 		// flatten layer output holders setup
@@ -130,11 +130,11 @@ void SPPLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	concat_layer_->SetUp(concat_bottom_vec_, top);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SPPLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-		<< "corresponding to (num, channels, height, width)";
+			<< "corresponding to (num, channels, height, width)";
 	channels_ = bottom[0]->channels();
 	bottom_h_ = bottom[0]->height();
 	bottom_w_ = bottom[0]->width();
@@ -142,36 +142,36 @@ void SPPLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 	split_layer_->Reshape(bottom, split_top_vec_);
 	for (int i = 0; i < pyramid_height_; i++) {
 		LayerParameter pooling_param = GetPoolingParam(
-			i, bottom_h_, bottom_w_, spp_param);
+				i, bottom_h_, bottom_w_, spp_param);
 
 		pooling_layers_[i].reset(
-			new PoolingLayer<Dtype>(pooling_param));
+				new PoolingLayer<Dtype>(pooling_param));
 		pooling_layers_[i]->SetUp(
-			*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+				*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
 		pooling_layers_[i]->Reshape(
-			*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+				*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
 		flatten_layers_[i]->Reshape(
-			*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
+				*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
 	}
 	concat_layer_->Reshape(concat_bottom_vec_, top);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SPPLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	split_layer_->Forward(bottom, split_top_vec_);
 	for (int i = 0; i < pyramid_height_; i++) {
 		pooling_layers_[i]->Forward(
-			*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+				*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
 		flatten_layers_[i]->Forward(
-			*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
+				*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
 	}
 	concat_layer_->Forward(concat_bottom_vec_, top);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SPPLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (!propagate_down[0]) {
 		return;
 	}
@@ -179,9 +179,9 @@ void SPPLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_);
 	for (int i = 0; i < pyramid_height_; i++) {
 		flatten_layers_[i]->Backward(
-			*flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]);
+				*flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]);
 		pooling_layers_[i]->Backward(
-			*pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]);
+				*pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]);
 	}
 	split_layer_->Backward(split_top_vec_, propagate_down, bottom);
 }
diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp
index 16405761..52a8a8c7 100644
--- a/src/caffe/layers/tanh_layer.cpp
+++ b/src/caffe/layers/tanh_layer.cpp
@@ -10,9 +10,9 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void TanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const int count = bottom[0]->count();
@@ -21,10 +21,10 @@ void TanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down,
-	const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down,
+		const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* top_data = top[0]->cpu_data();
 		const Dtype* top_diff = top[0]->cpu_diff();
@@ -38,9 +38,9 @@ void TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const int count = bottom[0]->count();
@@ -48,9 +48,9 @@ void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	TanHForward(count, bottom_data, top_data);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-	const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 	if (propagate_down[0]) {
 		const Dtype* top_data = top[0]->gpu_data();
 		const Dtype* top_diff = top[0]->gpu_diff();
diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp
index ca14de00..7d99226f 100644
--- a/src/caffe/layers/threshold_layer.cpp
+++ b/src/caffe/layers/threshold_layer.cpp
@@ -6,16 +6,16 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 void ThresholdLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
 	threshold_ = this->layer_param_.threshold_param().threshold();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->cpu_data();
 	Dtype* top_data = top[0]->mutable_cpu_data();
 	const int count = bottom[0]->count();
@@ -24,9 +24,9 @@ void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	const Dtype* bottom_data = bottom[0]->gpu_data();
 	Dtype* top_data = top[0]->mutable_gpu_data();
 	const int count = bottom[0]->count();
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index 0525b640..68b1b1e5 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -25,14 +25,14 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 WindowDataLayer<Dtype>::~WindowDataLayer<Dtype>() {
 	this->JoinPrefetchThread();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-	const vector<Blob<Dtype>*>& top) {
+		const vector<Blob<Dtype>*>& top) {
 	// LayerSetUp runs through the window_file and creates two structures
 	// that hold windows: one for foreground (object) windows and one
 	// for background (non-object) windows. We use an overlap threshold
@@ -49,23 +49,23 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	//    class_index overlap x1 y1 x2 y2
 
 	LOG(INFO) << "Window data layer:" << std::endl
-		<< "  foreground (object) overlap threshold: "
-		<< this->layer_param_.window_data_param().fg_threshold() << std::endl
-		<< "  background (non-object) overlap threshold: "
-		<< this->layer_param_.window_data_param().bg_threshold() << std::endl
-		<< "  foreground sampling fraction: "
-		<< this->layer_param_.window_data_param().fg_fraction() << std::endl
-		<< "  cache_images: "
-		<< this->layer_param_.window_data_param().cache_images() << std::endl
-		<< "  root_folder: "
-		<< this->layer_param_.window_data_param().root_folder();
+			<< "  foreground (object) overlap threshold: "
+			<< this->layer_param_.window_data_param().fg_threshold() << std::endl
+			<< "  background (non-object) overlap threshold: "
+			<< this->layer_param_.window_data_param().bg_threshold() << std::endl
+			<< "  foreground sampling fraction: "
+			<< this->layer_param_.window_data_param().fg_fraction() << std::endl
+			<< "  cache_images: "
+			<< this->layer_param_.window_data_param().cache_images() << std::endl
+			<< "  root_folder: "
+			<< this->layer_param_.window_data_param().root_folder();
 
 	cache_images_ = this->layer_param_.window_data_param().cache_images();
 	string root_folder = this->layer_param_.window_data_param().root_folder();
 
 	const bool prefetch_needs_rand =
-		this->transform_param_.mirror() ||
-			this->transform_param_.crop_size();
+			this->transform_param_.mirror() ||
+					this->transform_param_.crop_size();
 	if (prefetch_needs_rand) {
 		const unsigned int prefetch_rng_seed = caffe_rng_rand();
 		prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
@@ -75,7 +75,7 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 
 	std::ifstream infile(this->layer_param_.window_data_param().source().c_str());
 	CHECK(infile.good()) << "Failed to open window file "
-		<< this->layer_param_.window_data_param().source() << std::endl;
+			<< this->layer_param_.window_data_param().source() << std::endl;
 
 	map<int, int> label_hist;
 	label_hist.insert(std::make_pair(0, 0));
@@ -109,9 +109,9 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 		int num_windows;
 		infile >> num_windows;
 		const float fg_threshold =
-			this->layer_param_.window_data_param().fg_threshold();
+				this->layer_param_.window_data_param().fg_threshold();
 		const float bg_threshold =
-			this->layer_param_.window_data_param().bg_threshold();
+				this->layer_param_.window_data_param().bg_threshold();
 		for (int i = 0; i < num_windows; ++i) {
 			int label, x1, y1, x2, y2;
 			float overlap;
@@ -144,27 +144,27 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 
 		if (image_index % 100 == 0) {
 			LOG(INFO) << "num: " << image_index << " "
-				<< image_path << " "
-				<< image_size[0] << " "
-				<< image_size[1] << " "
-				<< image_size[2] << " "
-				<< "windows to process: " << num_windows;
+					<< image_path << " "
+					<< image_size[0] << " "
+					<< image_size[1] << " "
+					<< image_size[2] << " "
+					<< "windows to process: " << num_windows;
 		}
 	} while (infile >> hashtag >> image_index);
 
 	LOG(INFO) << "Number of images: " << image_index + 1;
 
 	for (map<int, int>::iterator it = label_hist.begin();
-		it != label_hist.end(); ++it) {
+			it != label_hist.end(); ++it) {
 		LOG(INFO) << "class " << it->first << " has " << label_hist[it->first]
-			<< " samples";
+				<< " samples";
 	}
 
 	LOG(INFO) << "Amount of context padding: "
-		<< this->layer_param_.window_data_param().context_pad();
+			<< this->layer_param_.window_data_param().context_pad();
 
 	LOG(INFO) << "Crop mode: "
-		<< this->layer_param_.window_data_param().crop_mode();
+			<< this->layer_param_.window_data_param().crop_mode();
 
 	// image
 	const int crop_size = this->transform_param_.crop_size();
@@ -174,8 +174,8 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size);
 
 	LOG(INFO) << "output data size: " << top[0]->num() << ","
-		<< top[0]->channels() << "," << top[0]->height() << ","
-		<< top[0]->width();
+			<< top[0]->channels() << "," << top[0]->height() << ","
+			<< top[0]->width();
 	// label
 	vector<int> label_shape(1, batch_size);
 	top[1]->Reshape(label_shape);
@@ -186,7 +186,7 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	has_mean_values_ = this->transform_param_.mean_value_size() > 0;
 	if (has_mean_file_) {
 		const string& mean_file =
-			this->transform_param_.mean_file();
+				this->transform_param_.mean_file();
 		LOG(INFO) << "Loading mean file from: " << mean_file;
 		BlobProto blob_proto;
 		ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
@@ -194,12 +194,12 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 	if (has_mean_values_) {
 		CHECK(has_mean_file_ == false) <<
-			"Cannot specify mean_file and mean_value at the same time";
+				"Cannot specify mean_file and mean_value at the same time";
 		for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) {
 			mean_values_.push_back(this->transform_param_.mean_value(c));
 		}
 		CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) <<
-			"Specify either 1 mean_value or as many as channels: " << channels;
+				"Specify either 1 mean_value or as many as channels: " << channels;
 		if (channels > 1 && mean_values_.size() == 1) {
 			// Replicate the mean_value for simplicity
 			for (int c = 1; c < channels; ++c) {
@@ -209,16 +209,16 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 unsigned int WindowDataLayer<Dtype>::PrefetchRand() {
 	CHECK (prefetch_rng_);
 	caffe::rng_t* prefetch_rng =
-		static_cast<caffe::rng_t*>(prefetch_rng_->generator());
+			static_cast<caffe::rng_t*>(prefetch_rng_->generator());
 	return (*prefetch_rng)();
 }
 
 // Thread fetching the data
-template<typename Dtype>
+template <typename Dtype>
 void WindowDataLayer<Dtype>::InternalThreadEntry() {
 	// At each iteration, sample N windows where N*p are foreground (object)
 	// windows and N*(1-p) are background (non-object) windows
@@ -235,7 +235,7 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
 	const int crop_size = this->transform_param_.crop_size();
 	const bool mirror = this->transform_param_.mirror();
 	const float fg_fraction =
-		this->layer_param_.window_data_param().fg_fraction();
+			this->layer_param_.window_data_param().fg_fraction();
 	Dtype* mean = NULL;
 	int mean_off = 0;
 	int mean_width = 0;
@@ -255,7 +255,7 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
 	caffe_set(this->prefetch_data_.count(), Dtype(0), top_data);
 
 	const int num_fg = static_cast<int>(static_cast<float>(batch_size)
-		* fg_fraction);
+			* fg_fraction);
 	const int num_samples[2] = { batch_size - num_fg, num_fg };
 
 	int item_id = 0;
@@ -266,20 +266,20 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
 			timer.Start();
 			const unsigned int rand_index = PrefetchRand();
 			vector<float> window =
-				(is_fg) ?
-									fg_windows_[rand_index % fg_windows_.size()] :
-									bg_windows_[rand_index % bg_windows_.size()];
+					(is_fg) ?
+										fg_windows_[rand_index % fg_windows_.size()] :
+										bg_windows_[rand_index % bg_windows_.size()];
 
 			bool do_mirror = mirror && PrefetchRand() % 2;
 
 			// load the image containing the window
 			pair<std::string, vector<int> > image =
-				image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
+					image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
 
 			cv::Mat cv_img;
 			if (this->cache_images_) {
 				pair < std::string, Datum > image_cached =
-					image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
+						image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
 				cv_img = DecodeDatumToCVMat(image_cached.second, true);
 			} else {
 				cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR);
@@ -305,7 +305,7 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
 				// such that after warping the expanded region to crop_size x crop_size
 				// there's exactly context_pad amount of padding on each side
 				Dtype context_scale = static_cast<Dtype>(crop_size) /
-					static_cast<Dtype>(crop_size - 2 * context_pad);
+						static_cast<Dtype>(crop_size - 2 * context_pad);
 
 				// compute the expanded region
 				Dtype half_height = static_cast<Dtype>(y2 - y1 + 1) / 2.0;
@@ -349,15 +349,16 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
 				// scale factors that would be used to warp the unclipped
 				// expanded region
 				Dtype scale_x =
-					static_cast<Dtype>(crop_size) / static_cast<Dtype>(unclipped_width);
+						static_cast<Dtype>(crop_size) / static_cast<Dtype>(unclipped_width);
 				Dtype scale_y =
-					static_cast<Dtype>(crop_size) / static_cast<Dtype>(unclipped_height);
+						static_cast<Dtype>(crop_size)
+								/ static_cast<Dtype>(unclipped_height);
 
 				// size to warp the clipped expanded region to
 				cv_crop_size.width =
-					static_cast<int>(round(static_cast<Dtype>(clipped_width) * scale_x));
+						static_cast<int>(round(static_cast<Dtype>(clipped_width) * scale_x));
 				cv_crop_size.height =
-					static_cast<int>(round(static_cast<Dtype>(clipped_height) * scale_y));
+						static_cast<int>(round(static_cast<Dtype>(clipped_height) * scale_y));
 				pad_x1 = static_cast<int>(round(static_cast<Dtype>(pad_x1) * scale_x));
 				pad_x2 = static_cast<int>(round(static_cast<Dtype>(pad_x2) * scale_x));
 				pad_y1 = static_cast<int>(round(static_cast<Dtype>(pad_y1) * scale_y));
@@ -384,7 +385,7 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
 			cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1);
 			cv::Mat cv_cropped_img = cv_img(roi);
 			cv::resize(cv_cropped_img, cv_cropped_img,
-				cv_crop_size, 0, 0, cv::INTER_LINEAR);
+					cv_crop_size, 0, 0, cv::INTER_LINEAR);
 
 			// horizontal flip at random
 			if (do_mirror) {
@@ -398,12 +399,12 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
 				for (int w = 0; w < cv_cropped_img.cols; ++w) {
 					for (int c = 0; c < channels; ++c) {
 						int top_index = ((item_id * channels + c) * crop_size + h + pad_h)
-							* crop_size + w + pad_w;
+								* crop_size + w + pad_w;
 						// int top_index = (c * height + h) * width + w;
 						Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
 						if (this->has_mean_file_) {
 							int mean_index = (c * mean_height + h + mean_off + pad_h)
-								* mean_width + w + mean_off + pad_w;
+									* mean_width + w + mean_off + pad_w;
 							top_data[top_index] = (pixel - mean[mean_index]) * scale;
 						} else {
 							if (this->has_mean_values_) {
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 53ec5461..23085112 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -19,12 +19,12 @@
 
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 Net<Dtype>::Net(const NetParameter& param) {
 	Init(param);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Net<Dtype>::Net(const string& param_file, Phase phase) {
 	NetParameter param;
 	ReadNetParamsFromTextFileOrDie(param_file, &param);
@@ -32,7 +32,7 @@ Net<Dtype>::Net(const string& param_file, Phase phase) {
 	Init(param);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
 	// Set phase from the state.
 	phase_ = in_param.state().phase();
@@ -41,7 +41,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 	NetParameter filtered_param;
 	FilterNet(in_param, &filtered_param);
 	LOG(INFO) << "Initializing net from parameters: " << std::endl
-		<< filtered_param.DebugString();
+			<< filtered_param.DebugString();
 	// Create a copy of filtered_param with splits added where necessary.
 	NetParameter param;
 	InsertSplits(filtered_param, &param);
@@ -50,14 +50,14 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 	map<string, int> blob_name_to_idx;
 	set < string > available_blobs;
 	CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0)
-		<< "Must specify either input_shape OR deprecated input_dim, not both.";
+			<< "Must specify either input_shape OR deprecated input_dim, not both.";
 	if (param.input_dim_size() > 0) {
 		// Deprecated 4D dimensions.
 		CHECK_EQ(param.input_size() * 4, param.input_dim_size())
-			<< "Incorrect input blob dimension specifications.";
+				<< "Incorrect input blob dimension specifications.";
 	} else {
 		CHECK_EQ(param.input_size(), param.input_shape_size())
-			<< "Exactly one input_shape must be specified per input.";
+				<< "Exactly one input_shape must be specified per input.";
 	}
 	memory_used_ = 0;
 	// set the input blobs
@@ -82,9 +82,9 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 		const LayerParameter& layer_param = param.layer(layer_id);
 		if (layer_param.propagate_down_size() > 0) {
 			CHECK_EQ(layer_param.propagate_down_size(),
-				layer_param.bottom_size())
-				<< "propagate_down param must be specified "
-				<< "either 0 or bottom_size times ";
+					layer_param.bottom_size())
+					<< "propagate_down param must be specified "
+					<< "either 0 or bottom_size times ";
 		}
 		layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param));
 		layer_names_.push_back(layer_param.name());
@@ -93,9 +93,9 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 
 		// Figure out this layer's input and output
 		for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
-			++bottom_id) {
+				++bottom_id) {
 			const int blob_id = AppendBottom(param, layer_id, bottom_id,
-				&available_blobs, &blob_name_to_idx);
+					&available_blobs, &blob_name_to_idx);
 			// If a blob needs backward, this layer should provide it.
 			need_backward |= blob_need_backward_[blob_id];
 		}
@@ -109,7 +109,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 		Layer < Dtype > *layer = layers_[layer_id].get();
 		if (layer->AutoTopBlobs()) {
 			const int needed_num_top =
-				std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
+					std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
 			for (; num_top < needed_num_top; ++num_top) {
 				// Add "anonymous" top blobs -- do not modify available_blobs or
 				// blob_name_to_idx as we don't want these blobs to be usable as input
@@ -135,17 +135,17 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 		const int param_size = layer_param.param_size();
 		const int num_param_blobs = layers_[layer_id]->blobs().size();
 		CHECK_LE(param_size, num_param_blobs)
-			<< "Too many params specified for layer " << layer_param.name();
+				<< "Too many params specified for layer " << layer_param.name();
 		ParamSpec default_param_spec;
 		for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
 			const ParamSpec* param_spec =
-				(param_id < param_size) ?
-																	&layer_param.param(param_id) :
-																	&default_param_spec;
+					(param_id < param_size) ?
+																		&layer_param.param(param_id) :
+																		&default_param_spec;
 			const bool param_need_backward = param_spec->lr_mult() > 0;
 			need_backward |= param_need_backward;
 			layers_[layer_id]->set_param_propagate_down(param_id,
-				param_need_backward);
+					param_need_backward);
 		}
 		for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
 			AppendParam(param, layer_id, param_id);
@@ -172,7 +172,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 		for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
 			const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
 			if (layers_[layer_id]->loss(top_id) ||
-				(blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
+					(blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
 				layer_contributes_loss = true;
 			}
 			if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) {
@@ -186,7 +186,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 		if (layer_need_backward_[layer_id] && layer_skip_propagate_down) {
 			layer_need_backward_[layer_id] = false;
 			for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
-				++bottom_id) {
+					++bottom_id) {
 				bottom_need_backward_[layer_id][bottom_id] = false;
 			}
 		}
@@ -197,20 +197,20 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 			LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
 		} else {
 			LOG(INFO) << layer_names_[layer_id]
-				<< " does not need backward computation.";
+					<< " does not need backward computation.";
 		}
 		for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
-			++bottom_id) {
+				++bottom_id) {
 			if (layer_contributes_loss) {
 				const string& blob_name =
-					blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+						blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
 				blobs_under_loss.insert(blob_name);
 			} else {
 				bottom_need_backward_[layer_id][bottom_id] = false;
 			}
 			if (!bottom_need_backward_[layer_id][bottom_id]) {
 				const string& blob_name =
-					blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+						blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
 				blobs_skip_backp.insert(blob_name);
 			}
 		}
@@ -220,23 +220,23 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 		for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
 			layer_need_backward_[layer_id] = true;
 			for (int bottom_id = 0;
-				bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
+					bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
 				bottom_need_backward_[layer_id][bottom_id] =
-					bottom_need_backward_[layer_id][bottom_id] ||
-						layers_[layer_id]->AllowForceBackward(bottom_id);
+						bottom_need_backward_[layer_id][bottom_id] ||
+								layers_[layer_id]->AllowForceBackward(bottom_id);
 				blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] =
-					blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] ||
-						bottom_need_backward_[layer_id][bottom_id];
+						blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] ||
+								bottom_need_backward_[layer_id][bottom_id];
 			}
 			for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-				++param_id) {
+					++param_id) {
 				layers_[layer_id]->set_param_propagate_down(param_id, true);
 			}
 		}
 	}
 	// In the end, all remaining blobs are considered output blobs.
 	for (set<string>::iterator it = available_blobs.begin();
-		it != available_blobs.end(); ++it) {
+			it != available_blobs.end(); ++it) {
 		LOG(INFO) << "This network produces output " << *it;
 		net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
 		net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
@@ -253,9 +253,9 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
 	LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::FilterNet(const NetParameter& param,
-	NetParameter* param_filtered) {
+		NetParameter* param_filtered) {
 	NetState net_state(param.state());
 	param_filtered->CopyFrom(param);
 	param_filtered->clear_layer();
@@ -263,7 +263,7 @@ void Net<Dtype>::FilterNet(const NetParameter& param,
 		const LayerParameter& layer_param = param.layer(i);
 		const string& layer_name = layer_param.name();
 		CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0)
-			<< "Specify either include rules or exclude rules; not both.";
+				<< "Specify either include rules or exclude rules; not both.";
 		// If no include rules are specified, the layer is included by default and
 		// only excluded if it meets one of the exclude rules.
 		bool layer_included = (layer_param.include_size() == 0);
@@ -283,15 +283,15 @@ void Net<Dtype>::FilterNet(const NetParameter& param,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 bool Net<Dtype>::StateMeetsRule(const NetState& state,
-	const NetStateRule& rule, const string& layer_name) {
+		const NetStateRule& rule, const string& layer_name) {
 	// Check whether the rule is broken due to phase.
 	if (rule.has_phase()) {
 		if (rule.phase() != state.phase()) {
 			LOG(INFO) << "The NetState phase (" << state.phase()
-				<< ") differed from the phase (" << rule.phase()
-				<< ") specified by a rule in layer " << layer_name;
+					<< ") differed from the phase (" << rule.phase()
+					<< ") specified by a rule in layer " << layer_name;
 			return false;
 		}
 	}
@@ -299,8 +299,8 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
 	if (rule.has_min_level()) {
 		if (state.level() < rule.min_level()) {
 			LOG(INFO) << "The NetState level (" << state.level()
-				<< ") is above the min_level (" << rule.min_level()
-				<< ") specified by a rule in layer " << layer_name;
+					<< ") is above the min_level (" << rule.min_level()
+					<< ") specified by a rule in layer " << layer_name;
 			return false;
 		}
 	}
@@ -308,8 +308,8 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
 	if (rule.has_max_level()) {
 		if (state.level() > rule.max_level()) {
 			LOG(INFO) << "The NetState level (" << state.level()
-				<< ") is above the max_level (" << rule.max_level()
-				<< ") specified by a rule in layer " << layer_name;
+					<< ") is above the max_level (" << rule.max_level()
+					<< ") specified by a rule in layer " << layer_name;
 			return false;
 		}
 	}
@@ -325,7 +325,7 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
 		}
 		if (!has_stage) {
 			LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
-				<< "' specified by a rule in layer " << layer_name;
+					<< "' specified by a rule in layer " << layer_name;
 			return false;
 		}
 	}
@@ -341,7 +341,7 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
 		}
 		if (has_stage) {
 			LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
-				<< "' specified by a rule in layer " << layer_name;
+					<< "' specified by a rule in layer " << layer_name;
 			return false;
 		}
 	}
@@ -350,30 +350,30 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
 
 // Helper for Net::Init: add a new input or top blob to the net.  (Inputs have
 // layer_id == -1, tops have layer_id >= 0.)
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
-	const int top_id, set<string>* available_blobs,
-	map<string, int>* blob_name_to_idx) {
+		const int top_id, set<string>* available_blobs,
+		map<string, int>* blob_name_to_idx) {
 	shared_ptr < LayerParameter
-		> layer_param(
-			(layer_id >= 0) ?
-												(new LayerParameter(param.layer(layer_id))) :
-												NULL);
+			> layer_param(
+					(layer_id >= 0) ?
+														(new LayerParameter(param.layer(layer_id))) :
+														NULL);
 	const string& blob_name =
-		layer_param ?
-									(layer_param->top_size() > top_id ?
-																											layer_param->top(top_id) :
-																											"(automatic)") :
-									param.input(top_id);
+			layer_param ?
+					(layer_param->top_size() > top_id ?
+																							layer_param->top(top_id) :
+																							"(automatic)") :
+					param.input(top_id);
 	// Check if we are doing in-place computation
 	if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
-		blob_name == layer_param->bottom(top_id)) {
+			blob_name == layer_param->bottom(top_id)) {
 		// In-place computation
 		LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
 		top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
 		top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
 	} else if (blob_name_to_idx &&
-		blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
+			blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
 		// If we are not doing in-place computation but have duplicated blobs,
 		// raise an error.
 		LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
@@ -396,9 +396,9 @@ void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
 			// Set the (explicitly specified) dimensions of the input blob.
 			if (param.input_dim_size() > 0) {
 				blob_pointer->Reshape(param.input_dim(top_id * 4),
-					param.input_dim(top_id * 4 + 1),
-					param.input_dim(top_id * 4 + 2),
-					param.input_dim(top_id * 4 + 3));
+						param.input_dim(top_id * 4 + 1),
+						param.input_dim(top_id * 4 + 2),
+						param.input_dim(top_id * 4 + 3));
 			} else {
 				blob_pointer->Reshape(param.input_shape(top_id));
 			}
@@ -415,15 +415,15 @@ void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
 }
 
 // Helper for Net::Init: add a new bottom blob to the net.
-template<typename Dtype>
+template <typename Dtype>
 int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
-	const int bottom_id, set<string>* available_blobs,
-	map<string, int>* blob_name_to_idx) {
+		const int bottom_id, set<string>* available_blobs,
+		map<string, int>* blob_name_to_idx) {
 	const LayerParameter& layer_param = param.layer(layer_id);
 	const string& blob_name = layer_param.bottom(bottom_id);
 	if (available_blobs->find(blob_name) == available_blobs->end()) {
 		LOG(FATAL) << "Unknown blob input " << blob_name
-			<< " (at index " << bottom_id << ") to layer " << layer_id;
+				<< " (at index " << bottom_id << ") to layer " << layer_id;
 	}
 	const int blob_id = (*blob_name_to_idx)[blob_name];
 	LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
@@ -435,18 +435,18 @@ int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
 	if (layer_param.propagate_down_size() > 0)
 		propagate_down = layer_param.propagate_down(bottom_id);
 	const bool need_backward = blob_need_backward_[blob_id] &&
-		propagate_down;
+			propagate_down;
 	bottom_need_backward_[layer_id].push_back(need_backward);
 	return blob_id;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
-	const int param_id) {
+		const int param_id) {
 	const LayerParameter& layer_param = layers_[layer_id]->layer_param();
 	const int param_size = layer_param.param_size();
 	string param_name =
-		(param_size > param_id) ? layer_param.param(param_id).name() : "";
+			(param_size > param_id) ? layer_param.param(param_id).name() : "";
 	if (param_name.size()) {
 		param_display_names_.push_back(param_name);
 	} else {
@@ -459,7 +459,7 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
 	param_id_vecs_[layer_id].push_back(net_param_id);
 	param_layer_indices_.push_back(make_pair(layer_id, param_id));
 	if (!param_size || !param_name.size() || (param_name.size() &&
-		param_names_index_.find(param_name) == param_names_index_.end())) {
+			param_names_index_.find(param_name) == param_names_index_.end())) {
 		// This layer "owns" this parameter blob -- it is either anonymous
 		// (i.e., not given a param_name) or explicitly given a name that we
 		// haven't already seen.
@@ -472,31 +472,31 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
 		const int owner_net_param_id = param_names_index_[param_name];
 		param_owners_.push_back(owner_net_param_id);
 		const pair<int, int>& owner_index =
-			param_layer_indices_[owner_net_param_id];
+				param_layer_indices_[owner_net_param_id];
 		const int owner_layer_id = owner_index.first;
 		const int owner_param_id = owner_index.second;
 		LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
-			<< "layer '" << layer_names_[owner_layer_id] << "', param "
-			<< "index " << owner_param_id;
+				<< "layer '" << layer_names_[owner_layer_id] << "', param "
+				<< "index " << owner_param_id;
 		Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get();
 		Blob < Dtype > *owner_blob =
-			layers_[owner_layer_id]->blobs()[owner_param_id].get();
+				layers_[owner_layer_id]->blobs()[owner_param_id].get();
 		const int param_size = layer_param.param_size();
 		if (param_size > param_id && (layer_param.param(param_id).share_mode() ==
-			ParamSpec_DimCheckMode_PERMISSIVE)) {
+				ParamSpec_DimCheckMode_PERMISSIVE)) {
 			// Permissive dimension checking -- only check counts are the same.
 			CHECK_EQ(this_blob->count(), owner_blob->count())
-				<< "Shared parameter blobs must have the same count.";
+					<< "Shared parameter blobs must have the same count.";
 		} else {
 			// Strict dimension checking -- all dims must be the same.
 			CHECK(this_blob->shape() == owner_blob->shape());
 		}
 		layers_[layer_id]->blobs()[param_id]->ShareData(
-			*layers_[owner_layer_id]->blobs()[owner_param_id]);
+				*layers_[owner_layer_id]->blobs()[owner_param_id]);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::GetLearningRateAndWeightDecay() {
 	LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
 	ParamSpec default_param_spec;
@@ -504,15 +504,15 @@ void Net<Dtype>::GetLearningRateAndWeightDecay() {
 		vector < shared_ptr<Blob<Dtype> > > &layer_blobs = layers_[i]->blobs();
 		for (int j = 0; j < layer_blobs.size(); ++j) {
 			const ParamSpec* param_spec =
-				(layers_[i]->layer_param().param_size() > j) ?
-					&layers_[i]->layer_param().param(j) : &default_param_spec;
+					(layers_[i]->layer_param().param_size() > j) ?
+							&layers_[i]->layer_param().param(j) : &default_param_spec;
 			params_lr_.push_back(param_spec->lr_mult());
 			params_weight_decay_.push_back(param_spec->decay_mult());
 		}
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
 	CHECK_GE(start, 0);
 	CHECK_LT(end, layers_.size());
@@ -537,7 +537,7 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
 		clFinish(amdDevice.CommandQueue);
 		layer_timer.Stop();
 		printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
-			layer_timer.MilliSeconds());
+				layer_timer.MilliSeconds());
 	}
 
 	forward_timer.Stop();
@@ -546,17 +546,17 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
 	return loss;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype Net<Dtype>::ForwardFrom(int start) {
 	return ForwardFromTo(start, layers_.size() - 1);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype Net<Dtype>::ForwardTo(int end) {
 	return ForwardFromTo(0, end);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const vector<Blob<Dtype>*>& Net<Dtype>::ForwardPrefilled(Dtype* loss) {
 	if (loss != NULL) {
 		*loss = ForwardFromTo(0, layers_.size() - 1);
@@ -566,9 +566,9 @@ const vector<Blob<Dtype>*>& Net<Dtype>::ForwardPrefilled(Dtype* loss) {
 	return net_output_blobs_;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const vector<Blob<Dtype>*>& Net<Dtype>::Forward(
-	const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
+		const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
 	// Copy bottom to internal bottom
 	for (int i = 0; i < bottom.size(); ++i) {
 		net_input_blobs_[i]->CopyFrom(*bottom[i]);
@@ -576,13 +576,13 @@ const vector<Blob<Dtype>*>& Net<Dtype>::Forward(
 	return ForwardPrefilled(loss);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 string Net<Dtype>::Forward(const string& input_blob_protos, Dtype* loss) {
 	BlobProtoVector blob_proto_vec;
 	if (net_input_blobs_.size()) {
 		blob_proto_vec.ParseFromString(input_blob_protos);
 		CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size())
-			<< "Incorrect input size.";
+				<< "Incorrect input size.";
 		for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) {
 			net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i));
 		}
@@ -597,7 +597,7 @@ string Net<Dtype>::Forward(const string& input_blob_protos, Dtype* loss) {
 	return output;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::BackwardFromTo(int start, int end) {
 	CHECK_GE(end, 0);
 	CHECK_LT(start, layers_.size());
@@ -610,14 +610,14 @@ void Net<Dtype>::BackwardFromTo(int start, int end) {
 		layer_timer.Start();
 		if (layer_need_backward_[i]) {
 			layers_[i]->Backward(
-				top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
+					top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
 			if (debug_info_) {
 				BackwardDebugInfo(i);
 			}
 			clFinish(amdDevice.CommandQueue);
 			layer_timer.Start();
 			printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
-				layer_timer.MilliSeconds());
+					layer_timer.MilliSeconds());
 		}
 	}
 
@@ -625,38 +625,38 @@ void Net<Dtype>::BackwardFromTo(int start, int end) {
 	printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::InputDebugInfo(const int input_id) {
 	const Blob<Dtype>& blob = *net_input_blobs_[input_id];
 	const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
 	const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
 	LOG(INFO) << "    [Forward] "
-		<< "Input " << blob_name << " data: " << data_abs_val_mean;
+			<< "Input " << blob_name << " data: " << data_abs_val_mean;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
 	for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
 		const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
 		const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
 		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
 		LOG(INFO) << "    [Forward] "
-			<< "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
-			<< " data: " << data_abs_val_mean;
+				<< "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
+				<< " data: " << data_abs_val_mean;
 	}
 	for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-		++param_id) {
+			++param_id) {
 		const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
 		const int net_param_id = param_id_vecs_[layer_id][param_id];
 		const string& blob_name = param_display_names_[net_param_id];
 		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
 		LOG(INFO) << "    [Forward] "
-			<< "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
-			<< " data: " << data_abs_val_mean;
+				<< "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
+				<< " data: " << data_abs_val_mean;
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
 	const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
 	for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
@@ -667,23 +667,23 @@ void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
 		const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
 		const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
 		LOG(INFO) << "    [Backward] "
-			<< "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
-			<< " diff: " << diff_abs_val_mean;
+				<< "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
+				<< " diff: " << diff_abs_val_mean;
 	}
 	for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-		++param_id) {
+			++param_id) {
 		if (!layers_[layer_id]->param_propagate_down(param_id)) {
 			continue;
 		}
 		const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
 		const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
 		LOG(INFO) << "    [Backward] "
-			<< "Layer " << layer_names_[layer_id] << ", param blob " << param_id
-			<< " diff: " << diff_abs_val_mean;
+				<< "Layer " << layer_names_[layer_id] << ", param blob " << param_id
+				<< " diff: " << diff_abs_val_mean;
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::UpdateDebugInfo(const int param_id) {
 	const Blob<Dtype>& blob = *params_[param_id];
 	const int param_owner = param_owners_[param_id];
@@ -693,20 +693,20 @@ void Net<Dtype>::UpdateDebugInfo(const int param_id) {
 	if (param_owner < 0) {
 		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
 		LOG(INFO) << "    [Update] Layer " << layer_name
-			<< ", param " << param_display_name
-			<< " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
+				<< ", param " << param_display_name
+				<< " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
 	} else {
 		const string& owner_layer_name =
-			layer_names_[param_layer_indices_[param_owner].first];
+				layer_names_[param_layer_indices_[param_owner].first];
 		LOG(INFO) << "    [Update] Layer " << layer_name
-			<< ", param blob " << param_display_name
-			<< " (owned by layer " << owner_layer_name << ", "
-			<< "param " << param_display_names_[param_owners_[param_id]] << ")"
-			<< " diff: " << diff_abs_val_mean;
+				<< ", param blob " << param_display_name
+				<< " (owned by layer " << owner_layer_name << ", "
+				<< "param " << param_display_names_[param_owners_[param_id]] << ")"
+				<< " diff: " << diff_abs_val_mean;
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
 	int num_source_layers = other->layers().size();
 	for (int i = 0; i < num_source_layers; ++i) {
@@ -714,7 +714,7 @@ void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
 		const string& source_layer_name = other->layer_names()[i];
 		int target_layer_id = 0;
 		while (target_layer_id != layer_names_.size() &&
-			layer_names_[target_layer_id] != source_layer_name) {
+				layer_names_[target_layer_id] != source_layer_name) {
 			++target_layer_id;
 		}
 		if (target_layer_id == layer_names_.size()) {
@@ -723,9 +723,9 @@ void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
 		}
 		DLOG(INFO) << "Copying source layer " << source_layer_name;
 		vector < shared_ptr<Blob<Dtype> > > &target_blobs =
-			layers_[target_layer_id]->blobs();
+				layers_[target_layer_id]->blobs();
 		CHECK_EQ(target_blobs.size(), source_layer->blobs().size())
-			<< "Incompatible number of blobs for layer " << source_layer_name;
+				<< "Incompatible number of blobs for layer " << source_layer_name;
 		for (int j = 0; j < target_blobs.size(); ++j) {
 			Blob < Dtype > *source_blob = source_layer->blobs()[j].get();
 			CHECK(target_blobs[j]->shape() == source_blob->shape());
@@ -734,17 +734,17 @@ void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::BackwardFrom(int start) {
 	BackwardFromTo(start, 0);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::BackwardTo(int end) {
 	BackwardFromTo(layers_.size() - 1, end);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::Backward() {
 	BackwardFromTo(layers_.size() - 1, 0);
 	if (debug_info_) {
@@ -761,19 +761,19 @@ void Net<Dtype>::Backward() {
 		const Dtype l2norm_data = std::sqrt(sumsq_data);
 		const Dtype l2norm_diff = std::sqrt(sumsq_diff);
 		LOG(ERROR) << "    [Backward] All net params (data, diff): "
-			<< "L1 norm = (" << asum_data << ", " << asum_diff << "); "
-			<< "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
+				<< "L1 norm = (" << asum_data << ", " << asum_diff << "); "
+				<< "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::Reshape() {
 	for (int i = 0; i < layers_.size(); ++i) {
 		layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
 	int num_source_layers = param.layer_size();
 	for (int i = 0; i < num_source_layers; ++i) {
@@ -781,7 +781,7 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
 		const string& source_layer_name = source_layer.name();
 		int target_layer_id = 0;
 		while (target_layer_id != layer_names_.size() &&
-			layer_names_[target_layer_id] != source_layer_name) {
+				layer_names_[target_layer_id] != source_layer_name) {
 			++target_layer_id;
 		}
 		if (target_layer_id == layer_names_.size()) {
@@ -790,9 +790,9 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
 		}
 		DLOG(INFO) << "Copying source layer " << source_layer_name;
 		vector < shared_ptr<Blob<Dtype> > > &target_blobs =
-			layers_[target_layer_id]->blobs();
+				layers_[target_layer_id]->blobs();
 		CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
-			<< "Incompatible number of blobs for layer " << source_layer_name;
+				<< "Incompatible number of blobs for layer " << source_layer_name;
 		for (int j = 0; j < target_blobs.size(); ++j) {
 			const bool kReshape = false;
 			target_blobs[j]->FromProto(source_layer.blobs(j), kReshape);
@@ -800,14 +800,14 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
 	NetParameter param;
 	ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
 	CopyTrainedLayersFrom(param);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
 	param->Clear();
 	param->set_name(name_);
@@ -828,7 +828,7 @@ void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Net<Dtype>::Update() {
 	// First, accumulate the diffs of any shared parameters into their owner's
 	// diff. (Assumes that the learning rate, weight decay, etc. have already been
@@ -878,14 +878,14 @@ void Net<Dtype>::Update() {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 bool Net<Dtype>::has_blob(const string& blob_name) const {
 	return blob_names_index_.find(blob_name) != blob_names_index_.end();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const shared_ptr<Blob<Dtype> > Net<Dtype>::blob_by_name(
-	const string& blob_name) const {
+		const string& blob_name) const {
 	shared_ptr < Blob<Dtype> > blob_ptr;
 	if (has_blob(blob_name)) {
 		blob_ptr = blobs_[blob_names_index_.find(blob_name)->second];
@@ -896,14 +896,14 @@ const shared_ptr<Blob<Dtype> > Net<Dtype>::blob_by_name(
 	return blob_ptr;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 bool Net<Dtype>::has_layer(const string& layer_name) const {
 	return layer_names_index_.find(layer_name) != layer_names_index_.end();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 const shared_ptr<Layer<Dtype> > Net<Dtype>::layer_by_name(
-	const string& layer_name) const {
+		const string& layer_name) const {
 	shared_ptr < Layer<Dtype> > layer_ptr;
 	if (has_layer(layer_name)) {
 		layer_ptr = layers_[layer_names_index_.find(layer_name)->second];
diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl
index 5da76b7e..f23ff9a3 100644
--- a/src/caffe/ocl/bnll_layer.cl
+++ b/src/caffe/ocl/bnll_layer.cl
@@ -38,7 +38,7 @@ template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLFor
 
 template <class T>
 __kernel void BNLLBackward(const int n, __global const T* in_diff,
-	__global const T* in_data, __global T* out_diff) {
+		__global const T* in_data, __global T* out_diff) {
 	int index = get_global_id(0);
 	if (index < n) {
 		T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD)));
@@ -47,6 +47,6 @@ __kernel void BNLLBackward(const int n, __global const T* in_diff,
 }
 
 template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff,
-	__global const float* in_data, __global float* out_diff);
+		__global const float* in_data, __global float* out_diff);
 template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff,
-	__global const double* in_data, __global double* out_diff);
+		__global const double* in_data, __global double* out_diff);
diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl
index 2c2c76ee..ba5e1f54 100644
--- a/src/caffe/ocl/concat_layer.cl
+++ b/src/caffe/ocl/concat_layer.cl
@@ -26,29 +26,29 @@
 
 template <class T>
 __kernel void Concat(const int nthreads, __global const T* in_data,
-    const int forward, const int num_concats, const int concat_size,
-    const int top_concat_axis, const int bottom_concat_axis,
-    const int offset_concat_axis, __global T* out_data) {
-    int index = get_global_id(0);
-    if(index < nthreads) {
-        const int total_concat_size = concat_size * bottom_concat_axis;
-        const int concat_num = index / total_concat_size;
-        const int concat_index = index % total_concat_size;
-        const int top_index = concat_index +
-            (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
-        if (forward == 1) {
-            out_data[top_index] = in_data[index];
-        } else {
-            out_data[index] = in_data[top_index];
-        }
-    }
+		const int forward, const int num_concats, const int concat_size,
+		const int top_concat_axis, const int bottom_concat_axis,
+		const int offset_concat_axis, __global T* out_data) {
+	int index = get_global_id(0);
+	if(index < nthreads) {
+		const int total_concat_size = concat_size * bottom_concat_axis;
+		const int concat_num = index / total_concat_size;
+		const int concat_index = index % total_concat_size;
+		const int top_index = concat_index +
+		(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
+		if (forward == 1) {
+			out_data[top_index] = in_data[index];
+		} else {
+			out_data[index] = in_data[top_index];
+		}
+	}
 }
 
-template __attribute__((mangled_name(Concat_float))) __kernel void  Concat(const int nthreads, __global const float* in_data,
-    const int forward, const int num_concats, const int concat_size,
-    const int top_concat_axis, const int bottom_concat_axis,
-    const int offset_concat_axis, __global float* out_data);
-template __attribute__((mangled_name(Concat_double))) __kernel void  Concat(const int nthreads, __global const double* in_data,
-    const int forward, const int num_concats, const int concat_size,
-    const int top_concat_axis, const int bottom_concat_axis,
-    const int offset_concat_axis, __global double* out_data);
+template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data,
+		const int forward, const int num_concats, const int concat_size,
+		const int top_concat_axis, const int bottom_concat_axis,
+		const int offset_concat_axis, __global float* out_data);
+template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data,
+		const int forward, const int num_concats, const int concat_size,
+		const int top_concat_axis, const int bottom_concat_axis,
+		const int offset_concat_axis, __global double* out_data);
diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl
index 0aeea80c..b6fdebc7 100644
--- a/src/caffe/ocl/contrastive_loss_layer.cl
+++ b/src/caffe/ocl/contrastive_loss_layer.cl
@@ -26,9 +26,9 @@
 
 template <class Dtype>
 __kernel void CLLBackward(const int count, const int channels,
-	const Dtype margin, const bool legacy_version, const Dtype alpha,
-	__global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq,
-	__global Dtype *bottom_diff) {
+		const Dtype margin, const bool legacy_version, const Dtype alpha,
+		__global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq,
+		__global Dtype *bottom_diff) {
 	int i = get_global_id(0);
 	if(i < count) {
 		int n = i / channels;  // the num index, to access y and dist_sq
@@ -55,10 +55,10 @@ __kernel void CLLBackward(const int count, const int channels,
 }
 
 template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels,
-	const float margin, const bool legacy_version, const float alpha,
-	__global const float* y, __global const float* diff, __global const float* dist_sq,
-	__global float *bottom_diff);
+		const float margin, const bool legacy_version, const float alpha,
+		__global const float* y, __global const float* diff, __global const float* dist_sq,
+		__global float *bottom_diff);
 template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels,
-	const double margin, const bool legacy_version, const double alpha,
-	__global const double* y, __global const double* diff, __global const double* dist_sq,
-	__global double *bottom_diff);
+		const double margin, const bool legacy_version, const double alpha,
+		__global const double* y, __global const double* diff, __global const double* dist_sq,
+		__global double *bottom_diff);
diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl
index 0e1812d8..3f60a34f 100644
--- a/src/caffe/ocl/eltwise_layer.cl
+++ b/src/caffe/ocl/eltwise_layer.cl
@@ -26,8 +26,8 @@
 
 template <class Dtype>
 __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a,
-	__global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data,
-	__global int* mask) {
+		__global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data,
+		__global int* mask) {
 	int index = get_global_id(0);
 	if(index < nthreads) {
 		Dtype maxval = -FLT_MAX;
@@ -49,15 +49,15 @@ __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a
 	}
 }
 template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a,
-	__global const float* bottom_data_b, const int blob_idx, __global float* top_data,
-	__global int* mask);
+		__global const float* bottom_data_b, const int blob_idx, __global float* top_data,
+		__global int* mask);
 template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a,
-	__global const double* bottom_data_b, const int blob_idx, __global double* top_data,
-	__global int* mask);
+		__global const double* bottom_data_b, const int blob_idx, __global double* top_data,
+		__global int* mask);
 
 template <class Dtype>
 __kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff,
-	const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) {
+		const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) {
 	int index = get_global_id(0);
 	if(index < nthreads) {
 		Dtype gradient = 0;
@@ -68,6 +68,6 @@ __kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff,
 	}
 }
 template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff,
-	const int blob_idx, __global const int* mask, __global float* bottom_diff);
+		const int blob_idx, __global const int* mask, __global float* bottom_diff);
 template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff,
-	const int blob_idx, __global const int* mask, __global double* bottom_diff);
+		const int blob_idx, __global const int* mask, __global double* bottom_diff);
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index c08d1310..46248024 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -94,11 +94,11 @@ template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_o
 
 template <class T>
 __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	const int height_col, const int width_col,
-	__global T* data_col, const int col_offset) {
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		const int height_col, const int width_col,
+		__global T* data_col, const int col_offset) {
 	data_im = data_im + img_offset;
 	data_col = data_col + col_offset;
 
@@ -128,22 +128,22 @@ __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const in
 }
 
 template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
-	const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-	const int height_col, const int width_col, __global float* data_col, const int col_offset);
+		const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+		const int height_col, const int width_col, __global float* data_col, const int col_offset);
 template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
-	const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-	const int height_col, const int width_col, __global double* data_col, const int col_offset);
+		const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+		const int height_col, const int width_col, __global double* data_col, const int col_offset);
 
 template <class T>
 __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset,
-	const int height, const int width, const int channels,
-	const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	const int height_col, const int width_col,
-	__global T* data_im, const int img_offset) {
+		const int height, const int width, const int channels,
+		const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		const int height_col, const int width_col,
+		__global T* data_im, const int img_offset) {
 	data_col = data_col + col_offset;
 	data_im = data_im + img_offset;
 	int index = get_global_id(0);
@@ -172,14 +172,14 @@ __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const i
 }
 
 template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
-	const int height, const int width, const int channels,
-	const int patch_h, const int patch_w,const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,const int height_col, const int width_col,
-	__global float* data_im, const int img_offset);
+		const int height, const int width, const int channels,
+		const int patch_h, const int patch_w,const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,const int height_col, const int width_col,
+		__global float* data_im, const int img_offset);
 template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
-	const int col_offset, const int height, const int width, const int channels,
-	const int patch_h, const int patch_w, const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
+		const int col_offset, const int height, const int width, const int channels,
+		const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
 
 template <class T>
 __kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset) {
diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl
index 1a53f772..e9938966 100644
--- a/src/caffe/ocl/lrn_layer.cl
+++ b/src/caffe/ocl/lrn_layer.cl
@@ -78,7 +78,7 @@ __kernel void LRNFillScale(const int nthreads, __global T* in, const int num, co
 		}
 	}
 }
-template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k,  __global float* scale);
+template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale);
 template __attribute__((mangled_name(LRNFillScale_double))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale);
 
 template <class T>
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
index 3162b92e..786ddc16 100644
--- a/src/caffe/ocl/pooling_layer.cl
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -163,11 +163,11 @@ template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void
 
 template <class T>
 __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
-	__global int* mask, __global T* top_mask, const int num,
-	const int channels, const int height, const int width,
-	const int pooled_height, const int pooled_width, const int kernel_h,
-	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-	const int pad_w, __global T* const bottom_diff) {
+		__global int* mask, __global T* top_mask, const int num,
+		const int channels, const int height, const int width,
+		const int pooled_height, const int pooled_width, const int kernel_h,
+		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+		const int pad_w, __global T* const bottom_diff) {
 	int index = get_global_id(0);
 	int total = get_global_size(0);
 	for(index; index < nthreads; index += total) {
@@ -246,11 +246,11 @@ template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void Ave
 
 template <class Dtype>
 __kernel void StoPoolBackward(const int nthreads,
-	__global Dtype* rand_idx, __global Dtype* top_diff,
-	const int num, const int channels, const int height,
-	const int width, const int pooled_height, const int pooled_width,
-	const int kernel_h, const int kernel_w, const int stride_h,
-	const int stride_w, __global Dtype* bottom_diff) {
+		__global Dtype* rand_idx, __global Dtype* top_diff,
+		const int num, const int channels, const int height,
+		const int width, const int pooled_height, const int pooled_width,
+		const int kernel_h, const int kernel_w, const int stride_h,
+		const int stride_w, __global Dtype* bottom_diff) {
 	int index = get_global_id(0);
 	int total = get_global_size(0);
 	for(index; index < nthreads; index += total) {
@@ -279,15 +279,15 @@ __kernel void StoPoolBackward(const int nthreads,
 
 	}
 }
-template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel  void StoPoolBackward(const int nthreads,
-    __global float* rand_idx, __global float* top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global float* bottom_diff);
+template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads,
+		__global float* rand_idx, __global float* top_diff,
+		const int num, const int channels, const int height,
+		const int width, const int pooled_height, const int pooled_width,
+		const int kernel_h, const int kernel_w, const int stride_h,
+		const int stride_w, __global float* bottom_diff);
 template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads,
-    __global double* rand_idx, __global double* top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, __global double* bottom_diff);
+		__global double* rand_idx, __global double* top_diff,
+		const int num, const int channels, const int height,
+		const int width, const int pooled_height, const int pooled_width,
+		const int kernel_h, const int kernel_w, const int stride_h,
+		const int stride_w, __global double* bottom_diff);
diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl
index 5fbea781..de46a5da 100644
--- a/src/caffe/ocl/prelu_layer.cl
+++ b/src/caffe/ocl/prelu_layer.cl
@@ -41,7 +41,7 @@ __kernel void PReLUBackward(const int count, const int channels, const int dim,
 	if(index < count) {
 		int c = (index / dim) % channels / div_factor;
 		out_diff[index] = in_diff[index] * ((in_data[index] > 0)
-			+ (in_data[index] <= 0) * slope_data[c]);
+				+ (in_data[index] <= 0) * slope_data[c]);
 	}
 }
 template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor);
diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl
index 94a41db4..57b40dfe 100644
--- a/src/caffe/ocl/random.cl
+++ b/src/caffe/ocl/random.cl
@@ -47,9 +47,9 @@ enum r123_enum_threefry32x4
 };
 
 inline uint32_t RotL_32(uint32_t x, unsigned int N)
-	__attribute__((always_inline));
+		__attribute__((always_inline));
 inline uint32_t RotL_32(uint32_t x, unsigned int N)
-	{
+		{
 	return (x << (N & 31)) | (x >> ((32 - N) & 31));
 }
 
@@ -58,10 +58,10 @@ typedef struct r123array4x32 threefry4x32_key_t;
 typedef struct r123array4x32 threefry4x32_ukey_t;
 
 inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
-	threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline));
+		threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline));
 inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
-	threefry4x32_ctr_t in, threefry4x32_key_t k)
-	{
+		threefry4x32_ctr_t in, threefry4x32_key_t k)
+		{
 	threefry4x32_ctr_t X;
 	uint32_t ks[4 + 1];
 	int i;
@@ -95,7 +95,7 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
 	X.v[2] += ks[2];
 	X.v[3] += ks[3];
 	if (Nrounds > 0)
-		{
+			{
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
@@ -802,13 +802,13 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
 
 template <class T>
 __kernel void PRNG_threefry4x32(
-	__global uint4 *randomnumber,
-	threefry4x32_ctr_t ctr_i,
-	T inf,
-	T sup,
-	T threshold,
-	uint nrounds,
-	uint numrandom
+		__global uint4 *randomnumber,
+		threefry4x32_ctr_t ctr_i,
+		T inf,
+		T sup,
+		T threshold,
+		uint nrounds,
+		uint numrandom
 ) {
 	size_t gdx = get_global_id(0);
 
diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl
index 4069ce16..6fe0daab 100644
--- a/src/caffe/ocl/softmax_layer.cl
+++ b/src/caffe/ocl/softmax_layer.cl
@@ -75,7 +75,7 @@ template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softma
 
 template <class T>
 __kernel void kernel_channel_max(const int num, const int channels,
-	const int spatial_dim, __global const T* data, __global T* out) {
+		const int spatial_dim, __global const T* data, __global T* out) {
 	int index = get_global_id(0);
 	if(index < num * spatial_dim) {
 		int n = index / spatial_dim;
@@ -89,14 +89,14 @@ __kernel void kernel_channel_max(const int num, const int channels,
 }
 
 template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels,
-	const int spatial_dim, __global const float* data, __global float* out);
+		const int spatial_dim, __global const float* data, __global float* out);
 template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels,
-	const int spatial_dim, __global const double* data, __global double* out);
+		const int spatial_dim, __global const double* data, __global double* out);
 
 template <class T>
 __kernel void kernel_channel_subtract(const int count,
-	const int num, const int channels,
-	const int spatial_dim, __global const T* channel_max, __global T* data) {
+		const int num, const int channels,
+		const int spatial_dim, __global const T* channel_max, __global T* data) {
 	int index = get_global_id(0);
 	if(index < count) {
 		int n = index / channels / spatial_dim;
@@ -109,7 +109,7 @@ template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel
 
 template <class T>
 __kernel void kernel_channel_sum(const int num, const int channels,
-	const int spatial_dim, __global const T* data, __global T* channel_sum) {
+		const int spatial_dim, __global const T* data, __global T* channel_sum) {
 	int index = get_global_id(0);
 	if(index < num * spatial_dim) {
 		int n = index / spatial_dim;
@@ -123,14 +123,14 @@ __kernel void kernel_channel_sum(const int num, const int channels,
 }
 
 template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels,
-	const int spatial_dim, __global const float* data, __global float* channel_sum);
+		const int spatial_dim, __global const float* data, __global float* channel_sum);
 template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels,
-	const int spatial_dim, __global const double* data, __global double* channel_sum);
+		const int spatial_dim, __global const double* data, __global double* channel_sum);
 
 template <class T>
 __kernel void kernel_channel_div(const int count,
-	const int num, const int channels,
-	const int spatial_dim, __global const T* channel_sum, __global T* data) {
+		const int num, const int channels,
+		const int spatial_dim, __global const T* channel_sum, __global T* data) {
 	int index = get_global_id(0);
 	if(index < count) {
 		int n = index / channels / spatial_dim;
@@ -140,16 +140,16 @@ __kernel void kernel_channel_div(const int count,
 }
 
 template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count,
-	const int num, const int channels,
-	const int spatial_dim, __global const float* channel_sum, __global float* data);
+		const int num, const int channels,
+		const int spatial_dim, __global const float* channel_sum, __global float* data);
 template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count,
-	const int num, const int channels,
-	const int spatial_dim, __global const double* channel_sum, __global double* data);
+		const int num, const int channels,
+		const int spatial_dim, __global const double* channel_sum, __global double* data);
 
 template <class T>
 __kernel void kernel_channel_dot(const int num, const int channels,
-	const int spatial_dim, __global const T* data_1, __global const T* data_2,
-	__global T* channel_dot) {
+		const int spatial_dim, __global const T* data_1, __global const T* data_2,
+		__global T* channel_dot) {
 	int index = get_global_id(0);
 	if(index < num * spatial_dim) {
 		int n = index / spatial_dim;
@@ -157,15 +157,15 @@ __kernel void kernel_channel_dot(const int num, const int channels,
 		T dot = 0;
 		for (int c = 0; c < channels; ++c) {
 			dot += (data_1[(n * channels + c) * spatial_dim + s]
-				* data_2[(n * channels + c) * spatial_dim + s]);
+					* data_2[(n * channels + c) * spatial_dim + s]);
 		}
 		channel_dot[index] = dot;
 	}
 }
 
 template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels,
-	const int spatial_dim, __global const float* data_1, __global const float* data_2,
-	__global float* channel_dot);
+		const int spatial_dim, __global const float* data_1, __global const float* data_2,
+		__global float* channel_dot);
 template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels,
-	const int spatial_dim, __global const double* data_1, __global const double* data_2,
-	__global double* channel_dot);
+		const int spatial_dim, __global const double* data_1, __global const double* data_2,
+		__global double* channel_dot);
diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl
index 025f59ac..70c282e1 100644
--- a/src/caffe/ocl/softmaxwithloss_layer.cl
+++ b/src/caffe/ocl/softmaxwithloss_layer.cl
@@ -26,10 +26,10 @@
 
 template <class T>
 __kernel void SoftmaxLossForwardGPU(const int nthreads,
-	__global T* prob_data, __global T* label,__global T* loss,
-	int num, int dim, int spatial_dim,
-	bool has_ignore_label_, int ignore_label_,
-	__global T* counts) {
+		__global T* prob_data, __global T* label,__global T* loss,
+		int num, int dim, int spatial_dim,
+		bool has_ignore_label_, int ignore_label_,
+		__global T* counts) {
 	int index = get_global_id(0);
 	if(index < nthreads) {
 		const int n = index / spatial_dim;
@@ -40,28 +40,28 @@ __kernel void SoftmaxLossForwardGPU(const int nthreads,
 			counts[index] = 0;
 		} else {
 			loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
-					T(FLT_MIN)));
+							T(FLT_MIN)));
 			counts[index] = 1;
 		}
 	}
 }
 
 template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
-	__global float* prob_data, __global float* label,__global float* loss,
-	int num, int dim, int spatial_dim,
-	bool has_ignore_label_, int ignore_label_,
-	__global float* counts);
+		__global float* prob_data, __global float* label,__global float* loss,
+		int num, int dim, int spatial_dim,
+		bool has_ignore_label_, int ignore_label_,
+		__global float* counts);
 template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
-	__global double* prob_data, __global double* label,__global double* loss,
-	int num, int dim, int spatial_dim,
-	bool has_ignore_label_, int ignore_label_,
-	__global double* counts);
+		__global double* prob_data, __global double* label,__global double* loss,
+		int num, int dim, int spatial_dim,
+		bool has_ignore_label_, int ignore_label_,
+		__global double* counts);
 
 template <class T>
 __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
-	__global T* label,__global T* bottom_diff, int num, int dim,
-	int spatial_dim, bool has_ignore_label_,
-	int ignore_label_, T* counts) {
+		__global T* label,__global T* bottom_diff, int num, int dim,
+		int spatial_dim, bool has_ignore_label_,
+		int ignore_label_, T* counts) {
 	const int channels = dim / spatial_dim;
 	int index = get_global_id(0);
 	if(index < nthreads) {
@@ -81,14 +81,14 @@ __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
 	}
 }
 template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
-	__global float* label,__global float* bottom_diff, int num, int dim,
-	int spatial_dim, bool has_ignore_label_,
-	int ignore_label_, float* counts);
+		__global float* label,__global float* bottom_diff, int num, int dim,
+		int spatial_dim, bool has_ignore_label_,
+		int ignore_label_, float* counts);
 
 template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
-	__global double* label,__global double* bottom_diff, int num, int dim,
-	int spatial_dim, bool has_ignore_label_,
-	int ignore_label_, double* counts);
+		__global double* label,__global double* bottom_diff, int num, int dim,
+		int spatial_dim, bool has_ignore_label_,
+		int ignore_label_, double* counts);
 
 template <class T>
 __kernel void scal (const int num, const T alpha, __global T* data) {
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index ae675500..0a07a218 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -13,13 +13,14 @@
 #include "caffe/util/ocl_wrapper.hpp"
 namespace caffe {
 
-template<typename Dtype>
+template <typename Dtype>
 Solver<Dtype>::Solver(const SolverParameter& param)
-	: net_() {
+:
+		net_() {
 	Init(param);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::ocl_setup() {
 	scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
 	add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL);
@@ -27,18 +28,19 @@ void Solver<Dtype>::ocl_setup() {
 	powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Solver<Dtype>::Solver(const string& param_file)
-	: net_() {
+:
+		net_() {
 	SolverParameter param;
 	ReadProtoFromTextFileOrDie(param_file, &param);
 	Init(param);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::Init(const SolverParameter& param) {
 	LOG(INFO) << "Initializing solver from parameters: " << std::endl
-		<< param.DebugString();
+			<< param.DebugString();
 	param_ = param;
 	CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
 
@@ -55,22 +57,22 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
 	current_step_ = 0;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::InitTrainNet() {
 	const int num_train_nets = param_.has_net() + param_.has_net_param() +
-		param_.has_train_net() + param_.has_train_net_param();
+			param_.has_train_net() + param_.has_train_net_param();
 	const string& field_names = "net, net_param, train_net, train_net_param";
 	CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
-		<< "using one of these fields: " << field_names;
+			<< "using one of these fields: " << field_names;
 	CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
-		<< "one of these fields specifying a train_net: " << field_names;
+			<< "one of these fields specifying a train_net: " << field_names;
 	NetParameter net_param;
 	if (param_.has_train_net_param()) {
 		LOG(INFO) << "Creating training net specified in train_net_param.";
 		net_param.CopyFrom(param_.train_net_param());
 	} else if (param_.has_train_net()) {
 		LOG(INFO) << "Creating training net from train_net file: "
-			<< param_.train_net();
+				<< param_.train_net();
 		ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
 	}
 	if (param_.has_net_param()) {
@@ -93,22 +95,22 @@ void Solver<Dtype>::InitTrainNet() {
 	net_.reset(new Net<Dtype>(net_param));
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::InitTestNets() {
 	const bool has_net_param = param_.has_net_param();
 	const bool has_net_file = param_.has_net();
 	const int num_generic_nets = has_net_param + has_net_file;
 	CHECK_LE(num_generic_nets, 1)
-		<< "Both net_param and net_file may not be specified.";
+			<< "Both net_param and net_file may not be specified.";
 	const int num_test_net_params = param_.test_net_param_size();
 	const int num_test_net_files = param_.test_net_size();
 	const int num_test_nets = num_test_net_params + num_test_net_files;
 	if (num_generic_nets) {
 		CHECK_GE(param_.test_iter_size(), num_test_nets)
-			<< "test_iter must be specified for each test network.";
+				<< "test_iter must be specified for each test network.";
 	} else {
 		CHECK_EQ(param_.test_iter_size(), num_test_nets)
-			<< "test_iter must be specified for each test network.";
+				<< "test_iter must be specified for each test network.";
 	}
 	// If we have a generic net (specified by net or net_param, rather than
 	// test_net or test_net_param), we may have an unlimited number of actual
@@ -119,7 +121,7 @@ void Solver<Dtype>::InitTestNets() {
 	const int num_test_net_instances = num_test_nets + num_generic_net_instances;
 	if (param_.test_state_size()) {
 		CHECK_EQ(param_.test_state_size(), num_test_net_instances)
-			<< "test_state must be unspecified or specified once per test net.";
+				<< "test_state must be unspecified or specified once per test net.";
 	}
 	if (num_test_net_instances) {
 		CHECK_GT(param_.test_interval(), 0);
@@ -134,7 +136,7 @@ void Solver<Dtype>::InitTestNets() {
 	for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) {
 		sources[test_net_id] = "test_net file: " + param_.test_net(i);
 		ReadNetParamsFromTextFileOrDie(param_.test_net(i),
-			&net_params[test_net_id]);
+				&net_params[test_net_id]);
 	}
 	const int remaining_test_nets = param_.test_iter_size() - test_net_id;
 	if (has_net_param) {
@@ -163,13 +165,13 @@ void Solver<Dtype>::InitTestNets() {
 		}
 		net_params[i].mutable_state()->CopyFrom(net_state);
 		LOG(INFO)
-			<< "Creating test net (#" << i << ") specified by " << sources[i];
+				<< "Creating test net (#" << i << ") specified by " << sources[i];
 		test_nets_[i].reset(new Net<Dtype>(net_params[i]));
 		test_nets_[i]->set_debug_info(param_.debug_info());
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::Step(int iters) {
 	vector<Blob<Dtype>*> bottom_vec;
 	const int start_iter = iter_;
@@ -185,19 +187,19 @@ void Solver<Dtype>::Step(int iters) {
 			switch (Caffe::mode()) {
 				case Caffe::CPU:
 					caffe_set(blob->count(), static_cast<Dtype>(0),
-						blob->mutable_cpu_diff());
+							blob->mutable_cpu_diff());
 					break;
 				case Caffe::GPU:
 					#ifndef CPU_ONLY
 					caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
-						blob->mutable_gpu_diff());
+							blob->mutable_gpu_diff());
 #else
 					NO_GPU;
 #endif
 				case Caffe::APU:
 					#ifndef CPU_ONLY
 					caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
-						blob->mutable_gpu_diff());
+							blob->mutable_gpu_diff());
 #else
 					NO_GPU;
 #endif
@@ -206,7 +208,7 @@ void Solver<Dtype>::Step(int iters) {
 		}
 
 		if (param_.test_interval() && iter_ % param_.test_interval() == 0
-			&& (iter_ > 0 || param_.test_initialization())) {
+				&& (iter_ > 0 || param_.test_initialization())) {
 			TestAll();
 		}
 
@@ -228,10 +230,10 @@ void Solver<Dtype>::Step(int iters) {
 			smoothed_loss += (loss - losses[idx]) / average_loss;
 			losses[idx] = loss;
 			printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss,
-				losses[idx], idx);
+					losses[idx], idx);
 		}
 		printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n",
-			smoothed_loss, average_loss, losses.size());
+				smoothed_loss, average_loss, losses.size());
 		if (display) {
 			LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
 			const vector<Blob<Dtype>*>& result = net_->output_blobs();
@@ -239,18 +241,18 @@ void Solver<Dtype>::Step(int iters) {
 			for (int j = 0; j < result.size(); ++j) {
 				const Dtype* result_vec = result[j]->cpu_data();
 				const string& output_name =
-					net_->blob_names()[net_->output_blob_indices()[j]];
+						net_->blob_names()[net_->output_blob_indices()[j]];
 				const Dtype loss_weight =
-					net_->blob_loss_weights()[net_->output_blob_indices()[j]];
+						net_->blob_loss_weights()[net_->output_blob_indices()[j]];
 				for (int k = 0; k < result[j]->count(); ++k) {
 					ostringstream loss_msg_stream;
 					if (loss_weight) {
 						loss_msg_stream << " (* " << loss_weight
-							<< " = " << loss_weight * result_vec[k] << " loss)";
+								<< " = " << loss_weight * result_vec[k] << " loss)";
 					}
 					LOG(INFO) << "    Train net output #"
-						<< score_index++ << ": " << output_name << " = "
-						<< result_vec[k] << loss_msg_stream.str();
+							<< score_index++ << ": " << output_name << " = "
+							<< result_vec[k] << loss_msg_stream.str();
 				}
 			}
 		}
@@ -267,7 +269,7 @@ void Solver<Dtype>::Step(int iters) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::Solve(const char* resume_file) {
 	LOG(INFO) << "Solving " << net_->name();
 	LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
@@ -283,7 +285,7 @@ void Solver<Dtype>::Solve(const char* resume_file) {
 	// If we haven't already, save a snapshot after optimization, unless
 	// overridden by setting snapshot_after_train := false
 	if (param_.snapshot_after_train()
-		&& (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
+			&& (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
 		Snapshot();
 	}
 	// After the optimization is done, run an additional train and test pass to
@@ -303,19 +305,19 @@ void Solver<Dtype>::Solve(const char* resume_file) {
 	LOG(INFO) << "Optimization Done.";
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::TestAll() {
 	for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) {
 		Test(test_net_id);
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::Test(const int test_net_id) {
 	LOG(INFO) << "Iteration " << iter_
-		<< ", Testing net (#" << test_net_id << ")";
+			<< ", Testing net (#" << test_net_id << ")";
 	CHECK_NOTNULL(test_nets_[test_net_id].get())->
-		ShareTrainedLayersWith(net_.get());
+			ShareTrainedLayersWith(net_.get());
 	vector < Dtype > test_score;
 	vector<int> test_score_output_id;
 	vector<Blob<Dtype>*> bottom_vec;
@@ -324,7 +326,7 @@ void Solver<Dtype>::Test(const int test_net_id) {
 	for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
 		Dtype iter_loss;
 		const vector<Blob<Dtype>*>& result =
-			test_net->Forward(bottom_vec, &iter_loss);
+				test_net->Forward(bottom_vec, &iter_loss);
 		if (param_.test_compute_loss()) {
 			loss += iter_loss;
 		}
@@ -352,21 +354,21 @@ void Solver<Dtype>::Test(const int test_net_id) {
 	}
 	for (int i = 0; i < test_score.size(); ++i) {
 		const int output_blob_index =
-			test_net->output_blob_indices()[test_score_output_id[i]];
+				test_net->output_blob_indices()[test_score_output_id[i]];
 		const string& output_name = test_net->blob_names()[output_blob_index];
 		const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
 		ostringstream loss_msg_stream;
 		const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
 		if (loss_weight) {
 			loss_msg_stream << " (* " << loss_weight
-				<< " = " << loss_weight * mean_score << " loss)";
+					<< " = " << loss_weight * mean_score << " loss)";
 		}
 		LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
-			<< mean_score << loss_msg_stream.str();
+				<< mean_score << loss_msg_stream.str();
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::Snapshot() {
 	NetParameter net_param;
 	// For intermediate results, we will also dump the gradient values.
@@ -390,7 +392,7 @@ void Solver<Dtype>::Snapshot() {
 	WriteProtoToBinaryFile(state, snapshot_filename.c_str());
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void Solver<Dtype>::Restore(const char* state_file) {
 	SolverState state;
 	NetParameter net_param;
@@ -419,7 +421,7 @@ void Solver<Dtype>::Restore(const char* state_file) {
 //
 // where base_lr, max_iter, gamma, step, stepvalue and power are defined
 // in the solver parameter protocol buffer, and iter is the current iteration.
-template<typename Dtype>
+template <typename Dtype>
 Dtype SGDSolver<Dtype>::GetLearningRate() {
 	Dtype rate;
 	const string& lr_policy = this->param_.lr_policy();
@@ -428,37 +430,37 @@ Dtype SGDSolver<Dtype>::GetLearningRate() {
 	} else if (lr_policy == "step") {
 		this->current_step_ = this->iter_ / this->param_.stepsize();
 		rate = this->param_.base_lr() *
-			pow(this->param_.gamma(), this->current_step_);
+				pow(this->param_.gamma(), this->current_step_);
 	} else if (lr_policy == "exp") {
 		rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
 	} else if (lr_policy == "inv") {
 		rate = this->param_.base_lr() *
-			pow(Dtype(1) + this->param_.gamma() * this->iter_,
-				-this->param_.power());
+				pow(Dtype(1) + this->param_.gamma() * this->iter_,
+						-this->param_.power());
 	} else if (lr_policy == "multistep") {
 		if (this->current_step_ < this->param_.stepvalue_size() &&
-			this->iter_ >= this->param_.stepvalue(this->current_step_)) {
+				this->iter_ >= this->param_.stepvalue(this->current_step_)) {
 			this->current_step_++;
 			LOG(INFO) << "MultiStep Status: Iteration " <<
-				this->iter_ << ", step = " << this->current_step_;
+					this->iter_ << ", step = " << this->current_step_;
 		}
 		rate = this->param_.base_lr() *
-			pow(this->param_.gamma(), this->current_step_);
+				pow(this->param_.gamma(), this->current_step_);
 	} else if (lr_policy == "poly") {
 		rate = this->param_.base_lr() * pow(Dtype(1.) -
-			(Dtype(this->iter_) / Dtype(this->param_.max_iter())),
-			this->param_.power());
+				(Dtype(this->iter_) / Dtype(this->param_.max_iter())),
+				this->param_.power());
 	} else if (lr_policy == "sigmoid") {
 		rate = this->param_.base_lr() * (Dtype(1.) /
-			(Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) -
-				Dtype(this->param_.stepsize())))));
+				(Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) -
+						Dtype(this->param_.stepsize())))));
 	} else {
 		LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
 	}
 	return rate;
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SGDSolver<Dtype>::PreSolve() {
 	// Initialize the history
 	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
@@ -473,7 +475,7 @@ void SGDSolver<Dtype>::PreSolve() {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SGDSolver<Dtype>::ClipGradients() {
 	const Dtype clip_gradients = this->param_.clip_gradients();
 	if (clip_gradients < 0) {
@@ -490,8 +492,8 @@ void SGDSolver<Dtype>::ClipGradients() {
 	if (l2norm_diff > clip_gradients) {
 		Dtype scale_factor = clip_gradients / l2norm_diff;
 		LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
-			<< l2norm_diff << " > " << clip_gradients << ") "
-			<< "by scale factor " << scale_factor;
+				<< l2norm_diff << " > " << clip_gradients << ") "
+				<< "by scale factor " << scale_factor;
 		for (int i = 0; i < net_params.size(); ++i) {
 			if (this->net_->param_owners()[i] < 0) {
 				net_params[i]->scale_diff(scale_factor);
@@ -500,7 +502,7 @@ void SGDSolver<Dtype>::ClipGradients() {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SGDSolver<Dtype>::ApplyUpdate() {
 	Dtype rate = GetLearningRate();
 	if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
@@ -515,7 +517,7 @@ void SGDSolver<Dtype>::ApplyUpdate() {
 	this->net_->Update();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SGDSolver<Dtype>::Normalize(int param_id) {
 	if (this->param_.iter_size() == 1) {
 		return;
@@ -526,13 +528,13 @@ void SGDSolver<Dtype>::Normalize(int param_id) {
 	switch (Caffe::mode()) {
 		case Caffe::CPU: {
 			caffe_scal(net_params[param_id]->count(), accum_normalization,
-				net_params[param_id]->mutable_cpu_diff());
+					net_params[param_id]->mutable_cpu_diff());
 			break;
 		}
 		case Caffe::GPU: {
 #ifndef CPU_ONLY
 			caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
-				net_params[param_id]->mutable_gpu_diff());
+					net_params[param_id]->mutable_gpu_diff());
 #else
 			NO_GPU;
 #endif
@@ -543,11 +545,11 @@ void SGDSolver<Dtype>::Normalize(int param_id) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SGDSolver<Dtype>::Regularize(int param_id) {
 	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
 	const vector<float>& net_params_weight_decay =
-		this->net_->params_weight_decay();
+			this->net_->params_weight_decay();
 	Dtype weight_decay = this->param_.weight_decay();
 	string regularization_type = this->param_.regularization_type();
 	Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
@@ -558,17 +560,17 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
 				if (regularization_type == "L2") {
 					// add weight decay
 					caffe_axpy(net_params[param_id]->count(),
-						local_decay,
-						net_params[param_id]->cpu_data(),
-						net_params[param_id]->mutable_cpu_diff());
+							local_decay,
+							net_params[param_id]->cpu_data(),
+							net_params[param_id]->mutable_cpu_diff());
 				} else if (regularization_type == "L1") {
 					caffe_cpu_sign(net_params[param_id]->count(),
-						net_params[param_id]->cpu_data(),
-						temp_[param_id]->mutable_cpu_data());
+							net_params[param_id]->cpu_data(),
+							temp_[param_id]->mutable_cpu_data());
 					caffe_axpy(net_params[param_id]->count(),
-						local_decay,
-						temp_[param_id]->cpu_data(),
-						net_params[param_id]->mutable_cpu_diff());
+							local_decay,
+							temp_[param_id]->cpu_data(),
+							net_params[param_id]->mutable_cpu_diff());
 				} else {
 					LOG(FATAL) << "Unknown regularization type: " << regularization_type;
 				}
@@ -581,17 +583,17 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
 				if (regularization_type == "L2") {
 					// add weight decay
 					caffe_gpu_axpy(net_params[param_id]->count(),
-						local_decay,
-						net_params[param_id]->gpu_data(),
-						net_params[param_id]->mutable_gpu_diff());
+							local_decay,
+							net_params[param_id]->gpu_data(),
+							net_params[param_id]->mutable_gpu_diff());
 				} else if (regularization_type == "L1") {
 					caffe_gpu_sign(net_params[param_id]->count(),
-						net_params[param_id]->gpu_data(),
-						temp_[param_id]->mutable_gpu_data());
+							net_params[param_id]->gpu_data(),
+							temp_[param_id]->mutable_gpu_data());
 					caffe_gpu_axpy(net_params[param_id]->count(),
-						local_decay,
-						temp_[param_id]->gpu_data(),
-						net_params[param_id]->mutable_gpu_diff());
+							local_decay,
+							temp_[param_id]->gpu_data(),
+							net_params[param_id]->mutable_gpu_diff());
 				} else {
 					LOG(FATAL) << "Unknown regularization type: " << regularization_type;
 				}
@@ -606,7 +608,7 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
 	const vector<float>& net_params_lr = this->net_->params_lr();
@@ -616,21 +618,21 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 	switch (Caffe::mode()) {
 		case Caffe::CPU: {
 			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-				net_params[param_id]->cpu_diff(), momentum,
-				history_[param_id]->mutable_cpu_data());
+					net_params[param_id]->cpu_diff(), momentum,
+					history_[param_id]->mutable_cpu_data());
 			caffe_copy(net_params[param_id]->count(),
-				history_[param_id]->cpu_data(),
-				net_params[param_id]->mutable_cpu_diff());
+					history_[param_id]->cpu_data(),
+					net_params[param_id]->mutable_cpu_diff());
 			break;
 		}
 		case Caffe::GPU: {
 #ifndef CPU_ONLY
 			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-				net_params[param_id]->gpu_diff(), momentum,
-				history_[param_id]->mutable_gpu_data());
+					net_params[param_id]->gpu_diff(), momentum,
+					history_[param_id]->mutable_gpu_data());
 			caffe_gpu_copy(net_params[param_id]->count(),
-				history_[param_id]->gpu_data(),
-				net_params[param_id]->mutable_gpu_diff());
+					history_[param_id]->gpu_data(),
+					net_params[param_id]->mutable_gpu_diff());
 #else
 			NO_GPU;
 #endif
@@ -641,7 +643,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SGDSolver<Dtype>::SnapshotSolverState(SolverState* state) {
 	state->clear_history();
 	for (int i = 0; i < history_.size(); ++i) {
@@ -651,17 +653,17 @@ void SGDSolver<Dtype>::SnapshotSolverState(SolverState* state) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void SGDSolver<Dtype>::RestoreSolverState(const SolverState& state) {
 	CHECK_EQ(state.history_size(), history_.size())
-		<< "Incorrect length of history blobs.";
+			<< "Incorrect length of history blobs.";
 	LOG(INFO) << "SGDSolver: restoring history";
 	for (int i = 0; i < history_.size(); ++i) {
 		history_[i]->FromProto(state.history(i));
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
 	const vector<float>& net_params_lr = this->net_->params_lr();
@@ -671,46 +673,46 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 		case Caffe::CPU: {
 			// save history momentum for stepping back
 			caffe_copy(net_params[param_id]->count(),
-				this->history_[param_id]->cpu_data(),
-				this->update_[param_id]->mutable_cpu_data());
+					this->history_[param_id]->cpu_data(),
+					this->update_[param_id]->mutable_cpu_data());
 
 			// update history
 			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-				net_params[param_id]->cpu_diff(), momentum,
-				this->history_[param_id]->mutable_cpu_data());
+					net_params[param_id]->cpu_diff(), momentum,
+					this->history_[param_id]->mutable_cpu_data());
 
 			// compute update: step back then over step
 			caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-				this->history_[param_id]->cpu_data(), -momentum,
-				this->update_[param_id]->mutable_cpu_data());
+					this->history_[param_id]->cpu_data(), -momentum,
+					this->update_[param_id]->mutable_cpu_data());
 
 			// copy
 			caffe_copy(net_params[param_id]->count(),
-				this->update_[param_id]->cpu_data(),
-				net_params[param_id]->mutable_cpu_diff());
+					this->update_[param_id]->cpu_data(),
+					net_params[param_id]->mutable_cpu_diff());
 			break;
 		}
 		case Caffe::GPU: {
 #ifndef CPU_ONLY
 			// save history momentum for stepping back
 			caffe_copy(net_params[param_id]->count(),
-				this->history_[param_id]->gpu_data(),
-				this->update_[param_id]->mutable_gpu_data());
+					this->history_[param_id]->gpu_data(),
+					this->update_[param_id]->mutable_gpu_data());
 
 			// update history
 			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-				net_params[param_id]->gpu_diff(), momentum,
-				this->history_[param_id]->mutable_gpu_data());
+					net_params[param_id]->gpu_diff(), momentum,
+					this->history_[param_id]->mutable_gpu_data());
 
 			// compute update: step back then over step
 			caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-				this->history_[param_id]->gpu_data(), -momentum,
-				this->update_[param_id]->mutable_gpu_data());
+					this->history_[param_id]->gpu_data(), -momentum,
+					this->update_[param_id]->mutable_gpu_data());
 
 			// copy
 			caffe_gpu_copy(net_params[param_id]->count(),
-				this->update_[param_id]->gpu_data(),
-				net_params[param_id]->mutable_gpu_diff());
+					this->update_[param_id]->gpu_data(),
+					net_params[param_id]->mutable_gpu_diff());
 #else
 			NO_GPU;
 #endif
@@ -721,7 +723,7 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
 	const vector<float>& net_params_lr = this->net_->params_lr();
@@ -731,64 +733,64 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 		case Caffe::CPU: {
 			// compute square of gradient in update
 			caffe_powx(net_params[param_id]->count(),
-				net_params[param_id]->cpu_diff(), Dtype(2),
-				this->update_[param_id]->mutable_cpu_data());
+					net_params[param_id]->cpu_diff(), Dtype(2),
+					this->update_[param_id]->mutable_cpu_data());
 
 			// update history
 			caffe_add(net_params[param_id]->count(),
-				this->update_[param_id]->cpu_data(),
-				this->history_[param_id]->cpu_data(),
-				this->history_[param_id]->mutable_cpu_data());
+					this->update_[param_id]->cpu_data(),
+					this->history_[param_id]->cpu_data(),
+					this->history_[param_id]->mutable_cpu_data());
 
 			// prepare update
 			caffe_powx(net_params[param_id]->count(),
-				this->history_[param_id]->cpu_data(), Dtype(0.5),
-				this->update_[param_id]->mutable_cpu_data());
+					this->history_[param_id]->cpu_data(), Dtype(0.5),
+					this->update_[param_id]->mutable_cpu_data());
 
 			caffe_add_scalar(net_params[param_id]->count(),
-				delta, this->update_[param_id]->mutable_cpu_data());
+					delta, this->update_[param_id]->mutable_cpu_data());
 
 			caffe_div(net_params[param_id]->count(),
-				net_params[param_id]->cpu_diff(),
-				this->update_[param_id]->cpu_data(),
-				this->update_[param_id]->mutable_cpu_data());
+					net_params[param_id]->cpu_diff(),
+					this->update_[param_id]->cpu_data(),
+					this->update_[param_id]->mutable_cpu_data());
 
 			// scale and copy
 			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-				this->update_[param_id]->cpu_data(), Dtype(0),
-				net_params[param_id]->mutable_cpu_diff());
+					this->update_[param_id]->cpu_data(), Dtype(0),
+					net_params[param_id]->mutable_cpu_diff());
 			break;
 		}
 		case Caffe::GPU: {
 #ifndef CPU_ONLY
 			// compute square of gradient in update
 			caffe_gpu_powx(net_params[param_id]->count(),
-				net_params[param_id]->gpu_diff(), Dtype(2),
-				this->update_[param_id]->mutable_gpu_data());
+					net_params[param_id]->gpu_diff(), Dtype(2),
+					this->update_[param_id]->mutable_gpu_data());
 
 			// update history
 			caffe_gpu_add(net_params[param_id]->count(),
-				this->update_[param_id]->gpu_data(),
-				this->history_[param_id]->gpu_data(),
-				this->history_[param_id]->mutable_gpu_data());
+					this->update_[param_id]->gpu_data(),
+					this->history_[param_id]->gpu_data(),
+					this->history_[param_id]->mutable_gpu_data());
 
 			// prepare update
 			caffe_gpu_powx(net_params[param_id]->count(),
-				this->history_[param_id]->gpu_data(), Dtype(0.5),
-				this->update_[param_id]->mutable_gpu_data());
+					this->history_[param_id]->gpu_data(), Dtype(0.5),
+					this->update_[param_id]->mutable_gpu_data());
 
 			caffe_gpu_add_scalar < Dtype > (net_params[param_id]->count(),
-				delta, this->update_[param_id]->mutable_gpu_data());
+					delta, this->update_[param_id]->mutable_gpu_data());
 
 			caffe_gpu_div(net_params[param_id]->count(),
-				net_params[param_id]->gpu_diff(),
-				this->update_[param_id]->gpu_data(),
-				this->update_[param_id]->mutable_gpu_data());
+					net_params[param_id]->gpu_diff(),
+					this->update_[param_id]->gpu_data(),
+					this->update_[param_id]->mutable_gpu_data());
 
 			// scale and copy
 			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-				this->update_[param_id]->gpu_data(), Dtype(0),
-				net_params[param_id]->mutable_gpu_diff());
+					this->update_[param_id]->gpu_data(), Dtype(0),
+					net_params[param_id]->mutable_gpu_diff());
 #else
 			NO_GPU;
 #endif
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 94d62e0e..67f5984b 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -38,8 +38,8 @@ namespace caffe {
 SyncedMemory::~SyncedMemory() {
 	if (cpu_ptr_ && own_cpu_data_) {
 		OCL_CHECK(
-			clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
-				cpu_ptr_, 0, NULL, NULL));
+				clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+						cpu_ptr_, 0, NULL, NULL));
 		clFinish(amdDevice.CommandQueue);
 	}
 	if (gpu_cache_ptr_ && own_cpu_data_) {
@@ -62,11 +62,12 @@ inline void SyncedMemory::to_cpu() {
 	switch (head_) {
 		case UNINITIALIZED:
 			gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
-				size_, NULL, NULL);
+					size_, NULL, NULL);
 			//}
 			cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
-				(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_,
-				0, NULL, NULL, NULL);
+					(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+					size_,
+					0, NULL, NULL, NULL);
 			memset(cpu_ptr_, 0, size_);
 			head_ = HEAD_AT_CPU;
 			own_cpu_data_ = true;
@@ -75,15 +76,15 @@ inline void SyncedMemory::to_cpu() {
 #ifndef CPU_ONLY
 			if (cpu_ptr_ == NULL) {
 				gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context,
-					CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
+						CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
 				cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
-					(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
-					size_, 0, NULL, NULL, NULL);
+						(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+						size_, 0, NULL, NULL, NULL);
 				own_cpu_data_ = true;
 			}
 			OCL_CHECK(
-				clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_,
-					(cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
+					clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_,
+							(cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
 			clFinish(amdDevice.CommandQueue);
 			head_ = SYNCED;
 #else
@@ -105,7 +106,7 @@ inline void SyncedMemory::to_gpu() {
 	switch (head_) {
 		case UNINITIALIZED: {
 			cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-				size_, NULL, NULL);
+					size_, NULL, NULL);
 			if (NULL == tmpMem) {
 				fprintf(stderr, "Failed to create memory object\n");
 				break;
@@ -118,15 +119,15 @@ inline void SyncedMemory::to_gpu() {
 		case HEAD_AT_CPU: {
 			if (gpu_ptr_ == NULL) {
 				cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-					size_, NULL, NULL);
+						size_, NULL, NULL);
 				if (NULL == tmpMem) {
 					fprintf(stderr, "Failed to create memory object\n");
 				}
 				gpu_ptr_ = (void*) tmpMem;
 			}
 			OCL_CHECK(
-				clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
-					(cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
+					clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+							(cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
 			clFinish(amdDevice.CommandQueue);
 			head_ = SYNCED;
 #ifdef Track_data_transfer
diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp
index 7d0a85aa..4c0ce04e 100644
--- a/src/caffe/util/benchmark.cpp
+++ b/src/caffe/util/benchmark.cpp
@@ -6,9 +6,10 @@
 namespace caffe {
 
 Timer::Timer()
-	: initted_(false),
-		running_(false),
-		has_run_at_least_once_(false) {
+:
+		initted_(false),
+				running_(false),
+				has_run_at_least_once_(false) {
 	Init();
 }
 
@@ -98,7 +99,7 @@ float CPUTimer::MilliSeconds() {
 		Stop();
 	}
 	this->elapsed_milliseconds_ = (this->stop_cpu_ -
-		this->start_cpu_).total_milliseconds();
+			this->start_cpu_).total_milliseconds();
 	return this->elapsed_milliseconds_;
 }
 
@@ -111,7 +112,7 @@ float CPUTimer::MicroSeconds() {
 		Stop();
 	}
 	this->elapsed_microseconds_ = (this->stop_cpu_ -
-		this->start_cpu_).total_microseconds();
+			this->start_cpu_).total_microseconds();
 	return this->elapsed_microseconds_;
 }
 
diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp
index aec747af..d8adce8a 100644
--- a/src/caffe/util/db_leveldb.cpp
+++ b/src/caffe/util/db_leveldb.cpp
@@ -14,7 +14,7 @@ void LevelDB::Open(const string& source, Mode mode) {
 	options.create_if_missing = mode != READ;
 	leveldb::Status status = leveldb::DB::Open(options, source, &db_);
 	CHECK(status.ok()) << "Failed to open leveldb " << source
-		<< std::endl << status.ToString();
+			<< std::endl << status.ToString();
 	LOG(INFO) << "Opened leveldb " << source;
 }
 
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 69cc47bc..886ac85b 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -34,14 +34,14 @@
 
 namespace caffe {
 
-template<typename dtype> extern std::string get_dtype_suffix();
+template <typename dtype> extern std::string get_dtype_suffix();
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	Dtype* data_col) {
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		Dtype* data_col) {
 	int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
 	int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
 	int channels_col = channels * kernel_h * kernel_w;
@@ -55,7 +55,7 @@ void im2col_cpu(const Dtype* data_im, const int channels,
 				int w_pad = w * stride_w - pad_w + w_offset;
 				if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
 					data_col[(c * height_col + h) * width_col + w] =
-						data_im[(c_im * height + h_pad) * width + w_pad];
+							data_im[(c_im * height + h_pad) * width + w_pad];
 				else
 					data_col[(c * height_col + h) * width_col + w] = 0;
 			}
@@ -64,20 +64,20 @@ void im2col_cpu(const Dtype* data_im, const int channels,
 }
 
 template void im2col_cpu<float>(const float* data_im, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, float* data_col);
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, float* data_col);
 template void im2col_cpu<double>(const double* data_im, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, double* data_col);
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, double* data_col);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
-	const int height, const int width, const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	Dtype* data_im) {
+		const int height, const int width, const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		Dtype* data_im) {
 	caffe_set(height * width * channels, Dtype(0), data_im);
 	int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
 	int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
@@ -92,26 +92,26 @@ void col2im_cpu(const Dtype* data_col, const int channels,
 				int w_pad = w * stride_w - pad_w + w_offset;
 				if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
 					data_im[(c_im * height + h_pad) * width + w_pad] +=
-						data_col[(c * height_col + h) * width_col + w];
+							data_col[(c * height_col + h) * width_col + w];
 			}
 		}
 	}
 }
 
 template void col2im_cpu<float>(const float* data_col, const int channels,
-	const int height, const int width, const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, float* data_im);
+		const int height, const int width, const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, float* data_im);
 template void col2im_cpu<double>(const double* data_col, const int channels,
-	const int height, const int width, const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, double* data_im);
+		const int height, const int width, const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, double* data_im);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_im, const int img_offset, int optnum) {
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_im, const int img_offset, int optnum) {
 	std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	int height_col = (height + 2 * pad - ksize) / stride + 1;
@@ -138,26 +138,26 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
 	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void col2im_gpu_opt<float>(const float* data_col, const int col_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, float* data_im, const int img_offset, int optnum);
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, float* data_im, const int img_offset, int optnum);
 template void col2im_gpu_opt<double>(const double* data_col,
-	const int col_offset, const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, double* data_im, const int img_offset, int optnum);
+		const int col_offset, const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, double* data_im, const int img_offset, int optnum);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	Dtype* data_col, const int col_offset)
-	{
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		Dtype* data_col, const int col_offset)
+		{
 	std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -186,30 +186,30 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
 	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 
 }
 
 template void im2col_gpu<float>(const float* data_im, const int img_offset,
-	const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-	float* data_col, const int col_offset);
+		const int channels,
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+		float* data_col, const int col_offset);
 template void im2col_gpu<double>(const double* data_im, const int img_offset,
-	const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-	double* data_col, const int col_offset);
+		const int channels,
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+		double* data_col, const int col_offset);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset,
-	const int height, const int width, const int channels,
-	const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	Dtype* data_im, const int img_offset)
-	{
+		const int height, const int width, const int channels,
+		const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		Dtype* data_im, const int img_offset)
+		{
 	std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -238,25 +238,26 @@ void col2im_gpu(const Dtype* data_col, const int col_offset,
 	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void col2im_gpu<float>(const float* data_col, const int col_offset,
-	const int height, const int width, const int channels,
-	const int patch_h, const int patch_w, const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w, float* data_im, const int img_offset);
+		const int height, const int width, const int channels,
+		const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w, float* data_im,
+		const int img_offset);
 template void col2im_gpu<double>(const double* data_col, const int col_offset,
-	const int height, const int width, const int channels,
-	const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-	double* data_im, const int img_offset);
+		const int height, const int width, const int channels,
+		const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+		double* data_im, const int img_offset);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_col, const int col_offset) {
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_col, const int col_offset) {
 
 	int height_col = (height + 2 * pad - ksize) / stride + 1;
 	int width_col = (width + 2 * pad - ksize) / stride + 1;
@@ -279,25 +280,25 @@ void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
 	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 	clFinish(amdDevice.CommandQueue);
 }
 
 template void im2col_gpu<float>(cl_kernel Kernel, const float* data_im,
-	const int img_offset, const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, float* data_col, const int col_offset);
+		const int img_offset, const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, float* data_col, const int col_offset);
 template void im2col_gpu<double>(cl_kernel Kernel, const double* data_im,
-	const int img_offset, const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, double* data_col, const int col_offset);
+		const int img_offset, const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, double* data_col, const int col_offset);
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_col, const int col_offset, int optnum) {
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_col, const int col_offset, int optnum) {
 
 	std::string kernel_name = "im2col_opt" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
@@ -326,23 +327,23 @@ void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
 	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
 	size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void im2col_gpu_opt<float>(const float* data_im, const int img_offset,
-	const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, float* data_col, const int col_offset, int optnum);
+		const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, float* data_col, const int col_offset, int optnum);
 template void im2col_gpu_opt<double>(const double* data_im,
-	const int img_offset, const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, double* data_col, const int col_offset, int optnum);
+		const int img_offset, const int channels,
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, double* data_col, const int col_offset, int optnum);
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
-	const int height, const int width, const int ksize, const int pad,
-	const int stride, Dtype* data_im, const int img_offset) {
+		const int height, const int width, const int ksize, const int pad,
+		const int stride, Dtype* data_im, const int img_offset) {
 	std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -369,17 +370,17 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
 	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void col2im_gpu<float>(const float* data_col, const int col_offset,
-	const int channels,
-	const int height, const int width, const int psize, const int pad,
-	const int stride, float* data_im, const int img_offset);
+		const int channels,
+		const int height, const int width, const int psize, const int pad,
+		const int stride, float* data_im, const int img_offset);
 template void col2im_gpu<double>(const double* data_col, const int col_offset,
-	const int channels,
-	const int height, const int width, const int psize, const int pad,
-	const int stride, double* data_im, const int img_offset);
+		const int channels,
+		const int height, const int width, const int psize, const int pad,
+		const int stride, double* data_im, const int img_offset);
 
 }  // namespace caffe
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index be0ce3b4..6435427e 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -10,11 +10,11 @@ namespace caffe {
 
 template <typename Dtype>
 __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	const int height_col, const int width_col,
-	Dtype* data_col) {
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		const int height_col, const int width_col,
+		Dtype* data_col) {
 	CUDA_KERNEL_LOOP(index, n) {
 		int w_out = index % width_col;
 		int h_index = index / width_col;
@@ -39,12 +39,12 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	Dtype* data_col) {
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		Dtype* data_col) {
 	// We are going to launch channels * height_col * width_col kernels, each
 	// kernel responsible for copying a single-channel grid.
 	int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
@@ -53,30 +53,30 @@ void im2col_gpu(const Dtype* data_im, const int channels,
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
 	CAFFE_CUDA_NUM_THREADS>>>(
-		num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
-		pad_w, stride_h, stride_w, height_col,
-		width_col, data_col);
+			num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
+			pad_w, stride_h, stride_w, height_col,
+			width_col, data_col);
 	CUDA_POST_KERNEL_CHECK;
 }
 
 // Explicit instantiation
 template void im2col_gpu<float>(const float* data_im, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-	float* data_col);
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+		float* data_col);
 template void im2col_gpu<double>(const double* data_im, const int channels,
-	const int height, const int width, const int kernel_h, const int kernel_w,
-	const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-	double* data_col);
+		const int height, const int width, const int kernel_h, const int kernel_w,
+		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+		double* data_col);
 
 template <typename Dtype>
 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
-	const int height, const int width, const int channels,
-	const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w,
-	const int stride_h, const int stride_w,
-	const int height_col, const int width_col,
-	Dtype* data_im) {
+		const int height, const int width, const int channels,
+		const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w,
+		const int stride_h, const int stride_w,
+		const int height_col, const int width_col,
+		Dtype* data_im) {
 	CUDA_KERNEL_LOOP(index, n) {
 		Dtype val = 0;
 		int w = index % width + pad_w;
@@ -101,11 +101,11 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
-	const int height, const int width, const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, Dtype* data_im) {
+		const int height, const int width, const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, Dtype* data_im) {
 	int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
 	int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
 	int num_kernels = channels * height * width;
@@ -114,20 +114,20 @@ void col2im_gpu(const Dtype* data_col, const int channels,
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
 	CAFFE_CUDA_NUM_THREADS>>>(
-		num_kernels, data_col, height, width, channels, patch_h, patch_w,
-		pad_h, pad_w, stride_h, stride_w,
-		height_col, width_col, data_im);
+			num_kernels, data_col, height, width, channels, patch_h, patch_w,
+			pad_h, pad_w, stride_h, stride_w,
+			height_col, width_col, data_im);
 	CUDA_POST_KERNEL_CHECK;
 }
 
 // Explicit instantiation
 template void col2im_gpu<float>(const float* data_col, const int channels,
-	const int height, const int width, const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, float* data_im);
+		const int height, const int width, const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, float* data_im);
 template void col2im_gpu<double>(const double* data_col, const int channels,
-	const int height, const int width, const int patch_h, const int patch_w,
-	const int pad_h, const int pad_w, const int stride_h,
-	const int stride_w, double* data_im);
+		const int height, const int width, const int patch_h, const int patch_w,
+		const int pad_h, const int pad_w, const int stride_h,
+		const int stride_w, double* data_im);
 
 }  // namespace caffe
diff --git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp
index 2fbad3a9..299d1fd0 100644
--- a/src/caffe/util/insert_splits.cpp
+++ b/src/caffe/util/insert_splits.cpp
@@ -31,7 +31,7 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) {
 		for (int j = 0; j < layer_param.bottom_size(); ++j) {
 			const string& blob_name = layer_param.bottom(j);
 			if (blob_name_to_last_top_idx.find(blob_name) ==
-				blob_name_to_last_top_idx.end()) {
+					blob_name_to_last_top_idx.end()) {
 				LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
 			}
 			const pair<int, int>& bottom_idx = make_pair(i, j);
@@ -46,7 +46,7 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) {
 		// A use of a top blob as a loss should be handled similarly to the use of
 		// a top blob as an input (bottom) blob to another layer.
 		const int last_loss =
-			std::min(layer_param.loss_weight_size(), layer_param.top_size());
+				std::min(layer_param.loss_weight_size(), layer_param.top_size());
 		for (int j = 0; j < last_loss; ++j) {
 			const string& blob_name = layer_param.top(j);
 			const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
@@ -66,7 +66,7 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) {
 			LayerParameter* split_layer_param = param_split->add_layer();
 			const float kZeroLossWeight = 0;
 			ConfigureSplitLayer(layer_name, blob_name, i, split_count,
-				kZeroLossWeight, split_layer_param);
+					kZeroLossWeight, split_layer_param);
 		}
 	}
 	for (int i = 0; i < param.layer_size(); ++i) {
@@ -75,13 +75,13 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) {
 		// Replace any shared bottom blobs with split layer outputs.
 		for (int j = 0; j < layer_param->bottom_size(); ++j) {
 			const pair<int, int>& top_idx =
-				bottom_idx_to_source_top_idx[make_pair(i, j)];
+					bottom_idx_to_source_top_idx[make_pair(i, j)];
 			const int split_count = top_idx_to_bottom_count[top_idx];
 			if (split_count > 1) {
 				const string& layer_name = layer_idx_to_layer_name[top_idx.first];
 				const string& blob_name = layer_param->bottom(j);
 				layer_param->set_bottom(j, SplitBlobName(layer_name,
-					blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++));
+						blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++));
 			}
 		}
 		// Create split layer for any top blobs used by other layer as bottom
@@ -95,7 +95,7 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) {
 				LayerParameter* split_layer_param = param_split->add_layer();
 				const float loss_weight = top_idx_to_loss_weight[top_idx];
 				ConfigureSplitLayer(layer_name, blob_name, j, split_count,
-					loss_weight, split_layer_param);
+						loss_weight, split_layer_param);
 				if (loss_weight) {
 					layer_param->clear_loss_weight();
 					top_idx_to_bottom_split_idx[top_idx]++;
@@ -106,15 +106,15 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) {
 }
 
 void ConfigureSplitLayer(const string& layer_name, const string& blob_name,
-	const int blob_idx, const int split_count, const float loss_weight,
-	LayerParameter* split_layer_param) {
+		const int blob_idx, const int split_count, const float loss_weight,
+		LayerParameter* split_layer_param) {
 	split_layer_param->Clear();
 	split_layer_param->add_bottom(blob_name);
 	split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx));
 	split_layer_param->set_type("Split");
 	for (int k = 0; k < split_count; ++k) {
 		split_layer_param->add_top(
-			SplitBlobName(layer_name, blob_name, blob_idx, k));
+				SplitBlobName(layer_name, blob_name, blob_idx, k));
 		if (loss_weight) {
 			if (k == 0) {
 				split_layer_param->add_loss_weight(loss_weight);
@@ -126,18 +126,18 @@ void ConfigureSplitLayer(const string& layer_name, const string& blob_name,
 }
 
 string SplitLayerName(const string& layer_name, const string& blob_name,
-	const int blob_idx) {
+		const int blob_idx) {
 	ostringstream split_layer_name;
 	split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx
-		<< "_split";
+			<< "_split";
 	return split_layer_name.str();
 }
 
 string SplitBlobName(const string& layer_name, const string& blob_name,
-	const int blob_idx, const int split_idx) {
+		const int blob_idx, const int split_idx) {
 	ostringstream split_blob_name;
 	split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx
-		<< "_split_" << split_idx;
+			<< "_split_" << split_idx;
 	return split_blob_name.str();
 }
 
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index c3be8a76..63dcf312 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -68,7 +68,7 @@ void WriteProtoToBinaryFile(const Message& proto, const char* filename) {
 }
 
 cv::Mat ReadImageToCVMat(const string& filename,
-	const int height, const int width, const bool is_color) {
+		const int height, const int width, const bool is_color) {
 	cv::Mat cv_img;
 	int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
 																	CV_LOAD_IMAGE_GRAYSCALE);
@@ -86,12 +86,12 @@ cv::Mat ReadImageToCVMat(const string& filename,
 }
 
 cv::Mat ReadImageToCVMat(const string& filename,
-	const int height, const int width) {
+		const int height, const int width) {
 	return ReadImageToCVMat(filename, height, width, true);
 }
 
 cv::Mat ReadImageToCVMat(const string& filename,
-	const bool is_color) {
+		const bool is_color) {
 	return ReadImageToCVMat(filename, 0, 0, is_color);
 }
 
@@ -100,7 +100,7 @@ cv::Mat ReadImageToCVMat(const string& filename) {
 }
 // Do the file extension and encoding match?
 static bool matchExt(const std::string & fn,
-	std::string en) {
+		std::string en) {
 	size_t p = fn.rfind('.');
 	std::string ext = p != fn.npos ? fn.substr(p) : fn;
 	std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
@@ -112,18 +112,18 @@ static bool matchExt(const std::string & fn,
 	return false;
 }
 bool ReadImageToDatum(const string& filename, const int label,
-	const int height, const int width, const bool is_color,
-	const std::string & encoding, Datum* datum) {
+		const int height, const int width, const bool is_color,
+		const std::string & encoding, Datum* datum) {
 	cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color);
 	if (cv_img.data) {
 		if (encoding.size()) {
 			if ((cv_img.channels() == 3) == is_color && !height && !width &&
-				matchExt(filename, encoding))
+					matchExt(filename, encoding))
 				return ReadFileToDatum(filename, label, datum);
 			std::vector < uchar > buf;
 			cv::imencode("." + encoding, cv_img, buf);
 			datum->set_data(std::string(reinterpret_cast<char*>(&buf[0]),
-				buf.size()));
+					buf.size()));
 			datum->set_label(label);
 			datum->set_encoded(true);
 			return true;
@@ -137,7 +137,7 @@ bool ReadImageToDatum(const string& filename, const int label,
 }
 
 bool ReadFileToDatum(const string& filename, const int label,
-	Datum* datum) {
+		Datum* datum) {
 	std::streampos size;
 
 	fstream file(filename.c_str(), ios::in | ios::binary | ios::ate);
@@ -229,13 +229,13 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) {
 }
 
 // Verifies format of data stored in HDF5 file and reshapes blob accordingly.
-template<typename Dtype>
+template <typename Dtype>
 void hdf5_load_nd_dataset_helper(
-	hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-	Blob<Dtype>* blob) {
+		hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+		Blob<Dtype>* blob) {
 	// Verify that the dataset exists.
 	CHECK(H5LTfind_dataset(file_id, dataset_name_))
-		<< "Failed to find HDF5 dataset " << dataset_name_;
+			<< "Failed to find HDF5 dataset " << dataset_name_;
 	// Verify that the number of dimensions is in the accepted range.
 	herr_t status;
 	int ndims;
@@ -248,7 +248,7 @@ void hdf5_load_nd_dataset_helper(
 	std::vector < hsize_t > dims(ndims);
 	H5T_class_t class_;
 	status = H5LTget_dataset_info(
-		file_id, dataset_name_, dims.data(), &class_, NULL);
+			file_id, dataset_name_, dims.data(), &class_, NULL);
 	CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
 	CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data";
 
@@ -259,47 +259,47 @@ void hdf5_load_nd_dataset_helper(
 	blob->Reshape(blob_dims);
 }
 
-template<>
+template <>
 void hdf5_load_nd_dataset<float>(hid_t file_id, const char* dataset_name_,
-	int min_dim, int max_dim, Blob<float>* blob) {
+		int min_dim, int max_dim, Blob<float>* blob) {
 	hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
 	herr_t status = H5LTread_dataset_float(
-		file_id, dataset_name_, blob->mutable_cpu_data());
+			file_id, dataset_name_, blob->mutable_cpu_data());
 	CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_;
 }
 
-template<>
+template <>
 void hdf5_load_nd_dataset<double>(hid_t file_id, const char* dataset_name_,
-	int min_dim, int max_dim, Blob<double>* blob) {
+		int min_dim, int max_dim, Blob<double>* blob) {
 	hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
 	herr_t status = H5LTread_dataset_double(
-		file_id, dataset_name_, blob->mutable_cpu_data());
+			file_id, dataset_name_, blob->mutable_cpu_data());
 	CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_;
 }
 
-template<>
+template <>
 void hdf5_save_nd_dataset<float>(
-	const hid_t file_id, const string& dataset_name, const Blob<float>& blob) {
+		const hid_t file_id, const string& dataset_name, const Blob<float>& blob) {
 	hsize_t dims[HDF5_NUM_DIMS];
 	dims[0] = blob.num();
 	dims[1] = blob.channels();
 	dims[2] = blob.height();
 	dims[3] = blob.width();
 	herr_t status = H5LTmake_dataset_float(
-		file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
+			file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
 	CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
 }
 
-template<>
+template <>
 void hdf5_save_nd_dataset<double>(
-	const hid_t file_id, const string& dataset_name, const Blob<double>& blob) {
+		const hid_t file_id, const string& dataset_name, const Blob<double>& blob) {
 	hsize_t dims[HDF5_NUM_DIMS];
 	dims[0] = blob.num();
 	dims[1] = blob.channels();
 	dims[2] = blob.height();
 	dims[3] = blob.width();
 	herr_t status = H5LTmake_dataset_double(
-		file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
+			file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
 	CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
 }
 
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 61162be6..4d2c9de6 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -40,265 +40,271 @@ static const clblasOrder order = clblasColumnMajor;
 
 namespace caffe {
 
-template<>
+template <>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const float alpha, const float* A, const float* B, const float beta,
-	float* C) {
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const float alpha, const float* A, const float* B, const float beta,
+		float* C) {
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
-		ldb, beta, C, N);
+			ldb, beta, C, N);
 }
 
-template<>
+template <>
 void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const double alpha, const double* A, const double* B, const double beta,
-	double* C) {
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const double alpha, const double* A, const double* B, const double beta,
+		double* C) {
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
-		ldb, beta, C, N);
+			ldb, beta, C, N);
 }
 
-template<>
+template <>
 void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const float alpha, const float* A, const float* B, const float beta,
-	float* C) {
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const float alpha, const float* A, const float* B, const float beta,
+		float* C) {
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	clblasTranspose transB =
-		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	int ldc = N;
 	//AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
 	CLBLAS_CHECK(
-		clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-			(cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, 0,
-			ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+			clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+					(cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C,
+					0,
+					ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const double alpha, const double* A, const double* B, const double beta,
-	double* C) {
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const double alpha, const double* A, const double* B, const double beta,
+		double* C) {
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	clblasTranspose transB =
-		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	int ldc = N;
 	CLBLAS_CHECK(
-		clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-			(cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, 0,
-			ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+			clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+					(cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C,
+					0,
+					ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template<>
+template <>
 cl_event caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const float alpha, const float* A, const int offA, const float* B,
-	const int offB, const float beta, float* C, const int offC) {
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const float alpha, const float* A, const int offA, const float* B,
+		const int offB, const float beta, float* C, const int offC) {
 	cl_event event;
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	clblasTranspose transB =
-		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	int ldc = N;
 	CLBLAS_CHECK(
-		clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-			(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C,
-			offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event));
+			clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+					(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+					(cl_mem) C,
+					offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event));
 	return event;
 }
 
-template<>
+template <>
 cl_event caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const double alpha, const double* A, const int offA, const double* B,
-	const int offB, const double beta, double* C, const int offC) {
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const double alpha, const double* A, const int offA, const double* B,
+		const int offB, const double beta, double* C, const int offC) {
 	cl_event event;
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	clblasTranspose transB =
-		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	int ldc = N;
 	CLBLAS_CHECK(
-		clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-			(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C,
-			offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event));
+			clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+					(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+					(cl_mem) C,
+					offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event));
 	return event;
 }
 
-template<>
+template <>
 cl_event caffe_gpu_gemm<float>(cl_command_queue *queue,
-	const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const float alpha, const float* A, const int offA, const float* B,
-	const int offB, const float beta, float* C, const int offC) {
+		const CBLAS_TRANSPOSE TransA,
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const float alpha, const float* A, const int offA, const float* B,
+		const int offB, const float beta, float* C, const int offC) {
 	cl_event event;
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	clblasTranspose transB =
-		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	int ldc = N;
 	CLBLAS_CHECK(
-		clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-			(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C,
-			offC, ldc, 1, queue, 0, NULL, &event));
+			clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+					(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+					(cl_mem) C,
+					offC, ldc, 1, queue, 0, NULL, &event));
 	return event;
 }
 
-template<>
+template <>
 cl_event caffe_gpu_gemm<double>(cl_command_queue *queue,
-	const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const double alpha, const double* A, const int offA, const double* B,
-	const int offB, const double beta, double* C, const int offC) {
+		const CBLAS_TRANSPOSE TransA,
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const double alpha, const double* A, const int offA, const double* B,
+		const int offB, const double beta, double* C, const int offC) {
 	cl_event event;
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	clblasTranspose transB =
-		(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	int ldc = N;
 	CLBLAS_CHECK(
-		clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-			(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, (cl_mem) C,
-			offC, ldc, 1, queue, 0, NULL, &event));
+			clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+					(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+					(cl_mem) C,
+					offC, ldc, 1, queue, 0, NULL, &event));
 	return event;
 }
 
-template<>
+template <>
 void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const float alpha, const float* A, const float* x,
-	const float beta, float* y) {
+		const int N, const float alpha, const float* A, const float* x,
+		const float beta, float* y) {
 	cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
-template<>
+template <>
 void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const double alpha, const double* A, const double* x,
-	const double beta, double* y) {
+		const int N, const double alpha, const double* A, const double* x,
+		const double beta, double* y) {
 	cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
-template<>
+template <>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const float alpha, const float* A, size_t offA, int lda,
-	const float* x, size_t offx, const float beta, int incx,
-	float* y, size_t offy, int incy) {
+		const int N, const float alpha, const float* A, size_t offA, int lda,
+		const float* x, size_t offx, const float beta, int incx,
+		float* y, size_t offy, int incy) {
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA,
-		M, N, (cl_float) alpha, (cl_mem) A, offA, lda,
-		(cl_mem) x, offx, incx, (cl_float) beta,
-		(cl_mem) y, offy, incy,
-		1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+			M, N, (cl_float) alpha, (cl_mem) A, offA, lda,
+			(cl_mem) x, offx, incx, (cl_float) beta,
+			(cl_mem) y, offy, incy,
+			1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const double alpha, const double* A, size_t offA, int lda,
-	const double* x, size_t offx, const double beta, int incx,
-	double* y, size_t offy, int incy) {
+		const int N, const double alpha, const double* A, size_t offA, int lda,
+		const double* x, size_t offx, const double beta, int incx,
+		double* y, size_t offy, int incy) {
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	CLBLAS_CHECK(
-		clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A,
-			offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy,
-			incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+			clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A,
+					offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy,
+					incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 
 }
 
-template<>
+template <>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const float alpha, const float* A, const float* x,
-	const float beta, float* y) {
+		const int N, const float alpha, const float* A, const float* x,
+		const float beta, float* y) {
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA,
-		M, N, (cl_float) alpha, (cl_mem) A, 0, N,
-		(cl_mem) x, 0, 1, (cl_float) beta,
-		(cl_mem) y, 0, 1,
-		1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+			M, N, (cl_float) alpha, (cl_mem) A, 0, N,
+			(cl_mem) x, 0, 1, (cl_float) beta,
+			(cl_mem) y, 0, 1,
+			1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const double alpha, const double* A, const double* x,
-	const double beta, double* y) {
+		const int N, const double alpha, const double* A, const double* x,
+		const double beta, double* y) {
 	clblasTranspose transA =
-		(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
 	CLBLAS_CHECK(
-		clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0,
-			N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1,
-			&(amdDevice.CommandQueue), 0, NULL, NULL));
+			clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0,
+					N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1,
+					&(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_axpy<float>(const int N, const float alpha, const float* X,
-	float* Y) {
+		float* Y) {
 	cblas_saxpy(N, alpha, X, 1, Y, 1);
 }
 
-template<>
+template <>
 void caffe_axpy<double>(const int N, const double alpha, const double* X,
-	double* Y) {
+		double* Y) {
 	cblas_daxpy(N, alpha, X, 1, Y, 1);
 }
 
-template<>
+template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
-	float* Y) {
+		float* Y) {
 	CLBLAS_CHECK(
-		clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-			&(amdDevice.CommandQueue), 0, NULL, NULL));
+			clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+					&(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
-	double* Y) {
+		double* Y) {
 	CLBLAS_CHECK(
-		clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-			&(amdDevice.CommandQueue), 0, NULL, NULL));
+			clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+					&(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y)
-	{
+		{
 }
 
-template<>
+template <>
 void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y)
-	{
+		{
 }
 
-template<>
+template <>
 void caffe_gpu_abs<float>(const int n, const float* x, float* y)
-	{
+		{
 	caffe_gpu_abs_ocl(n, x, y);
 }
 
-template<>
+template <>
 void caffe_gpu_abs<double>(const int n, const double* x, double* y)
-	{
+		{
 	caffe_gpu_abs_ocl(n, x, y);
 }
 
-template<>
+template <>
 void caffe_set(const int N, const float alpha, float* Y) {
 	if (alpha == 0) {
 		memset(Y, 0, sizeof(float) * N);
@@ -309,7 +315,7 @@ void caffe_set(const int N, const float alpha, float* Y) {
 	}
 }
 
-template<>
+template <>
 void caffe_set(const int N, const double alpha, double* Y) {
 	if (alpha == 0) {
 		memset(Y, 0, sizeof(double) * N);
@@ -320,35 +326,35 @@ void caffe_set(const int N, const double alpha, double* Y) {
 	}
 }
 
-template<>
+template <>
 void caffe_add_scalar(const int N, const float alpha, float* Y) {
 	for (int i = 0; i < N; ++i) {
 		Y[i] += alpha;
 	}
 }
 
-template<>
+template <>
 void caffe_add_scalar(const int N, const double alpha, double* Y) {
 	for (int i = 0; i < N; ++i) {
 		Y[i] += alpha;
 	}
 }
 
-template<>
+template <>
 void caffe_copy<float>(const int N, const float* X, float* Y) {
 	cblas_scopy(N, X, 1, Y, 1);
 }
 
-template<>
+template <>
 void caffe_copy<double>(const int N, const double* X, double* Y) {
 	cblas_dcopy(N, X, 1, Y, 1);
 }
 
 //template <typename Dtype>
 void caffe_gpu_memcpy(const size_t N, const void *X, void *Y)
-	{
+		{
 	clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0,
-		NULL, NULL);
+			NULL, NULL);
 // OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
 }
 /*
@@ -357,166 +363,168 @@ void caffe_gpu_memcpy(const size_t N, const void *X, void *Y)
  template void caffe_gpu_memcpy<int>(const size_t N, const int* X, int* Y);
  template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
  */
-template<>
+template <>
 void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y)
-	{
+		{
 	OCL_CHECK(
-		clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, N,
-			0, NULL, NULL));
+			clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
+					N,
+					0, NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y)
-	{
+		{
 	OCL_CHECK(
-		clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, N,
-			0, NULL, NULL));
+			clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
+					N,
+					0, NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
 	if (X != Y) {
 		CLBLAS_CHECK(
-			clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-				&(amdDevice.CommandQueue), 0, NULL, NULL));
+				clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+						&(amdDevice.CommandQueue), 0, NULL, NULL));
 	}
 }
 
-template<>
+template <>
 void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
 	if (X != Y) {
 		CLBLAS_CHECK(
-			clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-				&(amdDevice.CommandQueue), 0, NULL, NULL));
+				clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+						&(amdDevice.CommandQueue), 0, NULL, NULL));
 	}
 }
 
-template<>
+template <>
 void caffe_scal<float>(const int N, const float alpha, float *X) {
 	cblas_sscal(N, alpha, X, 1);
 }
 
-template<>
+template <>
 void caffe_scal<double>(const int N, const double alpha, double *X) {
 	cblas_dscal(N, alpha, X, 1);
 }
 
-template<>
+template <>
 void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
 	CLBLAS_CHECK(
-		clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
-			NULL, NULL));
+			clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+					NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
 	CLBLAS_CHECK(
-		clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
-			NULL, NULL));
+			clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+					NULL, NULL));
 }
 
-template<>
+template <>
 void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
-	const float beta, float* Y) {
+		const float beta, float* Y) {
 	caffe_gpu_scal<float>(N, beta, Y);
 	caffe_gpu_axpy<float>(N, alpha, X, Y);
 }
 
-template<>
+template <>
 void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
-	const double beta, double* Y) {
+		const double beta, double* Y) {
 	caffe_gpu_scal<double>(N, beta, Y);
 	caffe_gpu_axpy<double>(N, alpha, X, Y);
 }
 
-template<>
+template <>
 void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
-	const float beta, float* Y) {
+		const float beta, float* Y) {
 	cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
-template<>
+template <>
 void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
-	const double beta, double* Y) {
+		const double beta, double* Y) {
 	cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
-template<>
+template <>
 void caffe_add<float>(const int n, const float* a, const float* b,
-	float* y) {
+		float* y) {
 	vsAdd(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
-	double* y) {
+		double* y) {
 	vdAdd(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_sub<float>(const int n, const float* a, const float* b,
-	float* y) {
+		float* y) {
 	vsSub(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
-	double* y) {
+		double* y) {
 	vdSub(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_mul<float>(const int n, const float* a, const float* b,
-	float* y) {
+		float* y) {
 	vsMul(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
-	double* y) {
+		double* y) {
 	vdMul(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_div<float>(const int n, const float* a, const float* b,
-	float* y) {
+		float* y) {
 	vsDiv(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
-	double* y) {
+		double* y) {
 	vdDiv(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_powx<float>(const int n, const float* a, const float b,
-	float* y) {
+		float* y) {
 	vsPowx(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
-	double* y) {
+		double* y) {
 	vdPowx(n, a, b, y);
 }
 
-template<>
+template <>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
 	vsSqr(n, a, y);
 }
 
-template<>
+template <>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
 	vdSqr(n, a, y);
 }
 
-template<>
+template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
 	vsExp(n, a, y);
 }
 
-template<>
+template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
 	vdExp(n, a, y);
 }
@@ -525,10 +533,10 @@ unsigned int caffe_rng_rand() {
 	return (*caffe_rng())();
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype caffe_nextafter(const Dtype b) {
 	return boost::math::nextafter < Dtype > (
-		b, std::numeric_limits < Dtype > ::max());
+			b, std::numeric_limits < Dtype > ::max());
 }
 
 template
@@ -537,13 +545,13 @@ float caffe_nextafter(const float b);
 template
 double caffe_nextafter(const double b);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
 	CHECK_GE(n, 0);
 	CHECK(r);
 	CHECK_LE(a, b);
 	boost::uniform_real < Dtype
-		> random_distribution(a, caffe_nextafter<Dtype>(b));
+			> random_distribution(a, caffe_nextafter<Dtype>(b));
 	boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> >
 	variate_generator(caffe_rng(), random_distribution);
 	for (int i = 0; i < n; ++i) {
@@ -555,15 +563,15 @@ void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
 
 template
 void caffe_rng_uniform<float>(const int n, const float a, const float b,
-	float* r);
+		float* r);
 
 template
 void caffe_rng_uniform<double>(const int n, const double a, const double b,
-	double* r);
+		double* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_rng_gaussian(const int n, const Dtype a,
-	const Dtype sigma, Dtype* r) {
+		const Dtype sigma, Dtype* r) {
 	CHECK_GE(n, 0);
 	CHECK(r);
 	CHECK_GT(sigma, 0);
@@ -579,13 +587,13 @@ void caffe_rng_gaussian(const int n, const Dtype a,
 
 template
 void caffe_rng_gaussian<float>(const int n, const float mu,
-	const float sigma, float* r);
+		const float sigma, float* r);
 
 template
 void caffe_rng_gaussian<double>(const int n, const double mu,
-	const double sigma, double* r);
+		const double sigma, double* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
 	CHECK_GE(n, 0);
 	CHECK(r);
@@ -605,7 +613,7 @@ void caffe_rng_bernoulli<double>(const int n, const double p, int* r);
 template
 void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
 	CHECK_GE(n, 0);
 	CHECK(r);
@@ -625,104 +633,104 @@ void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
 template
 void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
 //
-template<>
+template <>
 float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
 	return cblas_sdot(n, x, 1, y, 1);
 }
 
-template<>
+template <>
 double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
 	return cblas_ddot(n, x, 1, y, 1);
 }
 
-template<>
+template <>
 void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
-	float* out) {
+		float* out) {
 	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		(n * sizeof(float)), NULL, NULL);
+			(n * sizeof(float)), NULL, NULL);
 	cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		(1 * sizeof(float)), NULL, NULL);
+			(1 * sizeof(float)), NULL, NULL);
 	clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1,
-		&(amdDevice.CommandQueue), 0, NULL, NULL);
+			&(amdDevice.CommandQueue), 0, NULL, NULL);
 	clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float),
-		out, 0, NULL, NULL);
+			out, 0, NULL, NULL);
 	clReleaseMemObject(scratchBuff);
 	clReleaseMemObject(d_out);
 }
 
-template<>
+template <>
 void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
-	double * out) {
+		double * out) {
 	//need to pass in scratchBuff
 	//AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		(n * sizeof(double)), NULL, NULL);
+			(n * sizeof(double)), NULL, NULL);
 	cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		(1 * sizeof(double)), NULL, NULL);
+			(1 * sizeof(double)), NULL, NULL);
 	clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1,
-		&(amdDevice.CommandQueue), 0, NULL, NULL);
+			&(amdDevice.CommandQueue), 0, NULL, NULL);
 	clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double),
-		out, 0, NULL, NULL);
+			out, 0, NULL, NULL);
 	clReleaseMemObject(scratchBuff);
 	clReleaseMemObject(d_out);
 }
 
-template<>
+template <>
 int caffe_cpu_hamming_distance<float>(const int n, const float* x,
-	const float* y) {
+		const float* y) {
 	int dist = 0;
 	for (int i = 0; i < n; ++i) {
 		dist += __builtin_popcount(static_cast<uint32_t>(x[i]) ^
-			static_cast<uint32_t>(y[i]));
+				static_cast<uint32_t>(y[i]));
 	}
 	return dist;
 }
 
-template<>
+template <>
 int caffe_cpu_hamming_distance<double>(const int n, const double* x,
-	const double* y) {
+		const double* y) {
 	int dist = 0;
 	for (int i = 0; i < n; ++i) {
 		dist += __builtin_popcountl(static_cast<uint64_t>(x[i]) ^
-			static_cast<uint64_t>(y[i]));
+				static_cast<uint64_t>(y[i]));
 	}
 	return dist;
 }
 
-template<>
+template <>
 float caffe_cpu_asum<float>(const int n, const float* x) {
 	return cblas_sasum(n, x, 1);
 }
 
-template<>
+template <>
 double caffe_cpu_asum<double>(const int n, const double* x) {
 	return cblas_dasum(n, x, 1);
 }
 
-template<>
+template <>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
 	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		(n * sizeof(cl_float)), NULL, NULL);
+			(n * sizeof(cl_float)), NULL, NULL);
 	cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		(1 * sizeof(cl_float)), NULL, NULL);
+			(1 * sizeof(cl_float)), NULL, NULL);
 	clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1,
-		&(amdDevice.CommandQueue), 0, NULL, NULL);
+			&(amdDevice.CommandQueue), 0, NULL, NULL);
 	clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,
-		0, NULL, NULL);
+			0, NULL, NULL);
 	clReleaseMemObject(scratchBuff);
 	clReleaseMemObject(d_y);
 }
 
-template<>
+template <>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
 	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		(n * sizeof(cl_double)), NULL, NULL);
+			(n * sizeof(cl_double)), NULL, NULL);
 	cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-		(1 * sizeof(cl_double)), NULL, NULL);
+			(1 * sizeof(cl_double)), NULL, NULL);
 	clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1,
-		&(amdDevice.CommandQueue), 0, NULL, NULL);
+			&(amdDevice.CommandQueue), 0, NULL, NULL);
 	clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double),
-		y, 0, NULL, NULL);
+			y, 0, NULL, NULL);
 	clReleaseMemObject(scratchBuff);
 	clReleaseMemObject(d_y);
 }
@@ -735,195 +743,195 @@ INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign);
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit);
 INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs);
 
-template<>
+template <>
 void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
-	float* y) {
+		float* y) {
 	cblas_scopy(n, x, 1, y, 1);
 	cblas_sscal(n, alpha, y, 1);
 }
 
-template<>
+template <>
 void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
-	double* y) {
+		double* y) {
 	cblas_dcopy(n, x, 1, y, 1);
 	cblas_dscal(n, alpha, y, 1);
 }
 
-template<>
+template <>
 void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
-	float* y) {
+		float* y) {
 	caffe_gpu_copy(n, x, y);
 	caffe_gpu_scal(n, alpha, y);
 }
 
-template<>
+template <>
 void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
-	double* y) {
+		double* y) {
 	caffe_gpu_copy(n, x, y);
 	caffe_gpu_scal(n, alpha, y);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void set_kernel(const int n, const Dtype alpha, Dtype* y) {
 }
 
-template<>
+template <>
 void caffe_gpu_set<float>(const int N, const float alpha, float* Y) {
 	ocl_memset(Y, alpha, N);
 }
 
-template<>
+template <>
 void caffe_gpu_set<double>(const int N, const double alpha, double* Y) {
 	ocl_memset(Y, alpha, N);
 }
 
-template<>
+template <>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
 	kernel_add_scalar(N, alpha, Y);
 }
 
-template<>
+template <>
 void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
 	kernel_add_scalar(N, alpha, Y);
 }
 
-template<>
+template <>
 void caffe_gpu_exp<float>(const int N, const float* a, float* y) {
 	kernel_exp(N, a, y);
 }
 
-template<>
+template <>
 void caffe_gpu_exp<double>(const int N, const double* a, double* y) {
 	kernel_exp(N, a, y);
 }
 
-template<>
+template <>
 void caffe_gpu_sign<float>(const int N, const float *X, float *Y) {
 	caffe_gpu_sign_ocl(N, X, Y);
 }
 
-template<>
+template <>
 void caffe_gpu_sign<double>(const int N, const double *X, double *Y) {
 	caffe_gpu_sign_ocl(N, X, Y);
 }
 
-template<>
+template <>
 void caffe_gpu_sub<float>(const int N, const float* a, const float* b,
-	float* y) {
+		float* y) {
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_sub(N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_sub<double>(const int N, const double* a, const double* b,
-	double* y) {
+		double* y) {
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_sub(N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_mul<float>(const int N, const float* a,
-	const float* b, float* y) {
+		const float* b, float* y) {
 	kernel_mul(N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_mul<double>(const int N, const double* a,
-	const double* b, double* y) {
+		const double* b, double* y) {
 	kernel_mul(N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_div<float>(const int N, const float* a,
-	const float* b, float* y) {
+		const float* b, float* y) {
 	kernel_div(N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_div<double>(const int N, const double* a,
-	const double* b, double* y) {
+		const double* b, double* y) {
 	kernel_div(N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_powx<float>(const int N, const float* a,
-	const float alpha, float* y) {
+		const float alpha, float* y) {
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_powx(N, a, alpha, y);
 }
 
-template<>
+template <>
 void caffe_gpu_powx<double>(const int N, const double* a,
-	const double alpha, double* y) {
+		const double alpha, double* y) {
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_powx(N, a, alpha, y);
 }
 
 void popc_kernel(const int n, const float* a,
-	const float* b, uint8_t* y) {
+		const float* b, uint8_t* y) {
 }
 
 void popcll_kernel(const int n, const double* a,
-	const double* b, uint8_t* y) {
+		const double* b, uint8_t* y) {
 }
 
-template<>
+template <>
 uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
-	const float* y) {
+		const float* y) {
 	return 0;
 }
 
-template<>
+template <>
 uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
-	const double* y) {
+		const double* y) {
 	return 0;
 }
 
 void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
 }
 
-template<>
+template <>
 void caffe_gpu_rng_uniform<float>(const int n, const float a, const float b,
-	float* r) {
+		float* r) {
 }
-template<>
+template <>
 void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
-	double* r) {
+		double* r) {
 }
 
-template<>
+template <>
 void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma,
-	float* r) {
+		float* r) {
 }
 
-template<>
+template <>
 void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
-	double* r) {
+		double* r) {
 }
 
-template<>
+template <>
 void caffe_gpu_log<float>(const int N, const float* a, float* y) {
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_log(N, a, y);
 }
 
-template<>
+template <>
 void caffe_gpu_log<double>(const int N, const double* a, double* y) {
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_log(N, a, y);
 }
 
-template<>
+template <>
 void caffe_log<float>(const int n, const float* a, float* y) {
 	vsLn(n, a, y);
 }
 
-template<>
+template <>
 void caffe_log<double>(const int n, const double* a, double* y) {
 	vdLn(n, a, y);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
 	if (X != Y) {
 		if (Caffe::mode() == Caffe::GPU) {
@@ -941,47 +949,47 @@ void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
 
 template void caffe_copy<int>(const int N, const int* X, int* Y);
 template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
-	unsigned int* Y);
+		unsigned int* Y);
 template void caffe_copy<float>(const int N, const float* X, float* Y);
 template void caffe_copy<double>(const int N, const double* X, double* Y);
 
-template<>
+template <>
 void caffe_abs<float>(const int n, const float* a, float* y) {
 	vsAbs(n, a, y);
 }
 
-template<>
+template <>
 void caffe_abs<double>(const int n, const double* a, double* y) {
 	vdAbs(n, a, y);
 }
 
-template<>
+template <>
 void caffe_gpu_add<float>(const int N, const float* a, const float* b,
-	float* y) {
+		float* y) {
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_add(N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_add<double>(const int N, const double* a, const double* b,
-	double* y) {
+		double* y) {
 	// NOLINT_NEXT_LINE(whitespace/operators)
 	kernel_add(N, a, b, y);
 }
 
-template<>
+template <>
 float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
-	const float* y, const int incy) {
+		const float* y, const int incy) {
 	return cblas_sdot(n, x, incx, y, incy);
 }
 
-template<>
+template <>
 double caffe_cpu_strided_dot<double>(const int n, const double* x,
-	const int incx, const double* y, const int incy) {
+		const int incx, const double* y, const int incy) {
 	return cblas_ddot(n, x, incx, y, incy);
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
 	if (alpha == 0) {
 		memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index 1bf783e4..64245bea 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -12,67 +12,67 @@
 
 namespace caffe {
 
-template<>
+template <>
 void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const float alpha, const float* A, const float* B, const float beta,
-	float* C) {
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const float alpha, const float* A, const float* B, const float beta,
+		float* C) {
 	// Note that cublas follows fortran order.
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	cublasOperation_t cuTransA =
-		(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+			(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 	cublasOperation_t cuTransB =
-		(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+			(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 	CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
-		N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+			N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 
-template<>
+template <>
 void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-	const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-	const double alpha, const double* A, const double* B, const double beta,
-	double* C) {
+		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+		const double alpha, const double* A, const double* B, const double beta,
+		double* C) {
 	// Note that cublas follows fortran order.
 	int lda = (TransA == CblasNoTrans) ? K : M;
 	int ldb = (TransB == CblasNoTrans) ? N : K;
 	cublasOperation_t cuTransA =
-		(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+			(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 	cublasOperation_t cuTransB =
-		(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+			(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 	CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
-		N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+			N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 
-template<>
+template <>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const float alpha, const float* A, const float* x,
-	const float beta, float* y) {
+		const int N, const float alpha, const float* A, const float* x,
+		const float beta, float* y) {
 	cublasOperation_t cuTransA =
-		(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
+			(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
 	CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
-		A, N, x, 1, &beta, y, 1));
+			A, N, x, 1, &beta, y, 1));
 }
 
-template<>
+template <>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-	const int N, const double alpha, const double* A, const double* x,
-	const double beta, double* y) {
+		const int N, const double alpha, const double* A, const double* x,
+		const double beta, double* y) {
 	cublasOperation_t cuTransA =
-		(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
+			(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
 	CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
-		A, N, x, 1, &beta, y, 1));
+			A, N, x, 1, &beta, y, 1));
 }
 
-template<>
+template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
-	float* Y) {
+		float* Y) {
 	CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
-template<>
+template <>
 void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
-	double* Y) {
+		double* Y) {
 	CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
@@ -82,62 +82,62 @@ void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) {
 	}
 }
 
-template<>
+template <>
 void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
 	CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1));
 }
 
-template<>
+template <>
 void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
 	CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1));
 }
 
-template<>
+template <>
 void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
-	const float beta, float* Y) {
+		const float beta, float* Y) {
 	caffe_gpu_scal<float>(N, beta, Y);
 	caffe_gpu_axpy<float>(N, alpha, X, Y);
 }
 
-template<>
+template <>
 void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
-	const double beta, double* Y) {
+		const double beta, double* Y) {
 	caffe_gpu_scal<double>(N, beta, Y);
 	caffe_gpu_axpy<double>(N, alpha, X, Y);
 }
 
-template<>
+template <>
 void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
-	float* out) {
+		float* out) {
 	CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
 }
 
-template<>
+template <>
 void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
-	double * out) {
+		double * out) {
 	CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
 }
 
-template<>
+template <>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
 	CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y));
 }
 
-template<>
+template <>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
 	CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
 }
 
-template<>
+template <>
 void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
-	float* y) {
+		float* y) {
 	CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1));
 	CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1));
 }
 
-template<>
+template <>
 void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
-	double* y) {
+		double* y) {
 	CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1));
 	CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1));
 }
@@ -149,7 +149,7 @@ __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) {
 	}
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) {
 	if (alpha == 0) {
 		CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N));  // NOLINT(caffe/alt_fn)
@@ -157,7 +157,7 @@ void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) {
 	}
 	// NOLINT_NEXT_LINE(whitespace/operators)
 set_kernel<Dtype><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-	N, alpha, Y);
+		N, alpha, Y);
 }
 
 template void caffe_gpu_set<int>(const int N, const int alpha, int* Y);
@@ -171,14 +171,14 @@ CUDA_KERNEL_LOOP(index, n) {
 }
 }
 
-template<>
+template <>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
 // NOLINT_NEXT_LINE(whitespace/operators)
 add_scalar_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-N, alpha, Y);
+	N, alpha, Y);
 }
 
-template<>
+template <>
 void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 add_scalar_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
@@ -193,7 +193,7 @@ y[index] = a[index] + b[index];
 }
 }
 
-template<>
+template <>
 void caffe_gpu_add<float>(const int N, const float* a, const float* b,
 float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -201,7 +201,7 @@ add_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_add<double>(const int N, const double* a, const double* b,
 double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -217,7 +217,7 @@ y[index] = a[index] - b[index];
 }
 }
 
-template<>
+template <>
 void caffe_gpu_sub<float>(const int N, const float* a, const float* b,
 float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -225,7 +225,7 @@ sub_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_sub<double>(const int N, const double* a, const double* b,
 double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -241,7 +241,7 @@ y[index] = a[index] * b[index];
 }
 }
 
-template<>
+template <>
 void caffe_gpu_mul<float>(const int N, const float* a,
 const float* b, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -249,7 +249,7 @@ mul_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_mul<double>(const int N, const double* a,
 const double* b, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -265,7 +265,7 @@ y[index] = a[index] / b[index];
 }
 }
 
-template<>
+template <>
 void caffe_gpu_div<float>(const int N, const float* a,
 const float* b, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -273,7 +273,7 @@ div_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, b, y);
 }
 
-template<>
+template <>
 void caffe_gpu_div<double>(const int N, const double* a,
 const double* b, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -288,14 +288,14 @@ y[index] = abs(a[index]);
 }
 }
 
-template<>
+template <>
 void caffe_gpu_abs<float>(const int N, const float* a, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 abs_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, y);
 }
 
-template<>
+template <>
 void caffe_gpu_abs<double>(const int N, const double* a, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 abs_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
@@ -309,14 +309,14 @@ y[index] = exp(a[index]);
 }
 }
 
-template<>
+template <>
 void caffe_gpu_exp<float>(const int N, const float* a, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 exp_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, y);
 }
 
-template<>
+template <>
 void caffe_gpu_exp<double>(const int N, const double* a, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 exp_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
@@ -330,14 +330,14 @@ y[index] = log(a[index]);
 }
 }
 
-template<>
+template <>
 void caffe_gpu_log<float>(const int N, const float* a, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 log_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, y);
 }
 
-template<>
+template <>
 void caffe_gpu_log<double>(const int N, const double* a, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 log_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
@@ -352,7 +352,7 @@ y[index] = pow(a[index], alpha);
 }
 }
 
-template<>
+template <>
 void caffe_gpu_powx<float>(const int N, const float* a,
 const float alpha, float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -360,7 +360,7 @@ powx_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, alpha, y);
 }
 
-template<>
+template <>
 void caffe_gpu_powx<double>(const int N, const double* a,
 const double alpha, double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -390,28 +390,28 @@ static_cast<uint64_t>(b[index]));
 }
 }
 
-template<>
+template <>
 uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
 const float* y) {
   // TODO: Fix caffe_gpu_hamming_distance (see failing unit test
 	// TestHammingDistanceGPU in test_math_functions.cpp).
 NOT_IMPLEMENTED;
 thrust::device_vector < uint8_t > popcounts(n);
-  // NOLINT_NEXT_LINE(whitespace/operators)
+	// NOLINT_NEXT_LINE(whitespace/operators)
 popc_kernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
 n, x, y, thrust::raw_pointer_cast(popcounts.data()));
 return thrust::reduce(popcounts.begin(), popcounts.end(),
 (uint32_t) 0, thrust::plus<uint32_t>());
 }
 
-template<>
+template <>
 uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
 const double* y) {
-  // TODO: Fix caffe_gpu_hamming_distance (see failing unit test
+	// TODO: Fix caffe_gpu_hamming_distance (see failing unit test
 	// TestHammingDistanceGPU in test_math_functions.cpp).
 NOT_IMPLEMENTED;
 thrust::device_vector < uint8_t > popcounts(n);
-  // NOLINT_NEXT_LINE(whitespace/operators)
+	// NOLINT_NEXT_LINE(whitespace/operators)
 popcll_kernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
 n, x, y, thrust::raw_pointer_cast(popcounts.data()));
 return thrust::reduce(popcounts.begin(), popcounts.end(),
@@ -423,7 +423,7 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
 CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n));
 }
 
-template<>
+template <>
 void caffe_gpu_rng_uniform<float>(const int n, const float a, const float b,
 float* r) {
 CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n));
@@ -436,7 +436,7 @@ caffe_gpu_add_scalar(n, a, r);
 }
 }
 
-template<>
+template <>
 void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
 double* r) {
 CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n));
@@ -449,14 +449,14 @@ caffe_gpu_add_scalar(n, a, r);
 }
 }
 
-template<>
+template <>
 void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma,
 float* r) {
 CURAND_CHECK(
 curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma));
 }
 
-template<>
+template <>
 void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
 double* r) {
 CURAND_CHECK(
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 6b8c5fee..8f44a106 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -32,9 +32,9 @@
 #include "caffe/common.hpp"
 #include "caffe/util/ocl_util.hpp"
 namespace caffe {
-template<typename dtype> extern std::string get_dtype_suffix();
+template <typename dtype> extern std::string get_dtype_suffix();
 
-template<typename Dtype>
+template <typename Dtype>
 void ocl_memset(Dtype* buffer, const Dtype value, const int count) {
 	std::string kernel_name = std::string("oclmem") + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
@@ -47,19 +47,19 @@ void ocl_memset(Dtype* buffer, const Dtype value, const int count) {
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 
 template void ocl_memset<int>(int* buffer, const int value, const int count);
 template void ocl_memset<float>(float* buffer, const float value,
-	const int count);
+		const int count);
 template void ocl_memset<double>(double* buffer, const double value,
-	const int count);
+		const int count);
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
-	const int count) {
+		const int count) {
 	cl_int err;
 	err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
 	err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value);
@@ -69,8 +69,8 @@ void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 
@@ -79,11 +79,11 @@ void eventCallback(cl_event event, cl_int event_status, void* user_data) {
 	cl_ulong ev_end_time = (cl_ulong) 0;
 	double run_time;
 	OCL_CHECK(
-		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
-			sizeof(cl_ulong), &ev_start_time, NULL));
+			clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
+					sizeof(cl_ulong), &ev_start_time, NULL));
 	OCL_CHECK(
-		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
-			&ev_end_time, NULL));
+			clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
+					&ev_end_time, NULL));
 	run_time = (double) (ev_end_time - ev_start_time);
 	printf("The kernel's running time is %f s\n", run_time * 1.0e-9);
 }
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index be0c5894..8eb1a981 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -37,10 +37,9 @@ typedef unsigned int uint32_t;
 struct array4x32 {
 		uint32_t v[4];
 };
-template<typename Dtype>
+template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
-	Dtype threshold)
-	{
+		Dtype threshold) {
 	std::string kernel_name = "RNGBernoulli" + get_dtype_suffix<Dtype>();
 	cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
 
@@ -63,17 +62,18 @@ void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
 	size_t globalws[1] = { size };
 	size_t localws[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws,
-			localws, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL,
+					globalws,
+					localws, 0, NULL, NULL));
 }
 template void caffe_gpu_bernoulli<float>(int* a, const unsigned int n,
-	float inf, float sup, float threshold);
+		float inf, float sup, float threshold);
 template void caffe_gpu_bernoulli<double>(int* a, const unsigned int n,
-	double inf, double sup, double threshold);
+		double inf, double sup, double threshold);
 
-template<typename Dtype>
+template <typename Dtype>
 void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_,
-	const int M_, const int packing_num) {
+		const int M_, const int packing_num) {
 	std::string kernel_name = "transform" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -89,18 +89,18 @@ void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_,
 	size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) };
 	size_t uiLocal_Work_Size2[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
 }
 
 template void transform_gpu<float>(float* src, float* dst, const int top_offset,
-	const int N_, const int M_, const int packing_num);
+		const int N_, const int M_, const int packing_num);
 template void transform_gpu<double>(double* src, double* dst,
-	const int top_offset, const int N_, const int M_, const int packing_num);
+		const int top_offset, const int N_, const int M_, const int packing_num);
 
-template<typename Dtype>
+template <typename Dtype>
 void get_max_gpu(cl_kernel Kernel, const int num, const int dim,
-	const Dtype* bottom_data, Dtype* scale_data) {
+		const Dtype* bottom_data, Dtype* scale_data) {
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
 	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
 	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data));
@@ -109,16 +109,16 @@ void get_max_gpu(cl_kernel Kernel, const int num, const int dim,
 	size_t Global_Work_Size[1] = { (size_t) num };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void get_max_gpu<float>(cl_kernel Kernel, const int num, const int dim,
-	const float* bottom_data, float* scale_data);
+		const float* bottom_data, float* scale_data);
 template void get_max_gpu<double>(cl_kernel Kernel, const int num,
-	const int dim, const double* bottom_data, double* scale_data);
+		const int dim, const double* bottom_data, double* scale_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) {
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
 	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
@@ -127,18 +127,18 @@ void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) {
 	size_t Global_Work_Size[1] = { (size_t) num };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void exp_gpu<float>(cl_kernel Kernel, const int num, const float* data,
-	float* out);
+		float* out);
 template void exp_gpu<double>(cl_kernel Kernel, const int num,
-	const double* data, double* out);
+		const double* data, double* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim,
-	const Dtype* scale, Dtype* data) {
+		const Dtype* scale, Dtype* data) {
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
 	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
 	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale));
@@ -147,18 +147,18 @@ void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim,
 	size_t Global_Work_Size[1] = { (size_t)(num * dim) };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void softmax_div_gpu<float>(cl_kernel Kernel, const int num,
-	const int dim, const float* scale, float* data);
+		const int dim, const float* scale, float* data);
 template void softmax_div_gpu<double>(cl_kernel Kernel, const int num,
-	const int dim, const double* scale, double* data);
+		const int dim, const double* scale, double* data);
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim,
-	const Dtype* prob_data, const Dtype* label, cl_mem d_loss) {
+		const Dtype* prob_data, const Dtype* label, cl_mem d_loss) {
 
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data));
 	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss));
@@ -170,26 +170,25 @@ Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim,
 	size_t globalws[1] = { 256 };
 	size_t localws[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws,
-			localws, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws,
+					localws, 0, NULL, NULL));
 	void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE,
-		CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL);
+			CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL);
 	Dtype loss = *(Dtype*) h_loss;
 	clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL,
-		NULL);
+			NULL);
 
 	return loss;
 }
 
 template float softmax_gpu<float>(cl_kernel Kernel, const int num,
-	const int dim, const float* prob_data, const float* label, cl_mem d_loss);
+		const int dim, const float* prob_data, const float* label, cl_mem d_loss);
 template double softmax_gpu<double>(cl_kernel Kernel, const int num,
-	const int dim, const double* prob_data, const double* label, cl_mem d_loss);
+		const int dim, const double* prob_data, const double* label, cl_mem d_loss);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_max(const int num, const int channels,
-	const int spatial_dim, const Dtype* data, Dtype* out)
-	{
+		const int spatial_dim, const Dtype* data, Dtype* out) {
 	std::string kernel_name = "kernel_channel_max" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -202,22 +201,21 @@ void kernel_channel_max(const int num, const int channels,
 	size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_max<float>(const int num, const int channels,
-	const int spatial_dim, const float* data, float* out);
+		const int spatial_dim, const float* data, float* out);
 template void kernel_channel_max<double>(const int num, const int channels,
-	const int spatial_dim, const double* data, double* out);
+		const int spatial_dim, const double* data, double* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_subtract(const int count,
-	const int num, const int channels,
-	const int spatial_dim, const Dtype* channel_max, Dtype* data)
-	{
+		const int num, const int channels,
+		const int spatial_dim, const Dtype* channel_max, Dtype* data) {
 	std::string kernel_name = "kernel_channel_subtract"
-		+ get_dtype_suffix<Dtype>();
+			+ get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
@@ -230,20 +228,20 @@ void kernel_channel_subtract(const int count,
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_subtract<float>(const int count,
-	const int num, const int channels,
-	const int spatial_dim, const float* channel_max, float* data);
+		const int num, const int channels,
+		const int spatial_dim, const float* channel_max, float* data);
 template void kernel_channel_subtract<double>(const int count,
-	const int num, const int channels,
-	const int spatial_dim, const double* channel_max, double* data);
+		const int num, const int channels,
+		const int spatial_dim, const double* channel_max, double* data);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-	{
+		{
 	std::string kernel_name = "kernel_mul" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -255,18 +253,17 @@ void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out)
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_mul<float>(const int count, const float* a, const float* b,
-	float* out);
+		float* out);
 template void kernel_mul<double>(const int count, const double* a,
-	const double* b, double* out);
+		const double* b, double* out);
 
-template<typename Dtype>
-void kernel_add_scalar(const int count, const Dtype data, Dtype* out)
-	{
+template <typename Dtype>
+void kernel_add_scalar(const int count, const Dtype data, Dtype* out) {
 	std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -277,19 +274,18 @@ void kernel_add_scalar(const int count, const Dtype data, Dtype* out)
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_add_scalar<float>(const int count, const float data,
-	float* out);
+		float* out);
 template void kernel_add_scalar<double>(const int count, const double data,
-	double* out);
+		double* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
-	Dtype* out)
-	{
+		Dtype* out) {
 	std::string kernel_name = "kernel_powx" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -301,18 +297,17 @@ void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_powx<float>(const int count, const float* data,
-	const float alpha, float* out);
+		const float alpha, float* out);
 template void kernel_powx<double>(const int count, const double* data,
-	const double alpha, double* out);
+		const double alpha, double* out);
 
-template<typename Dtype>
-void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-	{
+template <typename Dtype>
+void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) {
 	std::string kernel_name = "kernel_div" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -324,18 +319,17 @@ void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out)
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_div<float>(const int count, const float* a, const float* b,
-	float* out);
+		float* out);
 template void kernel_div<double>(const int count, const double* a,
-	const double* b, double* out);
+		const double* b, double* out);
 
-template<typename Dtype>
-void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-	{
+template <typename Dtype>
+void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) {
 	std::string kernel_name = "kernel_add" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -347,18 +341,17 @@ void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out)
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_add<float>(const int count, const float* a, const float* b,
-	float* out);
+		float* out);
 template void kernel_add<double>(const int count, const double* a,
-	const double* b, double* out);
+		const double* b, double* out);
 
-template<typename Dtype>
-void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-	{
+template <typename Dtype>
+void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) {
 	std::string kernel_name = "kernel_sub" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -370,18 +363,17 @@ void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out)
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_sub<float>(const int count, const float* a, const float* b,
-	float* out);
+		float* out);
 template void kernel_sub<double>(const int count, const double* a,
-	const double* b, double* out);
+		const double* b, double* out);
 
-template<typename Dtype>
-void kernel_log(const int count, const Dtype* data, Dtype* out)
-	{
+template <typename Dtype>
+void kernel_log(const int count, const Dtype* data, Dtype* out) {
 	std::string kernel_name = "kernel_log" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -392,17 +384,16 @@ void kernel_log(const int count, const Dtype* data, Dtype* out)
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_log<float>(const int count, const float* data, float* out);
 template void kernel_log<double>(const int count, const double* data,
-	double* out);
+		double* out);
 
-template<typename Dtype>
-void kernel_exp(const int count, const Dtype* data, Dtype* out)
-	{
+template <typename Dtype>
+void kernel_exp(const int count, const Dtype* data, Dtype* out) {
 	std::string kernel_name = "kernel_exp" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -413,18 +404,17 @@ void kernel_exp(const int count, const Dtype* data, Dtype* out)
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_exp<float>(const int count, const float* data, float* out);
 template void kernel_exp<double>(const int count, const double* data,
-	double* out);
+		double* out);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_sum(const int num, const int channels,
-	const int spatial_dim, const Dtype* data, Dtype* channel_sum)
-	{
+		const int spatial_dim, const Dtype* data, Dtype* channel_sum) {
 	std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -437,19 +427,18 @@ void kernel_channel_sum(const int num, const int channels,
 	size_t Global_Work_Size[1] = { (size_t)(num * channels) };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_sum<float>(const int num, const int channels,
-	const int spatial_dim, const float* data, float* channel_sum);
+		const int spatial_dim, const float* data, float* channel_sum);
 template void kernel_channel_sum<double>(const int num, const int channels,
-	const int spatial_dim, const double* data, double* channel_sum);
+		const int spatial_dim, const double* data, double* channel_sum);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_div(const int count, const int num, const int channels,
-	const int spatial_dim, const Dtype* channel_sum, Dtype* data)
-	{
+		const int spatial_dim, const Dtype* channel_sum, Dtype* data) {
 	std::string kernel_name = "kernel_channel_div" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -463,22 +452,21 @@ void kernel_channel_div(const int count, const int num, const int channels,
 	size_t Global_Work_Size[1] = { (size_t) count };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_div<float>(const int count, const int num,
-	const int channels,
-	const int spatial_dim, const float* channel_sum, float* data);
+		const int channels,
+		const int spatial_dim, const float* channel_sum, float* data);
 template void kernel_channel_div<double>(const int count, const int num,
-	const int channels,
-	const int spatial_dim, const double* channel_sum, double* data);
+		const int channels,
+		const int spatial_dim, const double* channel_sum, double* data);
 
-template<typename Dtype>
+template <typename Dtype>
 void kernel_channel_dot(const int num, const int channels,
-	const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
-	Dtype* channel_dot)
-	{
+		const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+		Dtype* channel_dot) {
 	std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -492,24 +480,23 @@ void kernel_channel_dot(const int num, const int channels,
 	size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_dot<float>(const int num, const int channels,
-	const int spatial_dim, const float* data_1, const float* data_2,
-	float* channel_dot);
+		const int spatial_dim, const float* data_1, const float* data_2,
+		float* channel_dot);
 template void kernel_channel_dot<double>(const int num, const int channels,
-	const int spatial_dim, const double* data_1, const double* data_2,
-	double* channel_dot);
+		const int spatial_dim, const double* data_1, const double* data_2,
+		double* channel_dot);
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLossForwardGPU(const int nthreads,
-	const Dtype* prob_data, const Dtype* label, Dtype* loss,
-	const int num, const int dim, const int spatial_dim,
-	const bool has_ignore_label_, const int ignore_label_,
-	Dtype* counts)
-	{
+		const Dtype* prob_data, const Dtype* label, Dtype* loss,
+		const int num, const int dim, const int spatial_dim,
+		const bool has_ignore_label_, const int ignore_label_,
+		Dtype* counts) {
 	std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -521,34 +508,33 @@ void SoftmaxLossForwardGPU(const int nthreads,
 	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
 	OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
 	OCL_CHECK(
-		clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_));
+			clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_));
 	OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
 	OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
 
 	size_t Global_Work_Size[1] = { (size_t) nthreads };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void SoftmaxLossForwardGPU<float>(const int nthreads,
-	const float* prob_data, const float* label, float* loss,
-	const int num, const int dim, const int spatial_dim,
-	const bool has_ignore_label_, const int ignore_label_, float* counts);
+		const float* prob_data, const float* label, float* loss,
+		const int num, const int dim, const int spatial_dim,
+		const bool has_ignore_label_, const int ignore_label_, float* counts);
 template void SoftmaxLossForwardGPU<double>(const int nthreads,
-	const double* prob_data, const double* label, double* loss,
-	const int num, const int dim, const int spatial_dim,
-	const bool has_ignore_label_, const int ignore_label_, double* counts);
+		const double* prob_data, const double* label, double* loss,
+		const int num, const int dim, const int spatial_dim,
+		const bool has_ignore_label_, const int ignore_label_, double* counts);
 
-template<typename Dtype>
+template <typename Dtype>
 void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
-	const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
-	const int spatial_dim, const bool has_ignore_label_,
-	const int ignore_label_, Dtype* counts)
-	{
+		const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+		const int spatial_dim, const bool has_ignore_label_,
+		const int ignore_label_, Dtype* counts) {
 	std::string kernel_name = "SoftmaxLossBackwardGPU"
-		+ get_dtype_suffix<Dtype>();
+			+ get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
@@ -559,27 +545,27 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
 	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
 	OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
 	OCL_CHECK(
-		clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_));
+			clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_));
 	OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
 	OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
 
 	size_t Global_Work_Size[1] = { (size_t) nthreads };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void SoftmaxLossBackwardGPU<float>(const int nthreads,
-	const float* top, const float* label, float* bottom_diff,
-	const int num, const int dim, const int spatial_dim,
-	const bool has_ignore_label_, const int ignore_label_, float* counts);
+		const float* top, const float* label, float* bottom_diff,
+		const int num, const int dim, const int spatial_dim,
+		const bool has_ignore_label_, const int ignore_label_, float* counts);
 template void SoftmaxLossBackwardGPU<double>(const int nthreads,
-	const double* top, const double* label, double* bottom_diff,
-	const int num, const int dim, const int spatial_dim,
-	const bool has_ignore_label_, const int ignore_label_, double* counts);
+		const double* top, const double* label, double* bottom_diff,
+		const int num, const int dim, const int spatial_dim,
+		const bool has_ignore_label_, const int ignore_label_, double* counts);
 
-template<typename Dtype>
+template <typename Dtype>
 void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) {
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
 	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha));
@@ -588,18 +574,18 @@ void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) {
 	size_t Global_Work_Size[1] = { (size_t) num };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void scal_gpu<float>(cl_kernel Kernel, const int num,
-	const float alpha, float* data);
+		const float alpha, float* data);
 template void scal_gpu<double>(cl_kernel Kernel, const int num,
-	const double alpha, double* data);
+		const double alpha, double* data);
 
-template<typename Dtype>
+template <typename Dtype>
 void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data,
-	const Dtype* label) {
+		const Dtype* label) {
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
 	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
 	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data));
@@ -608,21 +594,21 @@ void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data,
 	size_t Global_Work_Size[1] = { (size_t) num };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void diff_gpu<float>(cl_kernel Kernel, const int num, const int dim,
-	float* data, const float* label);
+		float* data, const float* label);
 template void diff_gpu<double>(cl_kernel Kernel, const int num, const int dim,
-	double* data, const double* label);
+		double* data, const double* label);
 
-template<typename Dtype>
+template <typename Dtype>
 void max_pool_fp_gpu(cl_kernel Kernel, const int count,
-	const Dtype* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	Dtype* top_data) {
+		const Dtype* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		Dtype* top_data) {
 	cl_int ret;
 	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
 	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
@@ -640,28 +626,28 @@ void max_pool_fp_gpu(cl_kernel Kernel, const int count,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void max_pool_fp_gpu<float>(cl_kernel Kernel, const int count,
-	const float* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	float* top_data);
+		const float* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		float* top_data);
 template void max_pool_fp_gpu<double>(cl_kernel Kernel, const int count,
-	const double* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	double* top_data);
+		const double* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		double* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum,
-	const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
-	Dtype* top_mask) {
+		const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
+		Dtype* top_mask) {
 	std::string kernel_name = "MaxPoolForward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -688,30 +674,29 @@ void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void MaxPoolForward<float>(const int count, const float* bottom_data,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	const int pad_h_, const int pad_w_, float* top_data, int* mask,
-	float* top_mask);
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		const int pad_h_, const int pad_w_, float* top_data, int* mask,
+		float* top_mask);
 template void MaxPoolForward<double>(const int count, const double* bottom_data,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	const int pad_h_, const int pad_w_, double* top_data, int* mask,
-	double* top_mask);
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		const int pad_h_, const int pad_w_, double* top_data, int* mask,
+		double* top_mask);
 
-template<typename Dtype>
+template <typename Dtype>
 void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	Dtype* idx_data, Dtype* top_data)
-	{
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		Dtype* idx_data, Dtype* top_data) {
 	std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -734,26 +719,27 @@ void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void StoPoolForwardTrain<float>(const int count,
-	const float* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_h_, const int kernel_w_,
-	const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
+		const float* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_h_, const int kernel_w_,
+		const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
 template void StoPoolForwardTrain<double>(const int count,
-	const double* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_h_, const int kernel_w_,
-	const int stride_h_, const int stride_w_, double* idx_data, double* top_data);
+		const double* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_h_, const int kernel_w_,
+		const int stride_h_, const int stride_w_, double* idx_data,
+		double* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void StoPoolForwardTest(const int count, const Dtype* bottom_data,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	Dtype* top_data) {
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		Dtype* top_data) {
 	std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -776,27 +762,27 @@ void StoPoolForwardTest(const int count, const Dtype* bottom_data,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 template void StoPoolForwardTest<float>(const int count,
-	const float* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_h_, const int kernel_w_,
-	const int stride_h_, const int stride_w_, float* top_data);
+		const float* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_h_, const int kernel_w_,
+		const int stride_h_, const int stride_w_, float* top_data);
 template void StoPoolForwardTest<double>(const int count,
-	const double* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_h_, const int kernel_w_,
-	const int stride_h_, const int stride_w_, double* top_data);
+		const double* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_h_, const int kernel_w_,
+		const int stride_h_, const int stride_w_, double* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
-	const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	const int pad_h_, const int pad_w_, Dtype* top_data) {
+		const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		const int pad_h_, const int pad_w_, Dtype* top_data) {
 	std::string kernel_name = "AvePoolForward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -820,26 +806,26 @@ void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
 	size_t uiGlobal_Work_Size[] = { (size_t) count };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void AvePoolForward<float>(const int count, const float* bottom_data,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	const int pad_h_, const int pad_w_, float* top_data);
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		const int pad_h_, const int pad_w_, float* top_data);
 template void AvePoolForward<double>(const int count, const double* bottom_data,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_h_,
-	const int kernel_w_, const int stride_h_, const int stride_w_,
-	const int pad_h_, const int pad_w_, double* top_data);
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_h_,
+		const int kernel_w_, const int stride_h_, const int stride_w_,
+		const int pad_h_, const int pad_w_, double* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
-	const Dtype* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	const int pad_, Dtype* top_data) {
+		const Dtype* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		const int pad_, Dtype* top_data) {
 	cl_int ret;
 	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
 	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
@@ -858,27 +844,27 @@ void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
 	size_t uiGlobal_Work_Size[] = { (size_t) count };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void ave_pool_fp_gpu<float>(cl_kernel Kernel, const int count,
-	const float* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	const int pad_, float* top_data);
+		const float* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		const int pad_, float* top_data);
 template void ave_pool_fp_gpu<double>(cl_kernel Kernel, const int count,
-	const double* bottom_data, const int clnum, const int channels_,
-	const int height_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	const int pad_, double* top_data);
+		const double* bottom_data, const int clnum, const int channels_,
+		const int height_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		const int pad_, double* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void max_pool_bp_gpu(cl_kernel Kernel, const int count,
-	const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_size_,
-	const int stride_, Dtype* bottom_diff) {
+		const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_size_,
+		const int stride_, Dtype* bottom_diff) {
 	cl_int ret;
 	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
 	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
@@ -898,28 +884,28 @@ void max_pool_bp_gpu(cl_kernel Kernel, const int count,
 	size_t uiGlobal_Work_Size[] = { (size_t) count };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void max_pool_bp_gpu<float>(cl_kernel Kernel, const int count,
-	const float* bottom_data, const float* top_data, const float* top_diff,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_size_,
-	const int stride_, float* bottom_diff);
+		const float* bottom_data, const float* top_data, const float* top_diff,
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_size_,
+		const int stride_, float* bottom_diff);
 template void max_pool_bp_gpu<double>(cl_kernel Kernel, const int count,
-	const double* bottom_data, const double* top_data, const double* top_diff,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_size_,
-	const int stride_, double* bottom_diff);
+		const double* bottom_data, const double* top_data, const double* top_diff,
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_size_,
+		const int stride_, double* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
-	const int* const mask, const Dtype* const top_mask, const int num,
-	const int channels, const int height, const int width,
-	const int pooled_height, const int pooled_width, const int kernel_h,
-	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-	const int pad_w, Dtype* const bottom_diff) {
+		const int* const mask, const Dtype* const top_mask, const int num,
+		const int channels, const int height, const int width,
+		const int pooled_height, const int pooled_width, const int kernel_h,
+		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+		const int pad_w, Dtype* const bottom_diff) {
 	std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -945,32 +931,31 @@ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
 	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void MaxPoolBackward<float>(const int nthreads,
-	const float* const top_diff, const int* const mask,
-	const float* const top_mask, const int num, const int channels,
-	const int height, const int width, const int pooled_height,
-	const int pooled_width, const int kernel_h, const int kernel_w,
-	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-	float* const bottom_diff);
+		const float* const top_diff, const int* const mask,
+		const float* const top_mask, const int num, const int channels,
+		const int height, const int width, const int pooled_height,
+		const int pooled_width, const int kernel_h, const int kernel_w,
+		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+		float* const bottom_diff);
 template void MaxPoolBackward<double>(const int nthreads,
-	const double* const top_diff, const int* const mask,
-	const double* const top_mask, const int num, const int channels,
-	const int height, const int width, const int pooled_height,
-	const int pooled_width, const int kernel_h, const int kernel_w,
-	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-	double* const bottom_diff);
-
-template<typename Dtype>
+		const double* const top_diff, const int* const mask,
+		const double* const top_mask, const int num, const int channels,
+		const int height, const int width, const int pooled_height,
+		const int pooled_width, const int kernel_h, const int kernel_w,
+		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+		double* const bottom_diff);
+
+template <typename Dtype>
 void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
-	const int num, const int channels, const int height, const int width,
-	const int pooled_height, const int pooled_width, const int kernel_h,
-	const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-	const int pad_w, Dtype* const bottom_diff)
-	{
+		const int num, const int channels, const int height, const int width,
+		const int pooled_height, const int pooled_width, const int kernel_h,
+		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+		const int pad_w, Dtype* const bottom_diff) {
 	std::string kernel_name = "AvePoolBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -995,28 +980,28 @@ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
 	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void AvePoolBackward<float>(const int nthreads,
-	const float* const top_diff, const int num, const int channels,
-	const int height, const int width, const int pooled_height,
-	const int pooled_width, const int kernel_h, const int kernel_w,
-	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-	float* const bottom_diff);
+		const float* const top_diff, const int num, const int channels,
+		const int height, const int width, const int pooled_height,
+		const int pooled_width, const int kernel_h, const int kernel_w,
+		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+		float* const bottom_diff);
 template void AvePoolBackward<double>(const int nthreads,
-	const double* const top_diff, const int num, const int channels,
-	const int height, const int width, const int pooled_height,
-	const int pooled_width, const int kernel_h, const int kernel_w,
-	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-	double* const bottom_diff);
+		const double* const top_diff, const int num, const int channels,
+		const int height, const int width, const int pooled_height,
+		const int pooled_width, const int kernel_h, const int kernel_w,
+		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+		double* const bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void StoPoolBackward(const int nthreads, const Dtype* const rand_idx,
-	const Dtype* const top_diff, const int num, const int channels,
-	const int height, const int width, const int pooled_height,
-	const int pooled_width, const int kernel_h, const int kernel_w,
-	const int stride_h, const int stride_w, Dtype* const bottom_diff) {
+		const Dtype* const top_diff, const int num, const int channels,
+		const int height, const int width, const int pooled_height,
+		const int pooled_width, const int kernel_h, const int kernel_w,
+		const int stride_h, const int stride_w, Dtype* const bottom_diff) {
 	std::string kernel_name = "StoPoolBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -1039,27 +1024,27 @@ void StoPoolBackward(const int nthreads, const Dtype* const rand_idx,
 	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void StoPoolBackward<float>(const int nthreads,
-	const float* const rand_idx, const float* const top_diff, const int num,
-	const int channels, const int height, const int width,
-	const int pooled_height, const int pooled_width, const int kernel_h,
-	const int kernel_w, const int stride_h, const int stride_w,
-	float* const bottom_diff);
+		const float* const rand_idx, const float* const top_diff, const int num,
+		const int channels, const int height, const int width,
+		const int pooled_height, const int pooled_width, const int kernel_h,
+		const int kernel_w, const int stride_h, const int stride_w,
+		float* const bottom_diff);
 template void StoPoolBackward<double>(const int nthreads,
-	const double* const rand_idx, const double* const top_diff, const int num,
-	const int channels, const int height, const int width,
-	const int pooled_height, const int pooled_width, const int kernel_h,
-	const int kernel_w, const int stride_h, const int stride_w,
-	double* const bottom_diff);
+		const double* const rand_idx, const double* const top_diff, const int num,
+		const int channels, const int height, const int width,
+		const int pooled_height, const int pooled_width, const int kernel_h,
+		const int kernel_w, const int stride_h, const int stride_w,
+		double* const bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
-	const int clnum, const int channels_, const int height_, const int width_,
-	const int pooled_height_, const int pooled_width_, const int kernel_size_,
-	const int stride_, const int pad_, Dtype* bottom_diff) {
+		const int clnum, const int channels_, const int height_, const int width_,
+		const int pooled_height_, const int pooled_width_, const int kernel_size_,
+		const int stride_, const int pad_, Dtype* bottom_diff) {
 	cl_int ret;
 	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
 	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
@@ -1078,25 +1063,25 @@ void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
 	size_t uiGlobal_Work_Size[] = { (size_t) count };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count,
-	const float* top_diff, const int clnum, const int channels_,
-	const int intheight_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	const int pad_, float* bottom_diff);
+		const float* top_diff, const int clnum, const int channels_,
+		const int intheight_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		const int pad_, float* bottom_diff);
 template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count,
-	const double* top_diff, const int clnum, const int channels_,
-	const int intheight_, const int width_, const int pooled_height_,
-	const int pooled_width_, const int kernel_size_, const int stride_,
-	const int pad_, double* bottom_diff);
+		const double* top_diff, const int clnum, const int channels_,
+		const int intheight_, const int width_, const int pooled_height_,
+		const int pooled_width_, const int kernel_size_, const int stride_,
+		const int pad_, double* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLUForward(const int count, const int channels, const int dim,
-	const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
-	const int div_factor) {
+		const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
+		const int div_factor) {
 	std::string kernel_name = "PReLUForward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -1110,20 +1095,20 @@ void PReLUForward(const int count, const int channels, const int dim,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void PReLUForward<float>(const int count, const int channels,
-	const int dim, const float* bottom_data, float* top_data,
-	const float* slope_data, const int div_factor);
+		const int dim, const float* bottom_data, float* top_data,
+		const float* slope_data, const int div_factor);
 template void PReLUForward<double>(const int count, const int channels,
-	const int dim, const double* bottom_data, double* top_data,
-	const double* slope_data, const int div_factor);
+		const int dim, const double* bottom_data, double* top_data,
+		const double* slope_data, const int div_factor);
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLUBackward(const int count, const int channels, const int dim,
-	const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
-	const Dtype* slope_data, const int div_factor) {
+		const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
+		const Dtype* slope_data, const int div_factor) {
 	std::string kernel_name = "PReLUBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -1138,20 +1123,20 @@ void PReLUBackward(const int count, const int channels, const int dim,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void PReLUBackward<float>(const int count, const int channels,
-	const int dim, const float* top_diff, const float* bottom_data,
-	float* bottom_diff, const float* slope_data, const int div_factor);
+		const int dim, const float* top_diff, const float* bottom_data,
+		float* bottom_diff, const float* slope_data, const int div_factor);
 template void PReLUBackward<double>(const int count, const int channels,
-	const int dim, const double* top_diff, const double* bottom_data,
-	double* bottom_diff, const double* slope_data, const int div_factor);
+		const int dim, const double* top_diff, const double* bottom_data,
+		double* bottom_diff, const double* slope_data, const int div_factor);
 
-template<typename Dtype>
+template <typename Dtype>
 void PReLUParamBackward(const int count, const Dtype* top_diff,
-	const int offset_out, const Dtype* bottom_data, const int offset_in,
-	Dtype* bottom_diff) {
+		const int offset_out, const Dtype* bottom_data, const int offset_in,
+		Dtype* bottom_diff) {
 	std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -1164,19 +1149,19 @@ void PReLUParamBackward(const int count, const Dtype* top_diff,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void PReLUParamBackward<float>(const int count, const float* top_diff,
-	const int offset_out, const float* bottom_data, const int offset_in,
-	float* bottom_diff);
+		const int offset_out, const float* bottom_data, const int offset_in,
+		float* bottom_diff);
 template void PReLUParamBackward<double>(const int count,
-	const double* top_diff, const int offset_out, const double* bottom_data,
-	const int offset_in, double* bottom_diff);
+		const double* top_diff, const int offset_out, const double* bottom_data,
+		const int offset_in, double* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
-	Dtype negative_slope) {
+		Dtype negative_slope) {
 	std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -1188,18 +1173,18 @@ void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void ReLUForward<float>(const int count, const float* bottom_data,
-	float* top_data, float negative_slope);
+		float* top_data, float negative_slope);
 template void ReLUForward<double>(const int count, const double* bottom_data,
-	double* top_data, double negative_slope);
+		double* top_data, double negative_slope);
 
-template<typename Dtype>
+template <typename Dtype>
 void ReLUBackward(const int count, const Dtype* top_diff,
-	const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) {
+		const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) {
 	std::string kernel_name = "ReLUBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -1214,17 +1199,17 @@ void ReLUBackward(const int count, const Dtype* top_diff,
 	size_t uiGlobal_Work_Size[] = { (size_t) count };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void ReLUBackward<float>(const int count, const float* top_diff,
-	const float* bottom_data, float* bottom_diff, float negative_slope);
+		const float* bottom_data, float* bottom_diff, float negative_slope);
 template void ReLUBackward<double>(const int count, const double* top_diff,
-	const double* bottom_data, double* bottom_diff, double negative_slope);
+		const double* bottom_data, double* bottom_diff, double negative_slope);
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidForward(const int count, const Dtype* bottom_data,
-	Dtype* top_data) {
+		Dtype* top_data) {
 	std::string kernel_name = "SigmoidForward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -1235,18 +1220,18 @@ void SigmoidForward(const int count, const Dtype* bottom_data,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void SigmoidForward<float>(const int count, const float* bottom_data,
-	float* top_data);
+		float* top_data);
 template void SigmoidForward<double>(const int count, const double* bottom_data,
-	double* top_data);
+		double* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void SigmoidBackward(const int count, const Dtype* top_diff,
-	const Dtype* top_data, Dtype* bottom_diff) {
+		const Dtype* top_data, Dtype* bottom_diff) {
 	std::string kernel_name = "SigmoidBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -1260,17 +1245,17 @@ void SigmoidBackward(const int count, const Dtype* top_diff,
 	size_t uiGlobal_Work_Size[] = { (size_t) count };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void SigmoidBackward<float>(const int count, const float* top_diff,
-	const float* top_data, float* bottom_diff);
+		const float* top_data, float* bottom_diff);
 template void SigmoidBackward<double>(const int count, const double* top_diff,
-	const double* top_data, double* bottom_diff);
+		const double* top_data, double* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void ThresholdForward(const int count, const Dtype threshold,
-	const Dtype* bottom_data, Dtype* top_data) {
+		const Dtype* bottom_data, Dtype* top_data) {
 	std::string kernel_name = "ThresholdForward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 	cl_int ret;
@@ -1282,16 +1267,16 @@ void ThresholdForward(const int count, const Dtype threshold,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void ThresholdForward<float>(const int count, const float threshold,
-	const float* bottom_data, float* top_data);
+		const float* bottom_data, float* top_data);
 template void ThresholdForward<double>(const int count, const double threshold,
-	const double* bottom_data, double* top_data);
+		const double* bottom_data, double* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) {
 	std::string kernel_name = "TanHForward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
@@ -1303,18 +1288,18 @@ void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) {
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void TanHForward<float>(const int count, const float* bottom_data,
-	float* top_data);
+		float* top_data);
 template void TanHForward<double>(const int count, const double* bottom_data,
-	double* top_data);
+		double* top_data);
 
-template<typename Dtype>
+template <typename Dtype>
 void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
-	Dtype* bottom_diff) {
+		Dtype* bottom_diff) {
 	std::string kernel_name = "TanHBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
@@ -1328,381 +1313,441 @@ void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
 	size_t uiGlobal_Work_Size[] = { (size_t) count };
 	size_t uiLocal_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-			uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void TanHBackward<float>(const int count, const float* top_diff,
-	const float* top_data, float* bottom_diff);
+		const float* top_data, float* bottom_diff);
 template void TanHBackward<double>(const int count, const double* top_diff,
-	const double* top_data, double* bottom_diff);
+		const double* top_data, double* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void opttrans(const Dtype* data_im, const int im_offset, const int channels,
-    const int height, const int width, Dtype* data_opt, const int opt_offset, const int optnum) {
-    std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+		const int height, const int width, Dtype* data_opt, const int opt_offset,
+		const int optnum) {
+	std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-    int num_kernels = channels * height * width * optnum;
+	int num_kernels = channels * height * width * optnum;
 
-    cl_int ret;
-    ret=clSetKernelArg(Kernel,0,sizeof(cl_int),(void*)&num_kernels);
-    ret|=clSetKernelArg(Kernel,1,sizeof(cl_mem),(void*)&data_im);
-    ret|=clSetKernelArg(Kernel,2,sizeof(cl_int),(void*)&im_offset);
-    ret|=clSetKernelArg(Kernel,3,sizeof(cl_int),(void*)&height);
-    ret|=clSetKernelArg(Kernel,4,sizeof(cl_int),(void*)&width);
-    ret|=clSetKernelArg(Kernel,5,sizeof(cl_int),(void*)&channels);
-    ret|=clSetKernelArg(Kernel,6,sizeof(cl_mem),(void*)&data_opt);
-    ret|=clSetKernelArg(Kernel,7,sizeof(cl_int),(void*)&opt_offset);
-    ret|=clSetKernelArg(Kernel,8,sizeof(cl_int),(void*)&optnum);
-    OCL_CHECK(ret);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt);
+	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset);
+	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum);
+	OCL_CHECK(ret);
 
-    size_t uiGlobal_Work_Size[] = {(size_t)num_kernels};
-    size_t uiLocal_Work_Size[] = {256};
-    OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL) );
+	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
-template void opttrans<float>(const float* data_im, const int im_offset, const int channels,
-    const int height, const int width, float* data_opt, const int opt_offset, const int optnum);
-template void opttrans<double>(const double* data_im, const int im_offset, const int channels,
-    const int height, const int width, double* data_opt, const int opt_offset, const int optnum);
+template void opttrans<float>(const float* data_im, const int im_offset,
+		const int channels,
+		const int height, const int width, float* data_opt, const int opt_offset,
+		const int optnum);
+template void opttrans<double>(const double* data_im, const int im_offset,
+		const int channels,
+		const int height, const int width, double* data_opt, const int opt_offset,
+		const int optnum);
 
 template <typename Dtype>
 void LRNFillScale(const int nthreads, const Dtype* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype alpha_over_size,
-    const Dtype k, Dtype* const scale){
-  std::string kernel_name = "LRNFillScale" + get_dtype_suffix<Dtype>();
-  cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name);
-  cl_int ret;
-  ret=clSetKernelArg(LFSkernel,0,sizeof(cl_int),(void*)&nthreads);
-  ret|=clSetKernelArg(LFSkernel,1,sizeof(cl_mem),(void*)&in);
-  ret|=clSetKernelArg(LFSkernel,2,sizeof(cl_int),(void*)&num);
-  ret|=clSetKernelArg(LFSkernel,3,sizeof(cl_int),(void*)&channels);
-  ret|=clSetKernelArg(LFSkernel,4,sizeof(cl_int),(void*)&height);
-  ret|=clSetKernelArg(LFSkernel,5,sizeof(cl_int),(void*)&width);
-  ret|=clSetKernelArg(LFSkernel,6,sizeof(cl_int),(void*)&size);
-  ret|=clSetKernelArg(LFSkernel,7,sizeof(Dtype),(void*)&alpha_over_size);
-  ret|=clSetKernelArg(LFSkernel,8,sizeof(Dtype),(void*)&k);
-  ret|=clSetKernelArg(LFSkernel,9,sizeof(cl_mem),(void*)&scale);
-  OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size[]={(size_t)nthreads};
-  size_t uiLocal_Work_Size[]={256};
-  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL, NULL) );
+		const int num, const int channels, const int height,
+		const int width, const int size, const Dtype alpha_over_size,
+		const Dtype k, Dtype* const scale) {
+	std::string kernel_name = "LRNFillScale" + get_dtype_suffix<Dtype>();
+	cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in);
+	ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num);
+	ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size);
+	ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size);
+	ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k);
+	ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale);
+	OCL_CHECK(ret);
+	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void LRNFillScale<float>(const int nthreads, const float* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const float alpha_over_size,
-    const float k, float* const scale);
+		const int num, const int channels, const int height,
+		const int width, const int size, const float alpha_over_size,
+		const float k, float* const scale);
 template void LRNFillScale<double>(const int nthreads, const double* const in,
-    const int num, const int channels, const int height,
-    const int width, const int size, const double alpha_over_size,
-    const double k, double* const scale);
+		const int num, const int channels, const int height,
+		const int width, const int size, const double alpha_over_size,
+		const double k, double* const scale);
 
 template <typename Dtype>
 void LRNComputeOutput(int nthreads, const Dtype* in,
-     Dtype* scale, Dtype negative_beta, Dtype* out){
-  std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix<Dtype>();
-  cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name);
-  cl_int ret;
-  ret=clSetKernelArg(LCOkernel,0,sizeof(cl_int),(void*)&nthreads);
-  ret|=clSetKernelArg(LCOkernel,1,sizeof(cl_mem),(void*)&in);
-  ret|=clSetKernelArg(LCOkernel,2,sizeof(cl_mem),(void*)&scale);
-  ret|=clSetKernelArg(LCOkernel,3,sizeof(Dtype),(void*)&negative_beta);
-  ret|=clSetKernelArg(LCOkernel,4,sizeof(cl_mem),(void*)&out);
-  OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size2[]={(size_t)nthreads};
-  size_t uiLocal_Work_Size2[]={256};
-  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,uiGlobal_Work_Size2,uiLocal_Work_Size2,0,NULL,NULL) );
+		Dtype* scale, Dtype negative_beta, Dtype* out) {
+	std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix<Dtype>();
+	cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in);
+	ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale);
+	ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta);
+	ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out);
+	OCL_CHECK(ret);
+	size_t uiGlobal_Work_Size2[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size2[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,
+					uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
 }
 template void LRNComputeOutput<float>(int nthreads, const float* in,
-    float* scale, float negative_beta, float* out);
+		float* scale, float negative_beta, float* out);
 template void LRNComputeOutput<double>(int nthreads, const double* in,
-    double* scale, double negative_beta, double* out);
+		double* scale, double negative_beta, double* out);
 
 template <typename Dtype>
 void LRNComputeDiff(const int nthreads,
-    const Dtype* const bottom_data, const Dtype* const top_data,
-    const Dtype* const scale, const Dtype* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const Dtype negative_beta,
-    const Dtype cache_ratio, Dtype* const bottom_diff){
-  std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix<Dtype>();
-  cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name);
-  cl_int ret;
-  ret=clSetKernelArg(LCDkernel,0,sizeof(cl_int),(void*)&nthreads);
-  ret|=clSetKernelArg(LCDkernel,1,sizeof(cl_mem),(void*)&bottom_data);
-  ret|=clSetKernelArg(LCDkernel,2,sizeof(cl_mem),(void*)&top_data);
-  ret|=clSetKernelArg(LCDkernel,3,sizeof(cl_mem),(void*)&scale);
-  ret|=clSetKernelArg(LCDkernel,4,sizeof(cl_mem),(void*)&top_diff);
-  ret|=clSetKernelArg(LCDkernel,5,sizeof(cl_int),(void*)&num);
-  ret|=clSetKernelArg(LCDkernel,6,sizeof(cl_int),(void*)&channels);
-  ret|=clSetKernelArg(LCDkernel,7,sizeof(cl_int),(void*)&height);
-  ret|=clSetKernelArg(LCDkernel,8,sizeof(cl_int),(void*)&width);
-  ret|=clSetKernelArg(LCDkernel,9,sizeof(cl_int),(void*)&size);
-  ret|=clSetKernelArg(LCDkernel,10,sizeof(Dtype),(void*)&negative_beta);
-  ret|=clSetKernelArg(LCDkernel,11,sizeof(Dtype),(void*)&cache_ratio);
-  ret|=clSetKernelArg(LCDkernel,12,sizeof(cl_mem),(void*)&bottom_diff);
-  OCL_CHECK(ret);
-  size_t uiGlobal_Work_Size[]={(size_t)nthreads};
-  size_t uiLocal_Work_Size[]={256};
-  OCL_CHECK( clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,uiGlobal_Work_Size,uiLocal_Work_Size,0,NULL,NULL) );
+		const Dtype* const bottom_data, const Dtype* const top_data,
+		const Dtype* const scale, const Dtype* const top_diff,
+		const int num, const int channels, const int height,
+		const int width, const int size, const Dtype negative_beta,
+		const Dtype cache_ratio, Dtype* const bottom_diff) {
+	std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix<Dtype>();
+	cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale);
+	ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num);
+	ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels);
+	ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height);
+	ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width);
+	ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size);
+	ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta);
+	ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio);
+	ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+	size_t uiLocal_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,
+					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void LRNComputeDiff<float>(const int nthreads,
-    const float* const bottom_data, const float* const top_data,
-    const float* const scale, const float* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const float negative_beta,
-    const float cache_ratio, float* const bottom_diff);
+		const float* const bottom_data, const float* const top_data,
+		const float* const scale, const float* const top_diff,
+		const int num, const int channels, const int height,
+		const int width, const int size, const float negative_beta,
+		const float cache_ratio, float* const bottom_diff);
 template void LRNComputeDiff<double>(const int nthreads,
-    const double* const bottom_data, const double* const top_data,
-    const double* const scale, const double* const top_diff,
-    const int num, const int channels, const int height,
-    const int width, const int size, const double negative_beta,
-    const double cache_ratio, double* const bottom_diff);
-
-template <typename Dtype>
-void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y){
-    std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&in1);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&in2);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_add<float> (const int n, const float* in1, const float* in2, float* y);
-template void caffe_gpu_add<double> (const int n, const double* in1, const double* in2, double* y);
-
-template <typename Dtype>
-void caffe_gpu_sign_ocl(const int N,  const Dtype* X, Dtype * Y ){
-    std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)N};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_sign_ocl<float>(const int N,  const float* X, float* Y );
-template void caffe_gpu_sign_ocl<double>(const int N,  const double* X, double* Y );
-
-template <typename Dtype>
-void caffe_gpu_abs_ocl(const int N,  const Dtype* X, Dtype * Y ){
-    std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&N);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&X);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&Y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)N};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_abs_ocl<float>(const int N,  const float* X, float* Y );
-template void caffe_gpu_abs_ocl<double>(const int N,  const double* X, double* Y );
-
-template <typename Dtype>
-void caffe_gpu_div (const int n, const Dtype* a, const Dtype* b, Dtype* y){
-    std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_div<float> (const int n, const float* a, const float* b, float* y);
-template void caffe_gpu_div<double> (const int n, const double* a, const double* b, double* y);
-
-template <typename Dtype>
-void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data){
-     std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*)&alpha);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&top_data);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_add_scalar<float> (const int n, const float alpha, float* top_data);
-template void caffe_gpu_add_scalar<double> (const int n, const double alpha, double* top_data);
-
-template <typename Dtype>
-void caffe_gpu_mul (const int n, const Dtype* a, const Dtype* b, Dtype* y){
-        std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*)&b);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_mul<float> (const int n, const float* a, const float* b, float* y);
-template void caffe_gpu_mul<double> (const int n, const double* a, const double* b, double* y);
-
-template <typename Dtype>
-void caffe_gpu_powx (const int n, const Dtype* a, const Dtype alpha, Dtype* y){
-       std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
-    cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-    cl_int ret;
-    ret  = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*)&n);
-    ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*)&a);
-    ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*)&alpha);
-    ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*)&y);
-    OCL_CHECK(ret);
-    size_t Global_Work_Size[] = {(size_t)n};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void caffe_gpu_powx<float> (const int n, const float* a, const float alpha, float* y);
-template void caffe_gpu_powx<double> (const int n, const double* a, const double alpha, double* y);
-
-template <typename Dtype>
-void DropoutForward(const int count, const Dtype* bottom_data, const int* MaskMem, const Dtype scale_, Dtype* top_data)
-{
-    std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret=clSetKernelArg(kernel,0,sizeof(cl_int),(void*)&count);
-    ret|=clSetKernelArg(kernel,1,sizeof(cl_mem),(void*)&bottom_data);
-    ret|=clSetKernelArg(kernel,2,sizeof(cl_mem),(void*)&MaskMem);
-    ret|=clSetKernelArg(kernel,3,sizeof(Dtype),(void*)&scale_);
-    ret|=clSetKernelArg(kernel,4,sizeof(cl_mem),(void*)&top_data);
-    OCL_CHECK(ret);
-
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
-}
-
-template void DropoutForward<float>(const int count, const float* bottom_data, const int* MaskMem, const float scale_, float* top_data);
-template void DropoutForward<double>(const int count, const double* bottom_data, const int* MaskMem, const double scale_, double* top_data);
+		const double* const bottom_data, const double* const top_data,
+		const double* const scale, const double* const top_diff,
+		const int num, const int channels, const int height,
+		const int width, const int size, const double negative_beta,
+		const double cache_ratio, double* const bottom_diff);
 
 template <typename Dtype>
-void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem, const float threshold_, const Dtype scale_, Dtype* bottom_diff)
-{
-    std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) {
+	std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
 
-    cl_int ret;
-    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
-    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
-    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&MaskMem);
-    ret |= clSetKernelArg(kernel,3,sizeof(cl_int),  (void*)&threshold_);
-    ret |= clSetKernelArg(kernel,4,sizeof(Dtype),(void*)&scale_);
-    ret |= clSetKernelArg(kernel,5,sizeof(cl_mem),  (void*)&bottom_diff);
-    OCL_CHECK(ret);
+template void caffe_gpu_add<float>(const int n, const float* in1,
+		const float* in2, float* y);
+template void caffe_gpu_add<double>(const int n, const double* in1,
+		const double* in2, double* y);
 
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+template <typename Dtype>
+void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) {
+	std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) N };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
-template void DropoutBackward<float>(const int count, const float* top_diff, const int* MaskMem, const float threshold_, const float scale_, float* bottom_diff);
-template void DropoutBackward<double>(const int count, const double* top_diff, const int* MaskMem, const float threshold_, const double scale_, double* bottom_diff);
 
+template void caffe_gpu_sign_ocl<float>(const int N, const float* X, float* Y);
+template void caffe_gpu_sign_ocl<double>(const int N, const double* X,
+		double* Y);
 
 template <typename Dtype>
-void  BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data)
-{
-    std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
-    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&bottom_data);
-    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&top_data);
-    OCL_CHECK(ret);
+void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) {
+	std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) N };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_abs_ocl<float>(const int N, const float* X, float* Y);
+template void caffe_gpu_abs_ocl<double>(const int N, const double* X,
+		double* Y);
 
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+template <typename Dtype>
+void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
+	std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
-template void  BNLLForward<float>(const int count, const float* bottom_data, float *top_data);
-template void  BNLLForward<double>(const int count, const double* bottom_data, double *top_data);
 
-template <typename Dtype>
-void  BNLLBackward(const int count, const Dtype* top_diff, const Dtype* bottom_data, Dtype *bottom_diff)
-{
-    std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-    cl_int ret;
-    ret = clSetKernelArg(kernel, 0,sizeof(cl_int),  (void*)&count);
-    ret |= clSetKernelArg(kernel,1,sizeof(cl_mem),  (void*)&top_diff);
-    ret |= clSetKernelArg(kernel,2,sizeof(cl_mem),  (void*)&bottom_data);
-    ret |= clSetKernelArg(kernel,3,sizeof(cl_mem),  (void*)&bottom_diff);
-    OCL_CHECK(ret);
+template void caffe_gpu_div<float>(const int n, const float* a, const float* b,
+		float* y);
+template void caffe_gpu_div<double>(const int n, const double* a,
+		const double* b, double* y);
 
-    size_t Global_Work_Size[] = {(size_t)count};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+template <typename Dtype>
+void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) {
+	std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_add_scalar<float>(const int n, const float alpha,
+		float* top_data);
+template void caffe_gpu_add_scalar<double>(const int n, const double alpha,
+		double* top_data);
+
+template <typename Dtype>
+void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
+	std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_mul<float>(const int n, const float* a, const float* b,
+		float* y);
+template void caffe_gpu_mul<double>(const int n, const double* a,
+		const double* b, double* y);
+
+template <typename Dtype>
+void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) {
+	std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
+	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+	cl_int ret;
+	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+	ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha);
+	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+	OCL_CHECK(ret);
+	size_t Global_Work_Size[] = { (size_t) n };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_powx<float>(const int n, const float* a,
+		const float alpha, float* y);
+template void caffe_gpu_powx<double>(const int n, const double* a,
+		const double alpha, double* y);
+
+template <typename Dtype>
+void DropoutForward(const int count, const Dtype* bottom_data,
+		const int* MaskMem, const Dtype scale_, Dtype* top_data) {
+	std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
+	ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_);
+	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void DropoutForward<float>(const int count, const float* bottom_data,
+		const int* MaskMem, const float scale_, float* top_data);
+template void DropoutForward<double>(const int count, const double* bottom_data,
+		const int* MaskMem, const double scale_, double* top_data);
+
+template <typename Dtype>
+void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
+		const float threshold_, const Dtype scale_, Dtype* bottom_diff) {
+	std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_);
+	ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_);
+	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
-template void  BNLLBackward<float>(const int count, const float* top_diff, const float* bottom_data, float *bottom_diff);
-template void  BNLLBackward<double>(const int count, const double* top_diff, const double* bottom_data, double *bottom_diff);
-
-
-template <typename Dtype>
-void  Concat(const int nthreads, const Dtype* in_data, const bool forward, const int num_concats, const int  concat_size,
-        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, Dtype *out_data)
-{
-    std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
-    cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-    int k_forward = (forward == true)? 1 : 0;
-    cl_int ret;
-    ret = clSetKernelArg(kernel,  0, sizeof(cl_int),  (void*)&nthreads);
-    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void*)&in_data);
-    ret |= clSetKernelArg(kernel, 2, sizeof(cl_int),  (void*)&k_forward);
-    ret |= clSetKernelArg(kernel, 3, sizeof(cl_int),  (void*)&num_concats);
-    ret |= clSetKernelArg(kernel, 4, sizeof(cl_int),  (void*)&concat_size);
-    ret |= clSetKernelArg(kernel, 5, sizeof(cl_int),  (void*)&top_concat_axis);
-    ret |= clSetKernelArg(kernel, 6, sizeof(cl_int),  (void*)&bottom_concat_axis); 
-    ret |= clSetKernelArg(kernel, 7, sizeof(cl_int),  (void*)&offset_concat_axis);
-    ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem),  (void*)&out_data);
-    OCL_CHECK(ret);
+template void DropoutBackward<float>(const int count, const float* top_diff,
+		const int* MaskMem, const float threshold_, const float scale_,
+		float* bottom_diff);
+template void DropoutBackward<double>(const int count, const double* top_diff,
+		const int* MaskMem, const float threshold_, const double scale_,
+		double* bottom_diff);
+
+template <typename Dtype>
+void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) {
+	std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
-    size_t Global_Work_Size[] = {(size_t)nthreads};
-    size_t Local_Work_Size[] = {256};
-    OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
-template void  Concat<float>(const int nthreads, const float* in_data, const bool forward, const int num_concats, const int  concat_size,
-        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, float *out_data);
-template void  Concat<double>(const int nthreads, const double* in_data, const bool forward, const int num_concats, const int  concat_size,
-        const int top_concat_axis, const int bottom_concat_axis, const int offset_concat_axis, double *out_data);
+template void BNLLForward<float>(const int count, const float* bottom_data,
+		float *top_data);
+template void BNLLForward<double>(const int count, const double* bottom_data,
+		double *top_data);
+
+template <typename Dtype>
+void BNLLBackward(const int count, const Dtype* top_diff,
+		const Dtype* bottom_data, Dtype *bottom_diff) {
+	std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) count };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void BNLLBackward<float>(const int count, const float* top_diff,
+		const float* bottom_data, float *bottom_diff);
+template void BNLLBackward<double>(const int count, const double* top_diff,
+		const double* bottom_data, double *bottom_diff);
+
+template <typename Dtype>
+void Concat(const int nthreads, const Dtype* in_data, const bool forward,
+		const int num_concats, const int concat_size,
+		const int top_concat_axis, const int bottom_concat_axis,
+		const int offset_concat_axis, Dtype *out_data) {
+	std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
+	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+	int k_forward = (forward == true) ? 1 : 0;
+	cl_int ret;
+	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
+	ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats);
+	ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size);
+	ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis);
+	ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis);
+	ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis);
+	ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
+	OCL_CHECK(ret);
+
+	size_t Global_Work_Size[] = { (size_t) nthreads };
+	size_t Local_Work_Size[] = { 256 };
+	OCL_CHECK(
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Concat<float>(const int nthreads, const float* in_data,
+		const bool forward, const int num_concats, const int concat_size,
+		const int top_concat_axis, const int bottom_concat_axis,
+		const int offset_concat_axis, float *out_data);
+template void Concat<double>(const int nthreads, const double* in_data,
+		const bool forward, const int num_concats, const int concat_size,
+		const int top_concat_axis, const int bottom_concat_axis,
+		const int offset_concat_axis, double *out_data);
 
 template <typename Dtype>
 void CLLBackward(const int count, const int channels,
-	const Dtype margin, const bool legacy_version, const Dtype alpha,
-	const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
-	Dtype *bottom_diff)
-	{
+		const Dtype margin, const bool legacy_version, const Dtype alpha,
+		const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
+		Dtype *bottom_diff) {
 	std::string kernel_name = "CLLBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
@@ -1721,23 +1766,22 @@ void CLLBackward(const int count, const int channels,
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void CLLBackward<float>(const int count, const int channels,
-	const float margin, const bool legacy_version, const float alpha,
-	const float* y, const float* diff, const float* dist_sq,
-	float *bottom_diff);
+		const float margin, const bool legacy_version, const float alpha,
+		const float* y, const float* diff, const float* dist_sq,
+		float *bottom_diff);
 template void CLLBackward<double>(const int count, const int channels,
-	const double margin, const bool legacy_version, const double alpha,
-	const double* y, const double* diff, const double* dist_sq,
-	double *bottom_diff);
+		const double margin, const bool legacy_version, const double alpha,
+		const double* y, const double* diff, const double* dist_sq,
+		double *bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void MaxForward(const int nthreads, const Dtype* bottom_data_a,
-	const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
-	int* mask)
-	{
+		const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+		int* mask) {
 	std::string kernel_name = "MaxForward" + get_dtype_suffix<Dtype>();
 	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
@@ -1753,21 +1797,20 @@ void MaxForward(const int nthreads, const Dtype* bottom_data_a,
 	size_t Global_Work_Size[] = { (size_t) nthreads };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void MaxForward<float>(const int nthreads, const float* bottom_data_a,
-	const float* bottom_data_b, const int blob_idx, float* top_data,
-	int* mask);
+		const float* bottom_data_b, const int blob_idx, float* top_data,
+		int* mask);
 template void MaxForward<double>(const int nthreads,
-	const double* bottom_data_a,
-	const double* bottom_data_b, const int blob_idx, double* top_data,
-	int* mask);
+		const double* bottom_data_a,
+		const double* bottom_data_b, const int blob_idx, double* top_data,
+		int* mask);
 
-template<typename Dtype>
+template <typename Dtype>
 void MaxBackward(const int nthreads, const Dtype* top_diff,
-	const int blob_idx, const int* mask, Dtype* bottom_diff)
-	{
+		const int blob_idx, const int* mask, Dtype* bottom_diff) {
 	std::string kernel_name = "MaxBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
@@ -1782,27 +1825,27 @@ void MaxBackward(const int nthreads, const Dtype* top_diff,
 	size_t Global_Work_Size[] = { (size_t) nthreads };
 	size_t Local_Work_Size[] = { 256 };
 	OCL_CHECK(
-		clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-			Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void MaxBackward<float>(const int nthreads, const float* top_diff,
-	const int blob_idx, const int* mask, float* bottom_diff);
+		const int blob_idx, const int* mask, float* bottom_diff);
 template void MaxBackward<double>(const int nthreads, const double* top_diff,
-	const int blob_idx, const int* mask, double* bottom_diff);
+		const int blob_idx, const int* mask, double* bottom_diff);
 
-template<typename Dtype>
+template <typename Dtype>
 void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias,
-	int channel_in, int width, int height, int channel_out, int width_out,
-	int height_out, int kernel_w, int kernel_h, int stride, int pad, int batch_sz)
-	{
+		int channel_in, int width, int height, int channel_out, int width_out,
+		int height_out, int kernel_w, int kernel_h, int stride, int pad,
+		int batch_sz) {
 }
 template void ocl_conv<float>(float* bottom_data, float* top_data,
-	float* weights, float* bias, int channel_in, int width, int height,
-	int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
-	int stride, int pad, int batch_sz);
+		float* weights, float* bias, int channel_in, int width, int height,
+		int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+		int stride, int pad, int batch_sz);
 template void ocl_conv<double>(double* bottom_data, double* top_data,
-	double* weights, double* bias, int channel_in, int width, int height,
-	int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
-	int stride, int pad, int batch_sz);
+		double* weights, double* bias, int channel_in, int width, int height,
+		int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+		int stride, int pad, int batch_sz);
 
 }  // namespace caffe
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index f4373901..da533cd9 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -30,7 +30,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param) {
 }
 
 bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
-	NetParameter* net_param) {
+		NetParameter* net_param) {
 	// First upgrade padding layers to padded conv layers.
 	NetParameter v0_net_param;
 	UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param);
@@ -42,7 +42,7 @@ bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
 	}
 	for (int i = 0; i < v0_net_param.layers_size(); ++i) {
 		is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i),
-			net_param->add_layers());
+				net_param->add_layers());
 	}
 	for (int i = 0; i < v0_net_param.input_size(); ++i) {
 		net_param->add_input(v0_net_param.input(i));
@@ -57,7 +57,7 @@ bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
 }
 
 void UpgradeV0PaddingLayers(const NetParameter& param,
-	NetParameter* param_upgraded_pad) {
+		NetParameter* param_upgraded_pad) {
 	// Copy everything other than the layers from the original param.
 	param_upgraded_pad->Clear();
 	param_upgraded_pad->CopyFrom(param);
@@ -78,7 +78,7 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
 		for (int j = 0; j < layer_connection.bottom_size(); ++j) {
 			const string& blob_name = layer_connection.bottom(j);
 			if (blob_name_to_last_top_idx.find(blob_name) ==
-				blob_name_to_last_top_idx.end()) {
+					blob_name_to_last_top_idx.end()) {
 				LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
 			}
 			const int top_idx = blob_name_to_last_top_idx[blob_name];
@@ -92,20 +92,20 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
 				// the padding layer input has only one input and one output.  Other
 				// cases have undefined behavior in Caffe.
 				CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool"))
-					<< "Padding layer input to "
-						"non-convolutional / non-pooling layer type "
-					<< layer_param.type();
+						<< "Padding layer input to "
+								"non-convolutional / non-pooling layer type "
+						<< layer_param.type();
 				CHECK_EQ(layer_connection.bottom_size(), 1)
-					<< "Conv Layer takes a single blob as input.";
+						<< "Conv Layer takes a single blob as input.";
 				CHECK_EQ(source_layer.bottom_size(), 1)
-					<< "Padding Layer takes a single blob as input.";
+						<< "Padding Layer takes a single blob as input.";
 				CHECK_EQ(source_layer.top_size(), 1)
-					<< "Padding Layer produces a single blob as output.";
+						<< "Padding Layer produces a single blob as output.";
 				int layer_index = param_upgraded_pad->layers_size() - 1;
 				param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()
-					->set_pad(source_layer.layer().pad());
+						->set_pad(source_layer.layer().pad());
 				param_upgraded_pad->mutable_layers(layer_index)
-					->set_bottom(j, source_layer.bottom(0));
+						->set_bottom(j, source_layer.bottom(0));
 			}
 		}
 		for (int j = 0; j < layer_connection.top_size(); ++j) {
@@ -116,7 +116,7 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
 }
 
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-	V1LayerParameter* layer_param) {
+		V1LayerParameter* layer_param) {
 	bool is_fully_compatible = true;
 	layer_param->Clear();
 	for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) {
@@ -146,10 +146,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_num_output()) {
 			if (type == "conv") {
 				layer_param->mutable_convolution_param()->set_num_output(
-					v0_layer_param.num_output());
+						v0_layer_param.num_output());
 			} else if (type == "innerproduct") {
 				layer_param->mutable_inner_product_param()->set_num_output(
-					v0_layer_param.num_output());
+						v0_layer_param.num_output());
 			} else {
 				LOG(ERROR) << "Unknown parameter num_output for layer type " << type;
 				is_fully_compatible = false;
@@ -158,10 +158,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_biasterm()) {
 			if (type == "conv") {
 				layer_param->mutable_convolution_param()->set_bias_term(
-					v0_layer_param.biasterm());
+						v0_layer_param.biasterm());
 			} else if (type == "innerproduct") {
 				layer_param->mutable_inner_product_param()->set_bias_term(
-					v0_layer_param.biasterm());
+						v0_layer_param.biasterm());
 			} else {
 				LOG(ERROR) << "Unknown parameter biasterm for layer type " << type;
 				is_fully_compatible = false;
@@ -170,10 +170,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_weight_filler()) {
 			if (type == "conv") {
 				layer_param->mutable_convolution_param()->
-					mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
+						mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
 			} else if (type == "innerproduct") {
 				layer_param->mutable_inner_product_param()->
-					mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
+						mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
 			} else {
 				LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type;
 				is_fully_compatible = false;
@@ -182,10 +182,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_bias_filler()) {
 			if (type == "conv") {
 				layer_param->mutable_convolution_param()->
-					mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
+						mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
 			} else if (type == "innerproduct") {
 				layer_param->mutable_inner_product_param()->
-					mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
+						mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
 			} else {
 				LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type;
 				is_fully_compatible = false;
@@ -204,10 +204,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_kernelsize()) {
 			if (type == "conv") {
 				layer_param->mutable_convolution_param()->set_kernel_size(
-					v0_layer_param.kernelsize());
+						v0_layer_param.kernelsize());
 			} else if (type == "pool") {
 				layer_param->mutable_pooling_param()->set_kernel_size(
-					v0_layer_param.kernelsize());
+						v0_layer_param.kernelsize());
 			} else {
 				LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type;
 				is_fully_compatible = false;
@@ -216,7 +216,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_group()) {
 			if (type == "conv") {
 				layer_param->mutable_convolution_param()->set_group(
-					v0_layer_param.group());
+						v0_layer_param.group());
 			} else {
 				LOG(ERROR) << "Unknown parameter group for layer type " << type;
 				is_fully_compatible = false;
@@ -225,10 +225,10 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_stride()) {
 			if (type == "conv") {
 				layer_param->mutable_convolution_param()->set_stride(
-					v0_layer_param.stride());
+						v0_layer_param.stride());
 			} else if (type == "pool") {
 				layer_param->mutable_pooling_param()->set_stride(
-					v0_layer_param.stride());
+						v0_layer_param.stride());
 			} else {
 				LOG(ERROR) << "Unknown parameter stride for layer type " << type;
 				is_fully_compatible = false;
@@ -240,15 +240,15 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 				switch (pool) {
 					case V0LayerParameter_PoolMethod_MAX:
 						layer_param->mutable_pooling_param()->set_pool(
-							PoolingParameter_PoolMethod_MAX);
+								PoolingParameter_PoolMethod_MAX);
 						break;
 					case V0LayerParameter_PoolMethod_AVE:
 						layer_param->mutable_pooling_param()->set_pool(
-							PoolingParameter_PoolMethod_AVE);
+								PoolingParameter_PoolMethod_AVE);
 						break;
 					case V0LayerParameter_PoolMethod_STOCHASTIC:
 						layer_param->mutable_pooling_param()->set_pool(
-							PoolingParameter_PoolMethod_STOCHASTIC);
+								PoolingParameter_PoolMethod_STOCHASTIC);
 						break;
 					default:
 						LOG(ERROR) << "Unknown pool method " << pool;
@@ -262,7 +262,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_dropout_ratio()) {
 			if (type == "dropout") {
 				layer_param->mutable_dropout_param()->set_dropout_ratio(
-					v0_layer_param.dropout_ratio());
+						v0_layer_param.dropout_ratio());
 			} else {
 				LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type;
 				is_fully_compatible = false;
@@ -271,7 +271,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_local_size()) {
 			if (type == "lrn") {
 				layer_param->mutable_lrn_param()->set_local_size(
-					v0_layer_param.local_size());
+						v0_layer_param.local_size());
 			} else {
 				LOG(ERROR) << "Unknown parameter local_size for layer type " << type;
 				is_fully_compatible = false;
@@ -306,16 +306,16 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 				layer_param->mutable_data_param()->set_source(v0_layer_param.source());
 			} else if (type == "hdf5_data") {
 				layer_param->mutable_hdf5_data_param()->set_source(
-					v0_layer_param.source());
+						v0_layer_param.source());
 			} else if (type == "images") {
 				layer_param->mutable_image_data_param()->set_source(
-					v0_layer_param.source());
+						v0_layer_param.source());
 			} else if (type == "window_data") {
 				layer_param->mutable_window_data_param()->set_source(
-					v0_layer_param.source());
+						v0_layer_param.source());
 			} else if (type == "infogain_loss") {
 				layer_param->mutable_infogain_loss_param()->set_source(
-					v0_layer_param.source());
+						v0_layer_param.source());
 			} else {
 				LOG(ERROR) << "Unknown parameter source for layer type " << type;
 				is_fully_compatible = false;
@@ -323,25 +323,25 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		}
 		if (v0_layer_param.has_scale()) {
 			layer_param->mutable_transform_param()->
-				set_scale(v0_layer_param.scale());
+					set_scale(v0_layer_param.scale());
 		}
 		if (v0_layer_param.has_meanfile()) {
 			layer_param->mutable_transform_param()->
-				set_mean_file(v0_layer_param.meanfile());
+					set_mean_file(v0_layer_param.meanfile());
 		}
 		if (v0_layer_param.has_batchsize()) {
 			if (type == "data") {
 				layer_param->mutable_data_param()->set_batch_size(
-					v0_layer_param.batchsize());
+						v0_layer_param.batchsize());
 			} else if (type == "hdf5_data") {
 				layer_param->mutable_hdf5_data_param()->set_batch_size(
-					v0_layer_param.batchsize());
+						v0_layer_param.batchsize());
 			} else if (type == "images") {
 				layer_param->mutable_image_data_param()->set_batch_size(
-					v0_layer_param.batchsize());
+						v0_layer_param.batchsize());
 			} else if (type == "window_data") {
 				layer_param->mutable_window_data_param()->set_batch_size(
-					v0_layer_param.batchsize());
+						v0_layer_param.batchsize());
 			} else {
 				LOG(ERROR) << "Unknown parameter batchsize for layer type " << type;
 				is_fully_compatible = false;
@@ -349,19 +349,19 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		}
 		if (v0_layer_param.has_cropsize()) {
 			layer_param->mutable_transform_param()->
-				set_crop_size(v0_layer_param.cropsize());
+					set_crop_size(v0_layer_param.cropsize());
 		}
 		if (v0_layer_param.has_mirror()) {
 			layer_param->mutable_transform_param()->
-				set_mirror(v0_layer_param.mirror());
+					set_mirror(v0_layer_param.mirror());
 		}
 		if (v0_layer_param.has_rand_skip()) {
 			if (type == "data") {
 				layer_param->mutable_data_param()->set_rand_skip(
-					v0_layer_param.rand_skip());
+						v0_layer_param.rand_skip());
 			} else if (type == "images") {
 				layer_param->mutable_image_data_param()->set_rand_skip(
-					v0_layer_param.rand_skip());
+						v0_layer_param.rand_skip());
 			} else {
 				LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type;
 				is_fully_compatible = false;
@@ -370,7 +370,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_shuffle_images()) {
 			if (type == "images") {
 				layer_param->mutable_image_data_param()->set_shuffle(
-					v0_layer_param.shuffle_images());
+						v0_layer_param.shuffle_images());
 			} else {
 				LOG(ERROR) << "Unknown parameter shuffle for layer type " << type;
 				is_fully_compatible = false;
@@ -379,7 +379,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_new_height()) {
 			if (type == "images") {
 				layer_param->mutable_image_data_param()->set_new_height(
-					v0_layer_param.new_height());
+						v0_layer_param.new_height());
 			} else {
 				LOG(ERROR) << "Unknown parameter new_height for layer type " << type;
 				is_fully_compatible = false;
@@ -388,7 +388,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_new_width()) {
 			if (type == "images") {
 				layer_param->mutable_image_data_param()->set_new_width(
-					v0_layer_param.new_width());
+						v0_layer_param.new_width());
 			} else {
 				LOG(ERROR) << "Unknown parameter new_width for layer type " << type;
 				is_fully_compatible = false;
@@ -397,7 +397,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_concat_dim()) {
 			if (type == "concat") {
 				layer_param->mutable_concat_param()->set_concat_dim(
-					v0_layer_param.concat_dim());
+						v0_layer_param.concat_dim());
 			} else {
 				LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type;
 				is_fully_compatible = false;
@@ -406,60 +406,60 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
 		if (v0_layer_param.has_det_fg_threshold()) {
 			if (type == "window_data") {
 				layer_param->mutable_window_data_param()->set_fg_threshold(
-					v0_layer_param.det_fg_threshold());
+						v0_layer_param.det_fg_threshold());
 			} else {
 				LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type "
-					<< type;
+						<< type;
 				is_fully_compatible = false;
 			}
 		}
 		if (v0_layer_param.has_det_bg_threshold()) {
 			if (type == "window_data") {
 				layer_param->mutable_window_data_param()->set_bg_threshold(
-					v0_layer_param.det_bg_threshold());
+						v0_layer_param.det_bg_threshold());
 			} else {
 				LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type "
-					<< type;
+						<< type;
 				is_fully_compatible = false;
 			}
 		}
 		if (v0_layer_param.has_det_fg_fraction()) {
 			if (type == "window_data") {
 				layer_param->mutable_window_data_param()->set_fg_fraction(
-					v0_layer_param.det_fg_fraction());
+						v0_layer_param.det_fg_fraction());
 			} else {
 				LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type "
-					<< type;
+						<< type;
 				is_fully_compatible = false;
 			}
 		}
 		if (v0_layer_param.has_det_context_pad()) {
 			if (type == "window_data") {
 				layer_param->mutable_window_data_param()->set_context_pad(
-					v0_layer_param.det_context_pad());
+						v0_layer_param.det_context_pad());
 			} else {
 				LOG(ERROR) << "Unknown parameter det_context_pad for layer type "
-					<< type;
+						<< type;
 				is_fully_compatible = false;
 			}
 		}
 		if (v0_layer_param.has_det_crop_mode()) {
 			if (type == "window_data") {
 				layer_param->mutable_window_data_param()->set_crop_mode(
-					v0_layer_param.det_crop_mode());
+						v0_layer_param.det_crop_mode());
 			} else {
 				LOG(ERROR) << "Unknown parameter det_crop_mode for layer type "
-					<< type;
+						<< type;
 				is_fully_compatible = false;
 			}
 		}
 		if (v0_layer_param.has_hdf5_output_param()) {
 			if (type == "hdf5_output") {
 				layer_param->mutable_hdf5_output_param()->CopyFrom(
-					v0_layer_param.hdf5_output_param());
+						v0_layer_param.hdf5_output_param());
 			} else {
 				LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type "
-					<< type;
+						<< type;
 				is_fully_compatible = false;
 			}
 		}
@@ -613,42 +613,42 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
 		// NetParameter was specified using the old style (V0LayerParameter); try to
 		// upgrade it.
 		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-			<< "V0LayerParameter: " << param_file;
+				<< "V0LayerParameter: " << param_file;
 		NetParameter original_param(*param);
 		if (!UpgradeV0Net(original_param, param)) {
 			success = false;
 			LOG(ERROR) << "Warning: had one or more problems upgrading "
-				<< "V0NetParameter to NetParameter (see above); continuing anyway.";
+					<< "V0NetParameter to NetParameter (see above); continuing anyway.";
 		} else {
 			LOG(INFO) << "Successfully upgraded file specified using deprecated "
-				<< "V0LayerParameter";
+					<< "V0LayerParameter";
 		}
 		LOG(ERROR) << "Note that future Caffe releases will not support "
-			<< "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
-			<< "prototxt and ./build/tools/upgrade_net_proto_binary for model "
-			<< "weights upgrade this and any other net protos to the new format.";
+				<< "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
+				<< "prototxt and ./build/tools/upgrade_net_proto_binary for model "
+				<< "weights upgrade this and any other net protos to the new format.";
 	}
 	// NetParameter uses old style data transformation fields; try to upgrade it.
 	if (NetNeedsDataUpgrade(*param)) {
 		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-			<< "transformation parameters: " << param_file;
+				<< "transformation parameters: " << param_file;
 		UpgradeNetDataTransformation(param);
 		LOG(INFO) << "Successfully upgraded file specified using deprecated "
-			<< "data transformation parameters.";
+				<< "data transformation parameters.";
 		LOG(ERROR) << "Note that future Caffe releases will only support "
-			<< "transform_param messages for transformation fields.";
+				<< "transform_param messages for transformation fields.";
 	}
 	if (NetNeedsV1ToV2Upgrade(*param)) {
 		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-			<< "V1LayerParameter: " << param_file;
+				<< "V1LayerParameter: " << param_file;
 		NetParameter original_param(*param);
 		if (!UpgradeV1Net(original_param, param)) {
 			success = false;
 			LOG(ERROR) << "Warning: had one or more problems upgrading "
-				<< "V1LayerParameter (see above); continuing anyway.";
+					<< "V1LayerParameter (see above); continuing anyway.";
 		} else {
 			LOG(INFO) << "Successfully upgraded file specified using deprecated "
-				<< "V1LayerParameter";
+					<< "V1LayerParameter";
 		}
 	}
 	return success;
@@ -658,7 +658,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
 	bool is_fully_compatible = true;
 	if (v1_net_param.layer_size() > 0) {
 		LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' "
-			<< "fields; these will be ignored for the upgrade.";
+				<< "fields; these will be ignored for the upgrade.";
 		is_fully_compatible = false;
 	}
 	net_param->CopyFrom(v1_net_param);
@@ -666,7 +666,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
 	net_param->clear_layer();
 	for (int i = 0; i < v1_net_param.layers_size(); ++i) {
 		if (!UpgradeV1LayerParameter(v1_net_param.layers(i),
-			net_param->add_layer())) {
+				net_param->add_layer())) {
 			LOG(ERROR) << "Upgrade of input layer " << i << " failed.";
 			is_fully_compatible = false;
 		}
@@ -675,7 +675,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
 }
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-	LayerParameter* layer_param) {
+		LayerParameter* layer_param) {
 	layer_param->Clear();
 	bool is_fully_compatible = true;
 	for (int i = 0; i < v1_layer_param.bottom_size(); ++i) {
@@ -719,7 +719,7 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
 				break;
 			default:
 				LOG(FATAL) << "Unknown blob_share_mode: "
-					<< v1_layer_param.blob_share_mode(i);
+						<< v1_layer_param.blob_share_mode(i);
 				break;
 		}
 		layer_param->mutable_param(i)->set_share_mode(mode);
@@ -735,130 +735,130 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
 			layer_param->add_param();
 		}
 		layer_param->mutable_param(i)->set_decay_mult(
-			v1_layer_param.weight_decay(i));
+				v1_layer_param.weight_decay(i));
 	}
 	for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) {
 		layer_param->add_loss_weight(v1_layer_param.loss_weight(i));
 	}
 	if (v1_layer_param.has_accuracy_param()) {
 		layer_param->mutable_accuracy_param()->CopyFrom(
-			v1_layer_param.accuracy_param());
+				v1_layer_param.accuracy_param());
 	}
 	if (v1_layer_param.has_argmax_param()) {
 		layer_param->mutable_argmax_param()->CopyFrom(
-			v1_layer_param.argmax_param());
+				v1_layer_param.argmax_param());
 	}
 	if (v1_layer_param.has_concat_param()) {
 		layer_param->mutable_concat_param()->CopyFrom(
-			v1_layer_param.concat_param());
+				v1_layer_param.concat_param());
 	}
 	if (v1_layer_param.has_contrastive_loss_param()) {
 		layer_param->mutable_contrastive_loss_param()->CopyFrom(
-			v1_layer_param.contrastive_loss_param());
+				v1_layer_param.contrastive_loss_param());
 	}
 	if (v1_layer_param.has_convolution_param()) {
 		layer_param->mutable_convolution_param()->CopyFrom(
-			v1_layer_param.convolution_param());
+				v1_layer_param.convolution_param());
 	}
 	if (v1_layer_param.has_data_param()) {
 		layer_param->mutable_data_param()->CopyFrom(
-			v1_layer_param.data_param());
+				v1_layer_param.data_param());
 	}
 	if (v1_layer_param.has_dropout_param()) {
 		layer_param->mutable_dropout_param()->CopyFrom(
-			v1_layer_param.dropout_param());
+				v1_layer_param.dropout_param());
 	}
 	if (v1_layer_param.has_dummy_data_param()) {
 		layer_param->mutable_dummy_data_param()->CopyFrom(
-			v1_layer_param.dummy_data_param());
+				v1_layer_param.dummy_data_param());
 	}
 	if (v1_layer_param.has_eltwise_param()) {
 		layer_param->mutable_eltwise_param()->CopyFrom(
-			v1_layer_param.eltwise_param());
+				v1_layer_param.eltwise_param());
 	}
 	if (v1_layer_param.has_exp_param()) {
 		layer_param->mutable_exp_param()->CopyFrom(
-			v1_layer_param.exp_param());
+				v1_layer_param.exp_param());
 	}
 	if (v1_layer_param.has_hdf5_data_param()) {
 		layer_param->mutable_hdf5_data_param()->CopyFrom(
-			v1_layer_param.hdf5_data_param());
+				v1_layer_param.hdf5_data_param());
 	}
 	if (v1_layer_param.has_hdf5_output_param()) {
 		layer_param->mutable_hdf5_output_param()->CopyFrom(
-			v1_layer_param.hdf5_output_param());
+				v1_layer_param.hdf5_output_param());
 	}
 	if (v1_layer_param.has_hinge_loss_param()) {
 		layer_param->mutable_hinge_loss_param()->CopyFrom(
-			v1_layer_param.hinge_loss_param());
+				v1_layer_param.hinge_loss_param());
 	}
 	if (v1_layer_param.has_image_data_param()) {
 		layer_param->mutable_image_data_param()->CopyFrom(
-			v1_layer_param.image_data_param());
+				v1_layer_param.image_data_param());
 	}
 	if (v1_layer_param.has_infogain_loss_param()) {
 		layer_param->mutable_infogain_loss_param()->CopyFrom(
-			v1_layer_param.infogain_loss_param());
+				v1_layer_param.infogain_loss_param());
 	}
 	if (v1_layer_param.has_inner_product_param()) {
 		layer_param->mutable_inner_product_param()->CopyFrom(
-			v1_layer_param.inner_product_param());
+				v1_layer_param.inner_product_param());
 	}
 	if (v1_layer_param.has_lrn_param()) {
 		layer_param->mutable_lrn_param()->CopyFrom(
-			v1_layer_param.lrn_param());
+				v1_layer_param.lrn_param());
 	}
 	if (v1_layer_param.has_memory_data_param()) {
 		layer_param->mutable_memory_data_param()->CopyFrom(
-			v1_layer_param.memory_data_param());
+				v1_layer_param.memory_data_param());
 	}
 	if (v1_layer_param.has_mvn_param()) {
 		layer_param->mutable_mvn_param()->CopyFrom(
-			v1_layer_param.mvn_param());
+				v1_layer_param.mvn_param());
 	}
 	if (v1_layer_param.has_pooling_param()) {
 		layer_param->mutable_pooling_param()->CopyFrom(
-			v1_layer_param.pooling_param());
+				v1_layer_param.pooling_param());
 	}
 	if (v1_layer_param.has_power_param()) {
 		layer_param->mutable_power_param()->CopyFrom(
-			v1_layer_param.power_param());
+				v1_layer_param.power_param());
 	}
 	if (v1_layer_param.has_relu_param()) {
 		layer_param->mutable_relu_param()->CopyFrom(
-			v1_layer_param.relu_param());
+				v1_layer_param.relu_param());
 	}
 	if (v1_layer_param.has_sigmoid_param()) {
 		layer_param->mutable_sigmoid_param()->CopyFrom(
-			v1_layer_param.sigmoid_param());
+				v1_layer_param.sigmoid_param());
 	}
 	if (v1_layer_param.has_softmax_param()) {
 		layer_param->mutable_softmax_param()->CopyFrom(
-			v1_layer_param.softmax_param());
+				v1_layer_param.softmax_param());
 	}
 	if (v1_layer_param.has_slice_param()) {
 		layer_param->mutable_slice_param()->CopyFrom(
-			v1_layer_param.slice_param());
+				v1_layer_param.slice_param());
 	}
 	if (v1_layer_param.has_tanh_param()) {
 		layer_param->mutable_tanh_param()->CopyFrom(
-			v1_layer_param.tanh_param());
+				v1_layer_param.tanh_param());
 	}
 	if (v1_layer_param.has_threshold_param()) {
 		layer_param->mutable_threshold_param()->CopyFrom(
-			v1_layer_param.threshold_param());
+				v1_layer_param.threshold_param());
 	}
 	if (v1_layer_param.has_window_data_param()) {
 		layer_param->mutable_window_data_param()->CopyFrom(
-			v1_layer_param.window_data_param());
+				v1_layer_param.window_data_param());
 	}
 	if (v1_layer_param.has_transform_param()) {
 		layer_param->mutable_transform_param()->CopyFrom(
-			v1_layer_param.transform_param());
+				v1_layer_param.transform_param());
 	}
 	if (v1_layer_param.has_loss_param()) {
 		layer_param->mutable_loss_param()->CopyFrom(
-			v1_layer_param.loss_param());
+				v1_layer_param.loss_param());
 	}
 	if (v1_layer_param.has_layer()) {
 		LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring.";
@@ -956,16 +956,16 @@ const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) {
 }
 
 void ReadNetParamsFromTextFileOrDie(const string& param_file,
-	NetParameter* param) {
+		NetParameter* param) {
 	CHECK(ReadProtoFromTextFile(param_file, param))
-		<< "Failed to parse NetParameter file: " << param_file;
+			<< "Failed to parse NetParameter file: " << param_file;
 	UpgradeNetAsNeeded(param_file, param);
 }
 
 void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-	NetParameter* param) {
+		NetParameter* param) {
 	CHECK(ReadProtoFromBinaryFile(param_file, param))
-		<< "Failed to parse NetParameter file: " << param_file;
+			<< "Failed to parse NetParameter file: " << param_file;
 	UpgradeNetAsNeeded(param_file, param);
 }
 

From 1e1bcd2a1be5502f2bd7c9cc3638cfd6e5b761dc Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 21:10:50 -0700
Subject: [PATCH 083/124] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8fadd98f..6f6cbd80 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,13 @@
 #OpenCL caffe
 
-This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance.
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL caffe is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance.
 
 OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
 
 #Design features
   -All layers ported to OpenCL
   
-  -Aligned with CAFFE’s latest code
+  -Aligned with caffe’s latest code
 
   -Performance improvement by batched sgemm implementation for conv layer
 

From a8cb6de9bbfdd72ee182d990ac59c391c639b76f Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 22:50:42 -0700
Subject: [PATCH 084/124] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ef4ae50d..2dbe44f9 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Training speed (Model: AlexNet)
 
 -AMD W9100 (5.2TFLOPS), 255 images per second
 
--AMD R9 Fury((7.2TFLOPS)), 231 images per second
+-AMD R9 Fury((7.2TFLOPS)), 261 images per second
 
 Recognition speed (Model: AlexNet)
 

From d5cdc7a5edbd9ac809ae63eb2d9c59d62b461300 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 22:51:47 -0700
Subject: [PATCH 085/124] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 6f6cbd80..fe91ccab 100644
--- a/README.md
+++ b/README.md
@@ -27,13 +27,13 @@ Note: More featurs will be added in the near future. And this OpenCL caffe only
 
 We will keep updating the latest performance we could achieve in this section.
 
-* Training speed (Model: AlexNet)
+* Training speed (Model: AlexNet, minibatch size 128)
 
     -AMD W9100 (5.2TFLOPS), 255 images per second
 
-    -AMD R9 Fury((7.2TFLOPS)), 231 images per second
+    -AMD R9 Fury((7.2TFLOPS)), 261 images per second
 
-* Recognition speed (Model: AlexNet)
+* Recognition speed (Model: AlexNet, minibatch size 128)
 
     -AMD W9100 (5.2TFLOPS), 590 images per second
 

From 915fe5cd5a2f976b88e5149e78e658623b96f278 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 22:52:39 -0700
Subject: [PATCH 086/124] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2dbe44f9..77dae02f 100644
--- a/README.md
+++ b/README.md
@@ -23,13 +23,13 @@ Note: More featurs will be added in the near future. And this OpenCL caffe only
 
 We will keep updating the latest performance we could achieve in this section.
 
-Training speed (Model: AlexNet)
+Training speed (Model: AlexNet, minibatch size 128)
 
 -AMD W9100 (5.2TFLOPS), 255 images per second
 
 -AMD R9 Fury((7.2TFLOPS)), 261 images per second
 
-Recognition speed (Model: AlexNet)
+Recognition speed (Model: AlexNet, minibatch size 128)
 
 -AMD W9100 (5.2TFLOPS), 590 images per second
 

From 2ea828984c628c68900afbe302533a2c12c1166f Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 22:59:48 -0700
Subject: [PATCH 087/124] Update README.md

---
 README.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 77dae02f..6280e182 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 #OpenCL caffe
 
-This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance.
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together. 
 
 OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
 
@@ -39,7 +39,10 @@ Recognition speed (Model: AlexNet, minibatch size 128)
 For more information on how to install, use or contribute to this code base, please visit our wiki page:
 https://github.com/amd/OpenCL-caffe/wiki
 
-#License and support
+#Support needed
+We encourage the contribution and support from the community to improve it together.
+
+#License 
 Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
 
 # Caffe

From ef00e37c66e8c9b7685c02f1a7da8628a31dde19 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 23:00:07 -0700
Subject: [PATCH 088/124] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6280e182..073e5515 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 #OpenCL caffe
 
-This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide industry an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together. 
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL CAFFE is developed by AMD Research lab. As The goal is to provide the community an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together. 
 
 OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
 

From ce44b9e0194259695fc4eee3d185b15bb0cf7fd1 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 23:03:55 -0700
Subject: [PATCH 089/124] Update README.md

---
 README.md | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index fe91ccab..c11d9e66 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 #OpenCL caffe
 
-This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below. The OpenCL caffe is developed by AMD Research lab. As The goal is to provide industry an effecient and ready to use OpenCL version of DNN framework. Things are not perfect yet. We will keep adding new features and improving performance.
+This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below.  The goal is to provide the community an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together.
 
 OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
 
@@ -11,13 +11,11 @@ OpenCL is an open standard parallel programming language that is supported by mo
 
   -Performance improvement by batched sgemm implementation for conv layer
 
-  -User can choose optimal batch number depening on H/W, image size and minibatch size
+  -User can choose optimal batch number depending on H/W, image size and minibatch size
 
-  -Passes unit test
-
-  -OpenCL 2.0, 1.2
+  -Supports OpenCL 2.0, 1.2
   
-  -Remove CUDA for simplicity, only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users
+  -only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users
 
   -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19
 
@@ -43,7 +41,10 @@ We will keep updating the latest performance we could achieve in this section.
 For more information on how to install, use or contribute to this code base, please visit our wiki page:
 https://github.com/amd/OpenCL-caffe/wiki
 
-#License and support
+#Support needed
+We encourage the contribution and support from the community to improve it together.
+
+#License
 Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
 
 # Original Caffe information

From 44f67c18725bf223f02a5d37e4794e7886535da9 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 10 Sep 2015 14:18:35 +0800
Subject: [PATCH 090/124] Fixed the bug in kernel_channel_sum(), passed throug
 softmaxwithloss, validated the loss output in log file

---
 src/caffe/layers/softmax_loss_layer.cpp |  1 -
 src/caffe/solver.cpp                    |  4 ----
 src/caffe/util/ocl_wrapper.cpp          | 12 ++++++------
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 58872a72..86a0d37a 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -160,7 +160,6 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
 	} else {
 		loss /= outer_num_;
 	}
-	printf("loss = %f\n", loss);
 	top[0]->mutable_cpu_data()[0] = loss;
 	if (top.size() == 2) {
 		top[1]->ShareData(prob_);
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 0a07a218..ffb77b78 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -229,11 +229,7 @@ void Solver<Dtype>::Step(int iters) {
 			int idx = (iter_ - start_iter) % average_loss;
 			smoothed_loss += (loss - losses[idx]) / average_loss;
 			losses[idx] = loss;
-			printf("smoothed_loss = %f, losses[idx] = %f, idx = %d\n", smoothed_loss,
-					losses[idx], idx);
 		}
-		printf("smoothed_loss = %f, ave_loss = %d, losses.size() = %lu \n",
-				smoothed_loss, average_loss, losses.size());
 		if (display) {
 			LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
 			const vector<Blob<Dtype>*>& result = net_->output_blobs();
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 8eb1a981..d54fd01e 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -424,7 +424,7 @@ void kernel_channel_sum(const int num, const int channels,
 	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
 	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum));
 
-	size_t Global_Work_Size[1] = { (size_t)(num * channels) };
+	size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
 	size_t Local_Work_Size[1] = { 256 };
 	OCL_CHECK(
 			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
@@ -499,7 +499,8 @@ void SoftmaxLossForwardGPU(const int nthreads,
 		Dtype* counts) {
 	std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
+        
+        int int_has_ignore_label = has_ignore_label_ ? 1 : 0;
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
 	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data));
 	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
@@ -507,8 +508,7 @@ void SoftmaxLossForwardGPU(const int nthreads,
 	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num));
 	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
 	OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(
-			clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_));
+	OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label));
 	OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
 	OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
 
@@ -536,6 +536,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
 	std::string kernel_name = "SoftmaxLossBackwardGPU"
 			+ get_dtype_suffix<Dtype>();
 	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+        int int_has_ignore_label = has_ignore_label_ ? 1 : 0;
 
 	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
 	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top));
@@ -544,8 +545,7 @@ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
 	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num));
 	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
 	OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(
-			clSetKernelArg(Kernel, 7, sizeof(cl_bool), (void*) &has_ignore_label_));
+	OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label));
 	OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
 	OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
 

From f8fb6d3b159b3802fb34b947dbf9ad52b1ae44f8 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 9 Sep 2015 23:42:23 -0700
Subject: [PATCH 091/124] Update README.md

---
 README.md | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index c11d9e66..2c9a0ef1 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,9 @@
 #OpenCL caffe
 
-This is an OpenCL implementation of one of the main stream DNN framework-CAFFE, see more details about CAFFE below.  The goal is to provide the community an OpenCL version of DNN framework to use. Things are not perfect yet. We will keep adding new features and improving performance. We also hope to get help from community to improve it together.
-
-OpenCL is an open standard parallel programming language that is supported by more than 20 companies. People can use this framework to run their DNN app on heterogeneous platforms from vairous commercial chip manufacturer. Compared to CUDA based DNN, this framework support cross-platform compatability and with design space to optimize accordingly.
+This is an OpenCL implementation of the popular caffe DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome.
 
 #Design features
   -All layers ported to OpenCL
-  
-  -Aligned with caffe’s latest code
 
   -Performance improvement by batched sgemm implementation for conv layer
 
@@ -15,27 +11,27 @@ OpenCL is an open standard parallel programming language that is supported by mo
 
   -Supports OpenCL 2.0, 1.2
   
-  -only contains C++ and OpenCL, maintains the same interfaces as original caffe to make it easy for caffe users
+  -Implemented in C++ and OpenCL, maintaining the same interfaces as original caffe to make it easy for caffe users
 
   -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19
 
-Note: More featurs will be added in the near future. And this OpenCL caffe only verifies on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to add if there is a need.
+Note: More features will be added in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to be added in the future.
 
 #Performance
 
-We will keep updating the latest performance we could achieve in this section.
+We will keep updating the latest performance as we make optimizations. Fury results are preliminary and are actively being improved.
 
 * Training speed (Model: AlexNet, minibatch size 128)
 
-    -AMD W9100 (5.2TFLOPS), 255 images per second
+    -AMD W9100, 255 images per second
 
-    -AMD R9 Fury((7.2TFLOPS)), 261 images per second
+    -AMD R9 Fury, 261 images per second
 
 * Recognition speed (Model: AlexNet, minibatch size 128)
 
-    -AMD W9100 (5.2TFLOPS), 590 images per second
+    -AMD W9100, 590 images per second
 
-    -AMD R9 Fury((7.2TFLOPS)), 699 images per second
+    -AMD R9 Fury, 699 images per second
 
 #Wiki
 For more information on how to install, use or contribute to this code base, please visit our wiki page:

From fe779dfec920dc703387edaafdebd28e8ff339ac Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 10 Sep 2015 15:32:06 +0800
Subject: [PATCH 092/124] Added rng_uniform rng_gaussian

---
 include/caffe/util/ocl_wrapper.hpp |   6 ++
 src/caffe/ocl/random.cl            | 107 ++++++++++++++++++++++++++++-
 src/caffe/util/math_functions.cpp  |   4 ++
 src/caffe/util/ocl_wrapper.cpp     |  55 +++++++++++++++
 4 files changed, 169 insertions(+), 3 deletions(-)

diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index a15b68ff..290ef30f 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -145,6 +145,12 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
 template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dtype threshold);
 
+template <typename Dtype>
+void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup);
+
+template <typename Dtype>
+void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V);
+
 template <typename Dtype>
 void caffe_gpu_abs_ocl(const int N,  const Dtype* X, Dtype * Y );
 
diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl
index f5a7a4db..9fbb59d7 100644
--- a/src/caffe/ocl/random.cl
+++ b/src/caffe/ocl/random.cl
@@ -707,7 +707,7 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_
 } 
 
 template <class T>
-__kernel void PRNG_threefry4x32(
+__kernel void PRNG_threefry4x32_bernoulli(
         __global uint4 *randomnumber,
         threefry4x32_ctr_t ctr_i,
         T inf,
@@ -744,9 +744,110 @@ __kernel void PRNG_threefry4x32(
 }
 
 
-template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
+template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
 
-template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
+template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
 
 //end of the looooooong gpu_random_generator kernel 
 
+template <class T>
+__kernel void PRNG_threefry4x32_uniform(
+        __global float4 *randomnumber,
+        threefry4x32_ctr_t ctr_i,
+        T inf,
+        T sup,
+        uint nrounds,
+        uint numrandom
+){
+        size_t  gdx = get_global_id(0);
+
+        uint maxUint = 0;
+        maxUint--;
+        float r = (float)maxUint;
+
+        threefry4x32_ctr_t      ctr = ctr_i; 
+        threefry4x32_ukey_t ukey;
+
+        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+        threefry4x32_ctr_t  random4;
+
+        if ( gdx < numrandom )
+        {
+                random4 = threefry4x32_R(nrounds, ctr, ukey);
+                float4 frnd;
+                frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf );
+                frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf );
+                frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf );
+                frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf );
+                randomnumber[gdx] = frnd;
+        }
+}
+
+template __attribute__((mangled_name(RNGUniform_float))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, uint nrounds, uint numrandonm);
+
+template __attribute__((mangled_name(RNGUniform_double))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, uint nrounds, uint numrandonm);
+
+template <class T>
+__kernel void PRNG_threefry4x32_gaussian(
+	__global float4 *randomnumber, 
+	threefry4x32_ctr_t ctr_i,
+	float E,
+	float V,
+	uint nrounds,
+	uint numrandom
+){
+	size_t	gdx = get_global_id(0);
+
+	uint maxUint = 0;
+	maxUint--;
+	float r = (float)maxUint;
+
+	threefry4x32_ctr_t	ctr = ctr_i; 
+	threefry4x32_ukey_t ukey1, ukey2;
+
+	ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;
+	ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;
+
+	threefry4x32_ctr_t  random1, random2;
+
+	if ( gdx < numrandom )
+	{
+		random1 = threefry4x32_R(nrounds, ctr, ukey1);
+		random2 = threefry4x32_R(nrounds, ctr, ukey2);
+		float4 frnd1;
+
+		float r1 = (((float)random1.v[0]) / r);          // generate a random sequence of uniform distribution
+		float r2 = (((float)random2.v[0]) / r);
+		float r3 = (((float)random1.v[1]) / r);
+		float r4 = (((float)random2.v[1]) / r);
+		float r5 = (((float)random1.v[2]) / r);
+		float r6 = (((float)random2.v[2]) / r);
+		float r7 = (((float)random1.v[3]) / r);
+		float r8 = (((float)random2.v[3]) / r);
+
+		if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0){
+			r2 += 0.0001;
+			r4 += 0.0001;
+			r6 += 0.0001;
+			r8 += 0.0001;
+		}
+
+		frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;     // return a pseudo sequence of normal distribution using two above uniform noise data
+		//frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+		frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;     // return a pseudo sequence of normal distribution using two above uniform noise data
+		//frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+		frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;     // return a pseudo sequence of normal distribution using two above uniform noise data
+		//frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+		frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;     // return a pseudo sequence of normal distribution using two above uniform noise data
+		//frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+
+		randomnumber[gdx] = frnd1;
+	}
+}
+
+template __attribute__((mangled_name(RNGGaussian_float))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float E, float V, uint nrounds, uint numrandonm);
+
+template __attribute__((mangled_name(RNGGaussian_double))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double E, double V, uint nrounds, uint numrandonm);
+
+
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index ed71edf6..c76703fb 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -801,20 +801,24 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
 template <>
 void caffe_gpu_rng_uniform<float>(const int n, const float a, const float b,
                                   float* r) {
+	caffe_gpu_uniform(r, n, a, b);	// r is a cl_mem object
 }
 template <>
 void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
                                    double* r) {
+	caffe_gpu_uniform(r, n, a, b);  // r is a cl_mem object
 }
 
 template <>
 void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma,
                             float* r) {
+	caffe_gpu_gaussian(r, n, mu, sigma);  // r is a cl_mem object
 }
 
 template <>
 void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma,
                             double* r) {
+	caffe_gpu_gaussian(r, n, mu, sigma);  // r is a cl_mem object
 }
 
 template <>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index c8f28426..73417ce8 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -64,6 +64,61 @@ void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, Dty
 template void caffe_gpu_bernoulli<float>(int* a, const unsigned int n, float inf, float sup, float threshold);
 template void caffe_gpu_bernoulli<double>(int* a, const unsigned int n, double inf, double sup, double threshold);
 
+template <typename Dtype>
+void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup)
+{
+        std::string kernel_name = "RNGUniform" + get_dtype_suffix<Dtype>();
+        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+        static unsigned c = 0;
+        unsigned nrounds = 20;
+        array4x32  rndctr4;
+        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+
+        cl_int ret;
+        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
+        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype),   (void*)&inf);
+        ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype),   (void*)&sup);
+        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint),    (void*)&nrounds);
+        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&size);
+        OCL_CHECK(ret);
+
+        size_t globalws[1] = {size};
+        size_t localws[1] = {256};
+        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
+}
+template void caffe_gpu_uniform<float>(float* a, const unsigned int n, float inf, float sup);
+template void caffe_gpu_uniform<double>(double* a, const unsigned int n, double inf, double sup);
+
+template <typename Dtype>
+void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V)
+{
+        std::string kernel_name = "RNGGaussian" + get_dtype_suffix<Dtype>();
+        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+        static unsigned c = 0;
+        unsigned nrounds = 20;
+        array4x32  rndctr4;
+        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+
+        cl_int ret;
+        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
+        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype),   (void*)&E);
+        ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype),   (void*)&V);
+        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint),    (void*)&nrounds);
+        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&size);
+        OCL_CHECK(ret);
+
+        size_t globalws[1] = {size};
+        size_t localws[1] = {256};
+        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
+}
+template void caffe_gpu_gaussian<float>(float* a, const unsigned int n, float E, float V);
+template void caffe_gpu_gaussian<double>(double* a, const unsigned int n, double E, double V);
 
 template <typename Dtype>
 void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, const int M_, const int packing_num){

From e42eeaedf968d2edc90751af308a60e1fa46ebca Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 10 Sep 2015 16:37:50 +0800
Subject: [PATCH 093/124] fix a template error in random.cl

---
 src/caffe/ocl/random.cl           | 359 +++++++++++-------------------
 src/caffe/util/math_functions.cpp |   1 +
 2 files changed, 134 insertions(+), 226 deletions(-)

diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl
index da6c698e..058f41d7 100644
--- a/src/caffe/ocl/random.cl
+++ b/src/caffe/ocl/random.cl
@@ -30,26 +30,23 @@
 //we use the open sourced threefry's GPU implementation
 typedef uint uint32_t;
 
-struct r123array4x32 {
-		uint32_t v[4];
-};
+struct r123array4x32 {	uint32_t v[4]; };
 
-enum r123_enum_threefry32x4
+enum r123_enum_threefry32x4 
 {
 	R_32x4_0_0 = 10, R_32x4_0_1 = 26,
 	R_32x4_1_0 = 11, R_32x4_1_1 = 21,
 	R_32x4_2_0 = 13, R_32x4_2_1 = 27,
-	R_32x4_3_0 = 23, R_32x4_3_1 = 5,
-	R_32x4_4_0 = 6, R_32x4_4_1 = 20,
+	R_32x4_3_0 = 23, R_32x4_3_1 =  5,
+	R_32x4_4_0 =  6, R_32x4_4_1 = 20,
 	R_32x4_5_0 = 17, R_32x4_5_1 = 11,
 	R_32x4_6_0 = 25, R_32x4_6_1 = 10,
 	R_32x4_7_0 = 18, R_32x4_7_1 = 20
 };
 
+inline uint32_t	RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline));
 inline uint32_t RotL_32(uint32_t x, unsigned int N)
-		__attribute__((always_inline));
-inline uint32_t RotL_32(uint32_t x, unsigned int N)
-		{
+{
 	return (x << (N & 31)) | (x >> ((32 - N) & 31));
 }
 
@@ -57,22 +54,20 @@ typedef struct r123array4x32 threefry4x32_ctr_t;
 typedef struct r123array4x32 threefry4x32_key_t;
 typedef struct r123array4x32 threefry4x32_ukey_t;
 
-inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
-		threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline));
-inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
-		threefry4x32_ctr_t in, threefry4x32_key_t k)
-		{
-	threefry4x32_ctr_t X;
-	uint32_t ks[4 + 1];
-	int i;
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline));
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)
+{
+	threefry4x32_ctr_t	X;
+	uint32_t			ks[4 + 1];
+	int					i;
 	ks[4] = 0x1BD11BDA;
 	/*
-	 for (i = 0; i < 4; i++)
-	 {
-	 ks[i] = k.v[i];
-	 X.v[i] = in.v[i];
-	 ks[4] ^= k.v[i];
-	 }*/
+	for (i = 0; i < 4; i++)
+	{
+		ks[i] = k.v[i];
+		X.v[i] = in.v[i];
+		ks[4] ^= k.v[i];
+	}*/ 
 	{
 		ks[0] = k.v[0];
 		X.v[0] = in.v[0];
@@ -94,711 +89,622 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
 	X.v[1] += ks[1];
 	X.v[2] += ks[2];
 	X.v[3] += ks[3];
-	if (Nrounds > 0)
-			{
+	if (Nrounds > 0) 
+	{
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 1) {
+	} if (Nrounds > 1) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 2) {
+	} if (Nrounds > 2) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 3) {
+	} if (Nrounds > 3) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 3) {
+	} if (Nrounds > 3) {
 		X.v[0] += ks[1];
 		X.v[1] += ks[2];
 		X.v[2] += ks[3];
 		X.v[3] += ks[4];
 		X.v[4 - 1] += 1;
-	}
-	if (Nrounds > 4) {
+	} if (Nrounds > 4) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 5) {
+	} if (Nrounds > 5) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 6) {
+	} if (Nrounds > 6) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 7) {
+	} if (Nrounds > 7) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 7) {
+	} if (Nrounds > 7) {
 		X.v[0] += ks[2];
 		X.v[1] += ks[3];
 		X.v[2] += ks[4];
 		X.v[3] += ks[0];
 		X.v[4 - 1] += 2;
-	}
-	if (Nrounds > 8) {
+	} if (Nrounds > 8) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 9) {
+	} if (Nrounds > 9) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 10) {
+	} if (Nrounds > 10) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 11) {
+	} if (Nrounds > 11) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 11) {
+	} if (Nrounds > 11) {
 		X.v[0] += ks[3];
 		X.v[1] += ks[4];
 		X.v[2] += ks[0];
 		X.v[3] += ks[1];
 		X.v[4 - 1] += 3;
-	}
-	if (Nrounds > 12) {
+	} if (Nrounds > 12) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 13) {
+	} if (Nrounds > 13) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 14) {
+	} if (Nrounds > 14) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 15) {
+	} if (Nrounds > 15) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 15) {
+	} if (Nrounds > 15) {
 		X.v[0] += ks[4];
 		X.v[1] += ks[0];
 		X.v[2] += ks[1];
 		X.v[3] += ks[2];
 		X.v[4 - 1] += 4;
-	}
-	if (Nrounds > 16) {
+	} if (Nrounds > 16) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 17) {
+	} if (Nrounds > 17) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 18) {
+	} if (Nrounds > 18) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 19) {
+	} if (Nrounds > 19) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 19) {
+	} if (Nrounds > 19) {
 		X.v[0] += ks[0];
 		X.v[1] += ks[1];
 		X.v[2] += ks[2];
 		X.v[3] += ks[3];
 		X.v[4 - 1] += 5;
-	}
-	if (Nrounds > 20) {
+	} if (Nrounds > 20) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 21) {
+	} if (Nrounds > 21) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 22) {
+	} if (Nrounds > 22) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 23) {
+	} if (Nrounds > 23) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 23) {
+	} if (Nrounds > 23) {
 		X.v[0] += ks[1];
 		X.v[1] += ks[2];
 		X.v[2] += ks[3];
 		X.v[3] += ks[4];
 		X.v[4 - 1] += 6;
-	}
-	if (Nrounds > 24) {
+	} if (Nrounds > 24) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 25) {
+	} if (Nrounds > 25) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 26) {
+	} if (Nrounds > 26) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 27) {
+	} if (Nrounds > 27) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 27) {
+	} if (Nrounds > 27) {
 		X.v[0] += ks[2];
 		X.v[1] += ks[3];
 		X.v[2] += ks[4];
 		X.v[3] += ks[0];
 		X.v[4 - 1] += 7;
-	}
-	if (Nrounds > 28) {
+	} if (Nrounds > 28) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 29) {
+	} if (Nrounds > 29) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 30) {
+	} if (Nrounds > 30) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 31) {
+	} if (Nrounds > 31) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 31) {
+	} if (Nrounds > 31) {
 		X.v[0] += ks[3];
 		X.v[1] += ks[4];
 		X.v[2] += ks[0];
 		X.v[3] += ks[1];
 		X.v[4 - 1] += 8;
-	}
-	if (Nrounds > 32) {
+	} if (Nrounds > 32) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 33) {
+	} if (Nrounds > 33) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 34) {
+	} if (Nrounds > 34) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 35) {
+	} if (Nrounds > 35) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 35) {
+	} if (Nrounds > 35) {
 		X.v[0] += ks[4];
 		X.v[1] += ks[0];
 		X.v[2] += ks[1];
 		X.v[3] += ks[2];
 		X.v[4 - 1] += 9;
-	}
-	if (Nrounds > 36) {
+	} if (Nrounds > 36) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 37) {
+	} if (Nrounds > 37) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 38) {
+	} if (Nrounds > 38) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 39) {
+	} if (Nrounds > 39) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 39) {
+	} if (Nrounds > 39) {
 		X.v[0] += ks[0];
 		X.v[1] += ks[1];
 		X.v[2] += ks[2];
 		X.v[3] += ks[3];
 		X.v[4 - 1] += 10;
-	}
-	if (Nrounds > 40) {
+	} if (Nrounds > 40) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 41) {
+	} if (Nrounds > 41) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 42) {
+	} if (Nrounds > 42) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 43) {
+	} if (Nrounds > 43) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 43) {
+	} if (Nrounds > 43) {
 		X.v[0] += ks[1];
 		X.v[1] += ks[2];
 		X.v[2] += ks[3];
 		X.v[3] += ks[4];
 		X.v[4 - 1] += 11;
-	}
-	if (Nrounds > 44) {
+	} if (Nrounds > 44) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 45) {
+	} if (Nrounds > 45) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 46) {
+	} if (Nrounds > 46) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 47) {
+	} if (Nrounds > 47) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 47) {
+	} if (Nrounds > 47) {
 		X.v[0] += ks[2];
 		X.v[1] += ks[3];
 		X.v[2] += ks[4];
 		X.v[3] += ks[0];
 		X.v[4 - 1] += 12;
-	}
-	if (Nrounds > 48) {
+	} if (Nrounds > 48) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 49) {
+	} if (Nrounds > 49) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 50) {
+	} if (Nrounds > 50) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 51) {
+	} if (Nrounds > 51) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 51) {
+	} if (Nrounds > 51) {
 		X.v[0] += ks[3];
 		X.v[1] += ks[4];
 		X.v[2] += ks[0];
 		X.v[3] += ks[1];
 		X.v[4 - 1] += 13;
-	}
-	if (Nrounds > 52) {
+	} if (Nrounds > 52) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 53) {
+	} if (Nrounds > 53) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 54) {
+	} if (Nrounds > 54) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 55) {
+	} if (Nrounds > 55) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 55) {
+	} if (Nrounds > 55) {
 		X.v[0] += ks[4];
 		X.v[1] += ks[0];
 		X.v[2] += ks[1];
 		X.v[3] += ks[2];
 		X.v[4 - 1] += 14;
-	}
-	if (Nrounds > 56) {
+	} if (Nrounds > 56) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 57) {
+	} if (Nrounds > 57) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 58) {
+	} if (Nrounds > 58) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 59) {
+	} if (Nrounds > 59) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 59) {
+	} if (Nrounds > 59) {
 		X.v[0] += ks[0];
 		X.v[1] += ks[1];
 		X.v[2] += ks[2];
 		X.v[3] += ks[3];
 		X.v[4 - 1] += 15;
-	}
-	if (Nrounds > 60) {
+	} if (Nrounds > 60) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 61) {
+	} if (Nrounds > 61) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 62) {
+	} if (Nrounds > 62) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 63) {
+	} if (Nrounds > 63) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 63) {
+	} if (Nrounds > 63) {
 		X.v[0] += ks[1];
 		X.v[1] += ks[2];
 		X.v[2] += ks[3];
 		X.v[3] += ks[4];
 		X.v[4 - 1] += 16;
-	}
-	if (Nrounds > 64) {
+	} if (Nrounds > 64) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 65) {
+	} if (Nrounds > 65) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 66) {
+	} if (Nrounds > 66) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 67) {
+	} if (Nrounds > 67) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 67) {
+	} if (Nrounds > 67) {
 		X.v[0] += ks[2];
 		X.v[1] += ks[3];
 		X.v[2] += ks[4];
 		X.v[3] += ks[0];
 		X.v[4 - 1] += 17;
-	}
-	if (Nrounds > 68) {
+	} if (Nrounds > 68) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 69) {
+	} if (Nrounds > 69) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 70) {
+	} if (Nrounds > 70) {
 		X.v[0] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
 		X.v[1] ^= X.v[0];
 		X.v[2] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
 		X.v[3] ^= X.v[2];
-	}
-	if (Nrounds > 71) {
+	} if (Nrounds > 71) {
 		X.v[0] += X.v[3];
 		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
 		X.v[3] ^= X.v[0];
 		X.v[2] += X.v[1];
 		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
 		X.v[1] ^= X.v[2];
-	}
-	if (Nrounds > 71) {
+	} if (Nrounds > 71) {
 		X.v[0] += ks[3];
 		X.v[1] += ks[4];
 		X.v[2] += ks[0];
 		X.v[3] += ks[1];
 		X.v[4 - 1] += 18;
-	}
+	} 
 	return X;
-}
+} 
 
 template <class T>
 __kernel void PRNG_threefry4x32_bernoulli(
@@ -812,31 +718,32 @@ __kernel void PRNG_threefry4x32_bernoulli(
 ){
         size_t  gdx = get_global_id(0);
 
-	uint maxUint = 0;
-	maxUint--;
-	float r = (float)maxUint;
-
-	threefry4x32_ctr_t ctr = ctr_i;
-	threefry4x32_ukey_t ukey;
-
-	ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+        uint maxUint = 0;
+        maxUint--;
+        float r = (float)maxUint;
 
-	threefry4x32_ctr_t random4;
+        threefry4x32_ctr_t      ctr = ctr_i; 
+        threefry4x32_ukey_t ukey;
 
-	if ( gdx < numrandom )
-	{
-		random4 = threefry4x32_R(nrounds, ctr, ukey);
-		uint4 frnd;
+        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
 
-		frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-		frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-		frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-		frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+        threefry4x32_ctr_t  random4;
 
-		randomnumber[gdx] = frnd;
-	}
+        if ( gdx < numrandom )
+        {
+                random4 = threefry4x32_R(nrounds, ctr, ukey);
+                uint4 frnd;
+				
+                frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+                frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+				
+                randomnumber[gdx] = frnd;
+        }
 }
 
+
 template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
 
 template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
@@ -885,8 +792,8 @@ template <class T>
 __kernel void PRNG_threefry4x32_gaussian(
 	__global float4 *randomnumber, 
 	threefry4x32_ctr_t ctr_i,
-	float E,
-	float V,
+	T E,
+	T V,
 	uint nrounds,
 	uint numrandom
 ){
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index a3207f6c..3275d75c 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -901,6 +901,7 @@ void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
 
 template <>
 void caffe_gpu_rng_gaussian<float>(const int n, const float mu, const float sigma,
+                                  float* r) {
 	caffe_gpu_gaussian(r, n, mu, sigma);  // r is a cl_mem object
 }
 

From 900beb88a042d90cdadbf52b87766282d03ab89e Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Thu, 10 Sep 2015 20:43:47 +0800
Subject: [PATCH 094/124] Add uint random generator

---
 src/caffe/layers/dropout_layer.cpp | 20 +++++++++++++++++++
 src/caffe/ocl/random.cl            | 31 ++++++++++++++++++++++++++++++
 src/caffe/util/ocl_wrapper.cpp     | 27 ++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)

diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index c84c8622..de8f5607 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -77,6 +77,23 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+#define CHECK_GLOBAL_INT_MEM_DATA(global_mem, count, num, marker)\
+do{ \
+  int *global_mem_cpu = new int[count]; \
+  clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)global_mem, \
+              CL_TRUE, 0, sizeof(int)*count, global_mem_cpu,0, NULL, NULL); \
+  size_t sample_interval = count/num; \
+  if(sample_interval == 0){ \
+     sample_interval=1; \
+  } \
+  printf("%s: ", marker); \
+  for(int i=0; i<count; i+=sample_interval){ \
+      printf("%d  ", global_mem_cpu[i]); \
+  } \
+  printf("\n\n"); \
+  delete []global_mem_cpu; \
+}while(0)
+
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
@@ -101,6 +118,7 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	} else {
 		caffe_gpu_copy(count, bottom_data, top_data);
 	}
+CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask");
 }
 
 template <typename Dtype>
@@ -117,6 +135,8 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 		} else {
 			caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
 		}
+               CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask");
+               CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff");
 	}
 }
 
diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl
index 058f41d7..438931ec 100644
--- a/src/caffe/ocl/random.cl
+++ b/src/caffe/ocl/random.cl
@@ -788,6 +788,37 @@ template __attribute__((mangled_name(RNGUniform_float))) __kernel void PRNG_thre
 
 template __attribute__((mangled_name(RNGUniform_double))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, uint nrounds, uint numrandonm);
 
+
+__kernel void PRNG_threefry4x32_uint_uniform(
+        __global uint4 *randomnumber,
+        threefry4x32_ctr_t ctr_i,
+        uint inf,
+        uint sup,
+        uint nrounds,
+        uint numrandom
+){
+        size_t  gdx = get_global_id(0);
+
+        threefry4x32_ctr_t      ctr = ctr_i; 
+        threefry4x32_ukey_t ukey;
+
+        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+        threefry4x32_ctr_t  random4;
+
+        if ( gdx < numrandom )
+        {
+                random4 = threefry4x32_R(nrounds, ctr, ukey);
+                uint4 frnd;
+                frnd.x =  random4.v[0] % (sup - inf) + inf;
+                frnd.y =  random4.v[1] % (sup - inf) + inf;
+                frnd.z =  random4.v[2] % (sup - inf) + inf;
+                frnd.w =  random4.v[3] % (sup - inf) + inf;
+                randomnumber[gdx] = frnd;
+        }
+}
+
+
 template <class T>
 __kernel void PRNG_threefry4x32_gaussian(
 	__global float4 *randomnumber, 
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 20535868..75b69215 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -145,6 +145,33 @@ void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup)
 template void caffe_gpu_uniform<float>(float* a, const unsigned int n, float inf, float sup);
 template void caffe_gpu_uniform<double>(double* a, const unsigned int n, double inf, double sup);
 
+void caffe_gpu_uniform(const unsigned int n, unsigned int *r)
+{
+        std::string kernel_name = "PRNG_threefry4x32_uint_uniform";
+        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+        static unsigned c = 0;
+        unsigned nrounds = 20;
+        array4x32  rndctr4;
+        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+        
+        cl_uint inf = 0;
+        cl_uint sup = UINT_MAX;
+        cl_int ret;
+        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&r);
+        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
+        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_uint),   (void*)&inf);
+        ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_uint),   (void*)&sup);
+        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint),    (void*)&nrounds);
+        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&size);
+        OCL_CHECK(ret);
+
+        size_t globalws[1] = {size};
+        size_t localws[1] = {256};
+        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
+}
+
 template <typename Dtype>
 void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V)
 {

From 4adb3d25c5379a0118d4f323394543af1832f485 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Thu, 10 Sep 2015 08:47:08 -0700
Subject: [PATCH 095/124] Update README.md

---
 README.md | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 2c9a0ef1..faf725dd 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
 #OpenCL caffe
 
-This is an OpenCL implementation of the popular caffe DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome.
+This is an OpenCL implementation of caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome.
+
+OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language is supported by more than 20 companies, enabling this DNN framework to be used on heterogeneous platforms from a variety of  commercial chip manufacturers. 
 
 #Design features
   -All layers ported to OpenCL
 
-  -Performance improvement by batched sgemm implementation for conv layer
+  -Performance improvement by batched implementation for conv layer based on clBLAS
 
   -User can choose optimal batch number depending on H/W, image size and minibatch size
 
@@ -13,7 +15,7 @@ This is an OpenCL implementation of the popular caffe DNN framework (https://git
   
   -Implemented in C++ and OpenCL, maintaining the same interfaces as original caffe to make it easy for caffe users
 
-  -Users can directly run DNN models: AlexNet, VGG 16 and VGG-19
+  -Users can directly run DNN models: AlexNet, VGG-16 and VGG-19
 
 Note: More features will be added in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to be added in the future.
 
@@ -38,10 +40,10 @@ For more information on how to install, use or contribute to this code base, ple
 https://github.com/amd/OpenCL-caffe/wiki
 
 #Support needed
-We encourage the contribution and support from the community to improve it together.
+ As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together.
 
 #License
-Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
+Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
 
 # Original Caffe information
 ## Caffe

From d2a24e6815e1cdd87d9d969e0c9c4aff57f8cb31 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Thu, 10 Sep 2015 08:51:09 -0700
Subject: [PATCH 096/124] Update README.md

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index faf725dd..dd3933e6 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,10 @@
 
 This is an OpenCL implementation of caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome.
 
-OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language is supported by more than 20 companies, enabling this DNN framework to be used on heterogeneous platforms from a variety of  commercial chip manufacturers. 
+OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language for heterogeneous platforms. OpenCL is supported by a variety of commercial chip manufacturers. 
 
 #Design features
-  -All layers ported to OpenCL
+  -All caffe layers ported to OpenCL
 
   -Performance improvement by batched implementation for conv layer based on clBLAS
 
@@ -37,13 +37,13 @@ We will keep updating the latest performance as we make optimizations. Fury resu
 
 #Wiki
 For more information on how to install, use or contribute to this code base, please visit our wiki page:
-https://github.com/amd/OpenCL-caffe/wiki
+ https://github.com/amd/OpenCL-caffe/wiki
 
 #Support needed
  As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together.
 
 #License
-Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or which ever your preferred licence.
+Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or whichever your preferred license.
 
 # Original Caffe information
 ## Caffe

From 280f8139dce78e5c239ad32569199934bb88d562 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Thu, 10 Sep 2015 16:20:04 -0700
Subject: [PATCH 097/124] add notation

---
 src/caffe/layers/absval_layer.cpp           |   1 +
 src/caffe/layers/base_conv_layer.cpp        | 114 ++++++++++----------
 src/caffe/layers/bnll_layer.cpp             |   1 +
 src/caffe/layers/concat_layer.cpp           |   1 +
 src/caffe/layers/contrastive_loss_layer.cpp |   1 +
 src/caffe/layers/conv_layer.cpp             |   2 +
 src/caffe/layers/dropout_layer.cpp          |   4 +-
 src/caffe/layers/eltwise_layer.cpp          |   1 +
 src/caffe/layers/euclidean_loss_layer.cpp   |   1 +
 src/caffe/layers/exp_layer.cpp              |   1 +
 src/caffe/layers/filter_layer.cpp           |   1 +
 src/caffe/layers/hdf5_data_layer.cpp        |   1 +
 src/caffe/layers/hdf5_output_layer.cpp      |   1 +
 src/caffe/layers/pooling_layer.cpp          |   2 +
 src/caffe/layers/power_layer.cpp            |   3 +-
 src/caffe/layers/softmax_layer.cpp          |   4 +-
 src/caffe/layers/softmax_loss_layer.cpp     |   3 +-
 src/caffe/layers/split_layer.cpp            |   4 +-
 src/caffe/syncedmem.cpp                     |   8 +-
 19 files changed, 86 insertions(+), 68 deletions(-)

diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index 85faa8d3..5dc99b75 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -35,6 +35,7 @@ void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index cefa8a66..149b1a21 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -295,6 +295,65 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
 	}
 }
 
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
+		const Dtype* bias) {
+	caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
+			height_out_ * width_out_, 1, (Dtype) 1., bias, 0,
+			reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
+			(Dtype) 1., output, top_offset_);
+}
+
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
+		const Dtype* weights, Dtype* input) {
+	Dtype* col_buff = col_buffer_.mutable_gpu_data();
+	if (is_1x1_) {
+		col_buff = input;
+	}
+	for (int g = 0; g < group_; ++g) {
+		caffe_gpu_gemm < Dtype
+				> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
+						/ group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
+						(Dtype) 1., weights, weight_offset_ * g,
+						output, top_offset_ + output_offset_ * g,
+						(Dtype) 0., col_buff, col_offset_ * g);
+	}
+	if (!is_1x1_) {
+		conv_col2im_gpu(col_buff, input);
+	}
+}
+
+
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
+		const Dtype* output, Dtype* weights) {
+	const Dtype* col_buff = input;
+	if (!is_1x1_) {
+		conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
+		col_buff = col_buffer_.gpu_data();
+	}
+	for (int g = 0; g < group_; ++g) {
+		caffe_gpu_gemm < Dtype
+				> (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_
+						/ group_, kernel_dim_ / group_, conv_out_spatial_dim_,
+						(Dtype) 1., output, top_offset_,
+						(Dtype*) col_buff, col_offset_ * g, (Dtype) 1.,
+						(Dtype*) weights, weight_offset_ * g);
+	}
+}
+
+template <typename Dtype>
+void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
+		const Dtype* input) {
+	caffe_gpu_gemv < Dtype
+			> (CblasNoTrans, num_output_, N_,
+					(Dtype) 1., input, top_offset_, N_,
+					reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1,
+					bias, (size_t) 0, 1);
+}
+
+// begin: code written/modified by AMD
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
 		const Dtype* weight, Dtype* output, bool skip_im2col) {
@@ -335,14 +394,6 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
 			opt_num2);
 }
 
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
-		const Dtype* bias) {
-	caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
-			height_out_ * width_out_, 1, (Dtype) 1., bias, 0,
-			reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-			(Dtype) 1., output, top_offset_);
-}
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias_opt(Dtype* output,
@@ -354,25 +405,6 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_bias_opt(Dtype* output,
 				(Dtype) 1., output, top_offset_ + num_output_ * N_ * z);
 }
 
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
-		const Dtype* weights, Dtype* input) {
-	Dtype* col_buff = col_buffer_.mutable_gpu_data();
-	if (is_1x1_) {
-		col_buff = input;
-	}
-	for (int g = 0; g < group_; ++g) {
-		caffe_gpu_gemm < Dtype
-				> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
-						/ group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
-						(Dtype) 1., weights, weight_offset_ * g,
-						output, top_offset_ + output_offset_ * g,
-						(Dtype) 0., col_buff, col_offset_ * g);
-	}
-	if (!is_1x1_) {
-		conv_col2im_gpu(col_buff, input);
-	}
-}
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
@@ -412,23 +444,6 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
 	}
 }
 
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
-		const Dtype* output, Dtype* weights) {
-	const Dtype* col_buff = input;
-	if (!is_1x1_) {
-		conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
-		col_buff = col_buffer_.gpu_data();
-	}
-	for (int g = 0; g < group_; ++g) {
-		caffe_gpu_gemm < Dtype
-				> (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_
-						/ group_, kernel_dim_ / group_, conv_out_spatial_dim_,
-						(Dtype) 1., output, top_offset_,
-						(Dtype*) col_buff, col_offset_ * g, (Dtype) 1.,
-						(Dtype*) weights, weight_offset_ * g);
-	}
-}
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
 		const Dtype* output, Dtype* weights) {
@@ -463,16 +478,7 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
 	}
 }
 
-template <typename Dtype>
-void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
-		const Dtype* input) {
-	caffe_gpu_gemv < Dtype
-			> (CblasNoTrans, num_output_, N_,
-					(Dtype) 1., input, top_offset_, N_,
-					reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1,
-					bias, (size_t) 0, 1);
-}
-
+// end: code is written/modified by AMD
 #endif  // !CPU_ONLY
 
 INSTANTIATE_CLASS (BaseConvolutionLayer);
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index 11b78a15..ad422a11 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -39,6 +39,7 @@ void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 7d55ef40..28aac6b2 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -92,6 +92,7 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index 6a91fdfd..f6265726 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -111,6 +111,7 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_gpu(
 		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index bbe07f37..0a989f69 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -69,6 +69,7 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
@@ -234,6 +235,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
 	}
 
 }
+// end: code written/modified by AMD
 
 #ifdef CPU_ONLY
 STUB_GPU(ConvolutionLayer);
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index de8f5607..6692f238 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -94,6 +94,7 @@ do{ \
   delete []global_mem_cpu; \
 }while(0)
 
+// begin: code is written/modified by AMD
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
@@ -116,6 +117,7 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 				top_data);
 #endif
 	} else {
+             if(bottom_data != top_data)
 		caffe_gpu_copy(count, bottom_data, top_data);
 	}
 CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask");
@@ -139,7 +141,7 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
                CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff");
 	}
 }
-
+// end: code is written/modified by AMD
 #ifdef CPU_ONLY
 STUB_GPU(DropoutLayer);
 #endif
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index e7b97b0d..b904ad39 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -153,6 +153,7 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index 56dc48ec..9107f119 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -47,6 +47,7 @@ void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index bf783786..087da677 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -61,6 +61,7 @@ void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index f7096a09..05dc2783 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -117,6 +117,7 @@ void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 6f67dc06..6c6d8dec 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -158,6 +158,7 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index baad0dea..a8c062bc 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -67,6 +67,7 @@ void HDF5OutputLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	return;
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 92c71582..47830228 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -309,6 +309,7 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
@@ -407,6 +408,7 @@ void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// end: code written/modified by AMD
 #ifdef CPU_ONLY
 STUB_GPU(PoolingLayer);
 #endif
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index 93ef9e1f..0cf82c35 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -96,6 +96,7 @@ void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
@@ -168,7 +169,7 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 		caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
 	}
 }
-
+// end: code written/modified by AMD
 #ifdef CPU_ONLY
 STUB_GPU(PowerLayer);
 #endif
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index d4cab577..feb15321 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -91,7 +91,7 @@ void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	// elementwise multiplication
 	caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
 }
-
+// begin: code written/modified by AMD
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
@@ -148,7 +148,7 @@ void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 	caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff);
 
 }
-
+// end: code written/modified by AMD
 #ifdef CPU_ONLY
 STUB_GPU(SoftmaxLayer);
 #endif
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 86a0d37a..6b9e9e67 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -133,6 +133,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
 		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -198,7 +199,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 		}
 	}
 }
-
+// end: code written/modified by AMD
 #ifdef CPU_ONLY
 STUB_GPU(SoftmaxWithLossLayer);
 #endif
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 8b19d293..54bea0d6 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -61,6 +61,7 @@ void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 	}
 }
 
+// begin: code written/modified by AMD
 template <typename Dtype>
 void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
@@ -79,9 +80,8 @@ void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
 		caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
 	}
-
 }
-
+// end: code written/modified by AMD
 #ifdef CPU_ONLY
 STUB_GPU(SplitLayer);
 #endif
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 67f5984b..976130bf 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -52,6 +52,7 @@ SyncedMemory::~SyncedMemory() {
 	clReleaseKernel (oclmem_kernel);
 }
 
+//begin: code written/modified by AMD.
 void SyncedMemory::ocl_setup() {
 	cl_int err = 0;
 	oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
@@ -63,7 +64,6 @@ inline void SyncedMemory::to_cpu() {
 		case UNINITIALIZED:
 			gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
 					size_, NULL, NULL);
-			//}
 			cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
 					(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
 					size_,
@@ -89,9 +89,6 @@ inline void SyncedMemory::to_cpu() {
 			head_ = SYNCED;
 #else
 			NO_GPU;
-#endif
-#ifdef Track_data_transfer
-			LOG(WARNING) << "sync: data from GPU to CPU";
 #endif
 			break;
 		}
@@ -130,9 +127,6 @@ inline void SyncedMemory::to_gpu() {
 							(cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
 			clFinish(amdDevice.CommandQueue);
 			head_ = SYNCED;
-#ifdef Track_data_transfer
-			LOG(WARNING) << "sync: data from CPU to GPU";
-#endif
 			break;
 		}
 		case HEAD_AT_GPU:

From 30d5f21c70d16f446cdcae233683789cc996e022 Mon Sep 17 00:00:00 2001
From: Noplz <yuan.gao@noplz.name>
Date: Fri, 11 Sep 2015 13:59:17 +0800
Subject: [PATCH 098/124] Adjust the indent

---
 include/caffe/blob.hpp                        |  501 ++-
 include/caffe/common.hpp                      |  128 +-
 include/caffe/common_layers.hpp               | 1113 ++++---
 include/caffe/data_layers.hpp                 |  636 ++--
 include/caffe/data_transformer.hpp            |  256 +-
 include/caffe/device.hpp                      |   77 +-
 include/caffe/filler.hpp                      |  348 +-
 include/caffe/internal_thread.hpp             |   33 +-
 include/caffe/layer.hpp                       |  850 +++--
 include/caffe/layer_factory.hpp               |  102 +-
 include/caffe/loss_layers.hpp                 | 1094 +++----
 include/caffe/net.hpp                         |  484 +--
 include/caffe/neuron_layers.hpp               | 1342 ++++----
 include/caffe/python_layer.hpp                |   95 +-
 include/caffe/solver.hpp                      |  288 +-
 include/caffe/syncedmem.hpp                   |   98 +-
 include/caffe/test/test_caffe_main.hpp        |   39 +-
 .../caffe/test/test_gradient_check_util.hpp   |  438 +--
 include/caffe/util/benchmark.hpp              |   80 +-
 include/caffe/util/cudnn.hpp                  |  212 +-
 include/caffe/util/db.hpp                     |   66 +-
 include/caffe/util/db_leveldb.hpp             |  131 +-
 include/caffe/util/db_lmdb.hpp                |  149 +-
 include/caffe/util/im2col.hpp                 |   84 +-
 include/caffe/util/insert_splits.hpp          |    8 +-
 include/caffe/util/io.hpp                     |  116 +-
 include/caffe/util/math_functions.hpp         |   79 +-
 include/caffe/util/mkl_alternate.hpp          |   14 +-
 include/caffe/util/ocl_util.hpp               |    2 +-
 include/caffe/util/ocl_wrapper.hpp            |  246 +-
 include/caffe/util/rng.hpp                    |   29 +-
 include/caffe/util/upgrade_proto.hpp          |   10 +-
 include/caffe/vision_layers.hpp               |  961 +++---
 src/caffe/blob.cpp                            |  650 ++--
 src/caffe/common.cpp                          |  147 +-
 src/caffe/data_transformer.cpp                |  928 +++---
 src/caffe/device.cpp                          |  712 ++--
 src/caffe/internal_thread.cpp                 |   40 +-
 src/caffe/layer_factory.cpp                   |  179 +-
 src/caffe/layers/absval_layer.cpp             |   62 +-
 src/caffe/layers/accuracy_layer.cpp           |  124 +-
 src/caffe/layers/argmax_layer.cpp             |   73 +-
 src/caffe/layers/base_conv_layer.cpp          |  721 ++--
 src/caffe/layers/base_data_layer.cpp          |  150 +-
 src/caffe/layers/bnll_layer.cpp               |   76 +-
 src/caffe/layers/concat_layer.cpp             |  214 +-
 src/caffe/layers/contrastive_loss_layer.cpp   |  278 +-
 src/caffe/layers/conv_layer.cpp               |  369 +--
 src/caffe/layers/data_layer.cpp               |  176 +-
 src/caffe/layers/deconv_layer.cpp             |  194 +-
 src/caffe/layers/dropout_layer.cpp            |  161 +-
 src/caffe/layers/dummy_data_layer.cpp         |  189 +-
 src/caffe/layers/eltwise_layer.cpp            |  420 +--
 src/caffe/layers/euclidean_loss_layer.cpp     |   98 +-
 src/caffe/layers/exp_layer.cpp                |  134 +-
 src/caffe/layers/filter_layer.cpp             |  286 +-
 src/caffe/layers/flatten_layer.cpp            |   40 +-
 src/caffe/layers/hdf5_data_layer.cpp          |  311 +-
 src/caffe/layers/hdf5_output_layer.cpp        |  122 +-
 src/caffe/layers/hinge_loss_layer.cpp         |  110 +-
 src/caffe/layers/im2col_layer.cpp             |  158 +-
 src/caffe/layers/image_data_layer.cpp         |  243 +-
 src/caffe/layers/infogain_loss_layer.cpp      |  159 +-
 src/caffe/layers/inner_product_layer.cpp      |  257 +-
 src/caffe/layers/log_layer.cpp                |  198 +-
 src/caffe/layers/loss_layer.cpp               |   24 +-
 src/caffe/layers/lrn_layer.cpp                |  509 +--
 src/caffe/layers/memory_data_layer.cpp        |  166 +-
 .../multinomial_logistic_loss_layer.cpp       |   77 +-
 src/caffe/layers/mvn_layer.cpp                |  424 ++-
 src/caffe/layers/neuron_layer.cpp             |    4 +-
 src/caffe/layers/pooling_layer.cpp            |  736 ++---
 src/caffe/layers/power_layer.cpp              |  277 +-
 src/caffe/layers/prelu_layer.cpp              |  337 +-
 src/caffe/layers/reduction_layer.cpp          |  352 +-
 src/caffe/layers/relu_layer.cpp               |   74 +-
 src/caffe/layers/reshape_layer.cpp            |  154 +-
 .../sigmoid_cross_entropy_loss_layer.cpp      |  131 +-
 src/caffe/layers/sigmoid_layer.cpp            |   69 +-
 src/caffe/layers/silence_layer.cpp            |   31 +-
 src/caffe/layers/slice_layer.cpp              |  181 +-
 src/caffe/layers/softmax_layer.cpp            |  226 +-
 src/caffe/layers/softmax_loss_layer.cpp       |  314 +-
 src/caffe/layers/split_layer.cpp              |  110 +-
 src/caffe/layers/spp_layer.cpp                |  303 +-
 src/caffe/layers/tanh_layer.cpp               |   69 +-
 src/caffe/layers/threshold_layer.cpp          |   32 +-
 src/caffe/layers/window_data_layer.cpp        |  768 +++--
 src/caffe/net.cpp                             | 1507 +++++----
 src/caffe/ocl/bnll_layer.cl                   |   24 +-
 src/caffe/ocl/concat_layer.cl                 |   44 +-
 src/caffe/ocl/contrastive_loss_layer.cl       |   64 +-
 src/caffe/ocl/dropout_layer.cl                |   12 +-
 src/caffe/ocl/eltwise_layer.cl                |   72 +-
 src/caffe/ocl/im2col.cl                       |  398 +--
 src/caffe/ocl/lrn_layer.cl                    |  190 +-
 src/caffe/ocl/pooling_layer.cl                |  446 +--
 src/caffe/ocl/prelu_layer.cl                  |   34 +-
 src/caffe/ocl/random.cl                       | 1720 +++++-----
 src/caffe/ocl/relu_layer.cl                   |   14 +-
 src/caffe/ocl/sigmoid_layer.cl                |   14 +-
 src/caffe/ocl/softmax_layer.cl                |  192 +-
 src/caffe/ocl/softmaxwithloss_layer.cl        |  112 +-
 src/caffe/ocl/tanh_layer.cl                   |   14 +-
 src/caffe/ocl/threshold_layer.cl              |    6 +-
 src/caffe/ocl/util.cl                         |  160 +-
 src/caffe/solver.cpp                          | 1309 ++++----
 src/caffe/syncedmem.cpp                       |  207 +-
 src/caffe/util/benchmark.cpp                  |  141 +-
 src/caffe/util/cudnn.cpp                      |   28 +-
 src/caffe/util/db.cpp                         |   30 +-
 src/caffe/util/db_leveldb.cpp                 |   20 +-
 src/caffe/util/db_lmdb.cpp                    |   54 +-
 src/caffe/util/im2col.cpp                     |  570 ++--
 src/caffe/util/im2col.cu                      |  197 +-
 src/caffe/util/insert_splits.cpp              |  241 +-
 src/caffe/util/io.cpp                         |  414 ++-
 src/caffe/util/math_functions.cpp             |  948 +++---
 src/caffe/util/math_functions.cu              |  209 +-
 src/caffe/util/ocl_util.cpp                   |   72 +-
 src/caffe/util/ocl_wrapper.cpp                | 2903 ++++++++---------
 src/caffe/util/upgrade_proto.cpp              | 1761 +++++-----
 122 files changed, 18665 insertions(+), 18918 deletions(-)

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index 26a75558..9f22a082 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -23,277 +23,276 @@ namespace caffe {
  */
 template <typename Dtype>
 class Blob {
-	public:
-		Blob()
-		:
-				data_(), diff_(), count_(0), capacity_(0) {
-		}
+  public:
+    Blob()
+        : data_(), diff_(), count_(0), capacity_(0) {
+    }
 
-		/// @brief Deprecated; use <code>Blob(const vector<int>& shape)</code>.
-		explicit Blob(const int num, const int channels, const int height,
-				const int width);
-		explicit Blob(const vector<int>& shape);
+    /// @brief Deprecated; use <code>Blob(const vector<int>& shape)</code>.
+    explicit Blob(const int num, const int channels, const int height,
+        const int width);
+    explicit Blob(const vector<int>& shape);
 
-		/// @brief Deprecated; use <code>Reshape(const vector<int>& shape)</code>.
-		void Reshape(const int num, const int channels, const int height,
-				const int width);
-		/**
-		 * @brief Change the dimensions of the blob, allocating new memory if
-		 *        necessary.
-		 *
-		 * This function can be called both to create an initial allocation
-		 * of memory, and to adjust the dimensions of a top blob during Layer::Reshape
-		 * or Layer::Forward. When changing the size of blob, memory will only be
-		 * reallocated if sufficient memory does not already exist, and excess memory
-		 * will never be freed.
-		 *
-		 * Note that reshaping an input blob and immediately calling Net::Backward is
-		 * an error; either Net::Forward or Net::Reshape need to be called to
-		 * propagate the new input shape to higher layers.
-		 */
-		void Reshape(const vector<int>& shape);
-		void Reshape(const BlobShape& shape);
-		void ReshapeLike(const Blob& other);
-		inline string shape_string() const {
-			ostringstream stream;
-			for (int i = 0; i < shape_.size(); ++i) {
-				stream << shape_[i] << " ";
-			}
-			stream << "(" << count_ << ")";
-			return stream.str();
-		}
-		inline const vector<int>& shape() const {
-			return shape_;
-		}
-		/**
-		 * @brief Returns the dimension of the index-th axis (or the negative index-th
-		 *        axis from the end, if index is negative).
-		 *
-		 * @param index the axis index, which may be negative as it will be
-		 *        "canonicalized" using CanonicalAxisIndex.
-		 *        Dies on out of range index.
-		 */
-		inline int shape(int index) const {
-			return shape_[CanonicalAxisIndex(index)];
-		}
-		inline int num_axes() const {
-			return shape_.size();
-		}
-		inline int count() const {
-			return count_;
-		}
+    /// @brief Deprecated; use <code>Reshape(const vector<int>& shape)</code>.
+    void Reshape(const int num, const int channels, const int height,
+        const int width);
+    /**
+     * @brief Change the dimensions of the blob, allocating new memory if
+     *        necessary.
+     *
+     * This function can be called both to create an initial allocation
+     * of memory, and to adjust the dimensions of a top blob during Layer::Reshape
+     * or Layer::Forward. When changing the size of blob, memory will only be
+     * reallocated if sufficient memory does not already exist, and excess memory
+     * will never be freed.
+     *
+     * Note that reshaping an input blob and immediately calling Net::Backward is
+     * an error; either Net::Forward or Net::Reshape need to be called to
+     * propagate the new input shape to higher layers.
+     */
+    void Reshape(const vector<int>& shape);
+    void Reshape(const BlobShape& shape);
+    void ReshapeLike(const Blob& other);
+    inline string shape_string() const {
+      ostringstream stream;
+      for (int i = 0; i < shape_.size(); ++i) {
+        stream << shape_[i] << " ";
+      }
+      stream << "(" << count_ << ")";
+      return stream.str();
+    }
+    inline const vector<int>& shape() const {
+      return shape_;
+    }
+    /**
+     * @brief Returns the dimension of the index-th axis (or the negative index-th
+     *        axis from the end, if index is negative).
+     *
+     * @param index the axis index, which may be negative as it will be
+     *        "canonicalized" using CanonicalAxisIndex.
+     *        Dies on out of range index.
+     */
+    inline int shape(int index) const {
+      return shape_[CanonicalAxisIndex(index)];
+    }
+    inline int num_axes() const {
+      return shape_.size();
+    }
+    inline int count() const {
+      return count_;
+    }
 
-		/**
-		 * @brief Compute the volume of a slice; i.e., the product of dimensions
-		 *        among a range of axes.
-		 *
-		 * @param start_axis The first axis to include in the slice.
-		 *
-		 * @param end_axis The first axis to exclude from the slice.
-		 */
-		inline int count(int start_axis, int end_axis) const {
-			CHECK_LE(start_axis, end_axis);
-			CHECK_GE(start_axis, 0);
-			CHECK_GE(end_axis, 0);
-			CHECK_LE(start_axis, num_axes());
-			CHECK_LE(end_axis, num_axes());
-			int count = 1;
-			for (int i = start_axis; i < end_axis; ++i) {
-				count *= shape(i);
-			}
-			return count;
-		}
-		/**
-		 * @brief Compute the volume of a slice spanning from a particular first
-		 *        axis to the final axis.
-		 *
-		 * @param start_axis The first axis to include in the slice.
-		 */
-		inline int count(int start_axis) const {
-			return count(start_axis, num_axes());
-		}
+    /**
+     * @brief Compute the volume of a slice; i.e., the product of dimensions
+     *        among a range of axes.
+     *
+     * @param start_axis The first axis to include in the slice.
+     *
+     * @param end_axis The first axis to exclude from the slice.
+     */
+    inline int count(int start_axis, int end_axis) const {
+      CHECK_LE(start_axis, end_axis);
+      CHECK_GE(start_axis, 0);
+      CHECK_GE(end_axis, 0);
+      CHECK_LE(start_axis, num_axes());
+      CHECK_LE(end_axis, num_axes());
+      int count = 1;
+      for (int i = start_axis; i < end_axis; ++i) {
+        count *= shape(i);
+      }
+      return count;
+    }
+    /**
+     * @brief Compute the volume of a slice spanning from a particular first
+     *        axis to the final axis.
+     *
+     * @param start_axis The first axis to include in the slice.
+     */
+    inline int count(int start_axis) const {
+      return count(start_axis, num_axes());
+    }
 
-		/**
-		 * @brief Returns the 'canonical' version of a (usually) user-specified axis,
-		 *        allowing for negative indexing (e.g., -1 for the last axis).
-		 *
-		 * @param index the axis index.
-		 *        If 0 <= index < num_axes(), return index.
-		 *        If -num_axes <= index <= -1, return (num_axes() - (-index)),
-		 *        e.g., the last axis index (num_axes() - 1) if index == -1,
-		 *        the second to last if index == -2, etc.
-		 *        Dies on out of range index.
-		 */
-		inline int CanonicalAxisIndex(int axis_index) const {
-			CHECK_GE(axis_index, -num_axes())
-					<< "axis " << axis_index << " out of range for " << num_axes()
-					<< "-D Blob with shape " << shape_string();
-			CHECK_LT(axis_index, num_axes())
-					<< "axis " << axis_index << " out of range for " << num_axes()
-					<< "-D Blob with shape " << shape_string();
-			if (axis_index < 0) {
-				return axis_index + num_axes();
-			}
-			return axis_index;
-		}
+    /**
+     * @brief Returns the 'canonical' version of a (usually) user-specified axis,
+     *        allowing for negative indexing (e.g., -1 for the last axis).
+     *
+     * @param index the axis index.
+     *        If 0 <= index < num_axes(), return index.
+     *        If -num_axes <= index <= -1, return (num_axes() - (-index)),
+     *        e.g., the last axis index (num_axes() - 1) if index == -1,
+     *        the second to last if index == -2, etc.
+     *        Dies on out of range index.
+     */
+    inline int CanonicalAxisIndex(int axis_index) const {
+      CHECK_GE(axis_index, -num_axes()) << "axis " << axis_index
+          << " out of range for " << num_axes() << "-D Blob with shape "
+          << shape_string();
+      CHECK_LT(axis_index, num_axes()) << "axis " << axis_index
+          << " out of range for " << num_axes() << "-D Blob with shape "
+          << shape_string();
+      if (axis_index < 0) {
+        return axis_index + num_axes();
+      }
+      return axis_index;
+    }
 
-		/// @brief Deprecated legacy shape accessor num: use shape(0) instead.
-		inline int num() const {
-			return LegacyShape(0);
-		}
-		/// @brief Deprecated legacy shape accessor channels: use shape(1) instead.
-		inline int channels() const {
-			return LegacyShape(1);
-		}
-		/// @brief Deprecated legacy shape accessor height: use shape(2) instead.
-		inline int height() const {
-			return LegacyShape(2);
-		}
-		/// @brief Deprecated legacy shape accessor width: use shape(3) instead.
-		inline int width() const {
-			return LegacyShape(3);
-		}
-		inline int LegacyShape(int index) const {
-			CHECK_LE(num_axes(), 4)
-					<< "Cannot use legacy accessors on Blobs with > 4 axes.";
-			CHECK_LT(index, 4);
-			CHECK_GE(index, -4);
-			if (index >= num_axes() || index < -num_axes()) {
-				// Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse
-				// indexing) -- this special case simulates the one-padding used to fill
-				// extraneous axes of legacy blobs.
-				return 1;
-			}
-			return shape(index);
-		}
+    /// @brief Deprecated legacy shape accessor num: use shape(0) instead.
+    inline int num() const {
+      return LegacyShape(0);
+    }
+    /// @brief Deprecated legacy shape accessor channels: use shape(1) instead.
+    inline int channels() const {
+      return LegacyShape(1);
+    }
+    /// @brief Deprecated legacy shape accessor height: use shape(2) instead.
+    inline int height() const {
+      return LegacyShape(2);
+    }
+    /// @brief Deprecated legacy shape accessor width: use shape(3) instead.
+    inline int width() const {
+      return LegacyShape(3);
+    }
+    inline int LegacyShape(int index) const {
+      CHECK_LE(num_axes(), 4)
+          << "Cannot use legacy accessors on Blobs with > 4 axes.";
+      CHECK_LT(index, 4);
+      CHECK_GE(index, -4);
+      if (index >= num_axes() || index < -num_axes()) {
+        // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse
+        // indexing) -- this special case simulates the one-padding used to fill
+        // extraneous axes of legacy blobs.
+        return 1;
+      }
+      return shape(index);
+    }
 
-		inline int offset(const int n, const int c = 0, const int h = 0,
-				const int w = 0) const {
-			CHECK_GE(n, 0);
-			CHECK_LE(n, num());
-			CHECK_GE(channels(), 0);
-			CHECK_LE(c, channels());
-			CHECK_GE(height(), 0);
-			CHECK_LE(h, height());
-			CHECK_GE(width(), 0);
-			CHECK_LE(w, width());
-			return ((n * channels() + c) * height() + h) * width() + w;
-		}
+    inline int offset(const int n, const int c = 0, const int h = 0,
+        const int w = 0) const {
+      CHECK_GE(n, 0);
+      CHECK_LE(n, num());
+      CHECK_GE(channels(), 0);
+      CHECK_LE(c, channels());
+      CHECK_GE(height(), 0);
+      CHECK_LE(h, height());
+      CHECK_GE(width(), 0);
+      CHECK_LE(w, width());
+      return ((n * channels() + c) * height() + h) * width() + w;
+    }
 
-		inline int offset(const vector<int>& indices) const {
-			CHECK_LE(indices.size(), num_axes());
-			int offset = 0;
-			for (int i = 0; i < num_axes(); ++i) {
-				offset *= shape(i);
-				if (indices.size() > i) {
-					CHECK_GE(indices[i], 0);
-					CHECK_LT(indices[i], shape(i));
-					offset += indices[i];
-				}
-			}
-			return offset;
-		}
-		/**
-		 * @brief Copy from a source Blob.
-		 *
-		 * @param source the Blob to copy from
-		 * @param copy_diff if false, copy the data; if true, copy the diff
-		 * @param reshape if false, require this Blob to be pre-shaped to the shape
-		 *        of other (and die otherwise); if true, Reshape this Blob to other's
-		 *        shape if necessary
-		 */
-		void CopyFrom(const Blob<Dtype>& source, bool copy_diff = false,
-				bool reshape = false);
+    inline int offset(const vector<int>& indices) const {
+      CHECK_LE(indices.size(), num_axes());
+      int offset = 0;
+      for (int i = 0; i < num_axes(); ++i) {
+        offset *= shape(i);
+        if (indices.size() > i) {
+          CHECK_GE(indices[i], 0);
+          CHECK_LT(indices[i], shape(i));
+          offset += indices[i];
+        }
+      }
+      return offset;
+    }
+    /**
+     * @brief Copy from a source Blob.
+     *
+     * @param source the Blob to copy from
+     * @param copy_diff if false, copy the data; if true, copy the diff
+     * @param reshape if false, require this Blob to be pre-shaped to the shape
+     *        of other (and die otherwise); if true, Reshape this Blob to other's
+     *        shape if necessary
+     */
+    void CopyFrom(const Blob<Dtype>& source, bool copy_diff = false,
+        bool reshape = false);
 
-		inline Dtype data_at(const int n, const int c, const int h,
-				const int w) const {
-			return cpu_data()[offset(n, c, h, w)];
-		}
+    inline Dtype data_at(const int n, const int c, const int h,
+        const int w) const {
+      return cpu_data()[offset(n, c, h, w)];
+    }
 
-		inline Dtype diff_at(const int n, const int c, const int h,
-				const int w) const {
-			return cpu_diff()[offset(n, c, h, w)];
-		}
+    inline Dtype diff_at(const int n, const int c, const int h,
+        const int w) const {
+      return cpu_diff()[offset(n, c, h, w)];
+    }
 
-		inline Dtype data_at(const vector<int>& index) const {
-			return cpu_data()[offset(index)];
-		}
+    inline Dtype data_at(const vector<int>& index) const {
+      return cpu_data()[offset(index)];
+    }
 
-		inline Dtype diff_at(const vector<int>& index) const {
-			return cpu_diff()[offset(index)];
-		}
+    inline Dtype diff_at(const vector<int>& index) const {
+      return cpu_diff()[offset(index)];
+    }
 
-		inline const shared_ptr<SyncedMemory>& data() const {
-			CHECK(data_);
-			return data_;
-		}
+    inline const shared_ptr<SyncedMemory>& data() const {
+      CHECK(data_);
+      return data_;
+    }
 
-		inline const shared_ptr<SyncedMemory>& diff() const {
-			CHECK(diff_);
-			return diff_;
-		}
+    inline const shared_ptr<SyncedMemory>& diff() const {
+      CHECK(diff_);
+      return diff_;
+    }
 
-		const Dtype* cpu_data() const;
-		void set_cpu_data(Dtype* data);
-		const Dtype* gpu_data() const;
-		const Dtype* gpu_cache_data() const;
-		const Dtype* cpu_diff() const;
-		const Dtype* gpu_diff() const;
-		Dtype* mutable_cpu_data();
-		Dtype* mutable_gpu_data();
-		Dtype* mutable_cpu_diff();
-		Dtype* mutable_gpu_diff();
-		void Update();
-		void FromProto(const BlobProto& proto, bool reshape = true);
-		void ToProto(BlobProto* proto, bool write_diff = false) const;
+    const Dtype* cpu_data() const;
+    void set_cpu_data(Dtype* data);
+    const Dtype* gpu_data() const;
+    const Dtype* gpu_cache_data() const;
+    const Dtype* cpu_diff() const;
+    const Dtype* gpu_diff() const;
+    Dtype* mutable_cpu_data();
+    Dtype* mutable_gpu_data();
+    Dtype* mutable_cpu_diff();
+    Dtype* mutable_gpu_diff();
+    void Update();
+    void FromProto(const BlobProto& proto, bool reshape = true);
+    void ToProto(BlobProto* proto, bool write_diff = false) const;
 
-		/// @brief Compute the sum of absolute values (L1 norm) of the data.
-		Dtype asum_data() const;
-		/// @brief Compute the sum of absolute values (L1 norm) of the diff.
-		Dtype asum_diff() const;
-		/// @brief Compute the sum of squares (L2 norm squared) of the data.
-		Dtype sumsq_data() const;
-		/// @brief Compute the sum of squares (L2 norm squared) of the diff.
-		Dtype sumsq_diff() const;
+    /// @brief Compute the sum of absolute values (L1 norm) of the data.
+    Dtype asum_data() const;
+    /// @brief Compute the sum of absolute values (L1 norm) of the diff.
+    Dtype asum_diff() const;
+    /// @brief Compute the sum of squares (L2 norm squared) of the data.
+    Dtype sumsq_data() const;
+    /// @brief Compute the sum of squares (L2 norm squared) of the diff.
+    Dtype sumsq_diff() const;
 
-		/// @brief Scale the blob data by a constant factor.
-		void scale_data(Dtype scale_factor);
-		/// @brief Scale the blob diff by a constant factor.
-		void scale_diff(Dtype scale_factor);
+    /// @brief Scale the blob data by a constant factor.
+    void scale_data(Dtype scale_factor);
+    /// @brief Scale the blob diff by a constant factor.
+    void scale_diff(Dtype scale_factor);
 
-		/**
-		 * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the
-		 *        data_ of Blob other -- useful in Layer%s which simply perform a copy
-		 *        in their Forward pass.
-		 *
-		 * This deallocates the SyncedMemory holding this Blob's data_, as
-		 * shared_ptr calls its destructor when reset with the "=" operator.
-		 */
-		void ShareData(const Blob& other);
-		/**
-		 * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the
-		 *        diff_ of Blob other -- useful in Layer%s which simply perform a copy
-		 *        in their Forward pass.
-		 *
-		 * This deallocates the SyncedMemory holding this Blob's diff_, as
-		 * shared_ptr calls its destructor when reset with the "=" operator.
-		 */
-		void ShareDiff(const Blob& other);
-		void set_data_layer() {
-			data_->set_data_layer();
-			diff_->set_data_layer();
-		}
+    /**
+     * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the
+     *        data_ of Blob other -- useful in Layer%s which simply perform a copy
+     *        in their Forward pass.
+     *
+     * This deallocates the SyncedMemory holding this Blob's data_, as
+     * shared_ptr calls its destructor when reset with the "=" operator.
+     */
+    void ShareData(const Blob& other);
+    /**
+     * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the
+     *        diff_ of Blob other -- useful in Layer%s which simply perform a copy
+     *        in their Forward pass.
+     *
+     * This deallocates the SyncedMemory holding this Blob's diff_, as
+     * shared_ptr calls its destructor when reset with the "=" operator.
+     */
+    void ShareDiff(const Blob& other);
+    void set_data_layer() {
+      data_->set_data_layer();
+      diff_->set_data_layer();
+    }
 
-		bool ShapeEquals(const BlobProto& other);
+    bool ShapeEquals(const BlobProto& other);
 
-	protected:
-		shared_ptr<SyncedMemory> data_;
-		shared_ptr<SyncedMemory> diff_;
-		vector<int> shape_;
-		int count_;
-		int capacity_;
+  protected:
+    shared_ptr<SyncedMemory> data_;
+    shared_ptr<SyncedMemory> diff_;
+    vector<int> shape_;
+    int count_;
+    int capacity_;
 
-		DISABLE_COPY_AND_ASSIGN (Blob);
+    DISABLE_COPY_AND_ASSIGN (Blob);
 };
 // class Blob
 
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 0f3a7667..df99c7cf 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -185,81 +185,81 @@ void GlobalInit(int* pargc, char*** pargv);
 // A singleton class to hold common caffe stuff, such as the handler that
 // caffe is going to use for cublas, curand, etc.
 class Caffe {
-	public:
-		~Caffe();
-		inline static Caffe& Get() {
-			if (!singleton_.get()) {
-				singleton_.reset(new Caffe());
-			}
-			return *singleton_;
-		}
-		enum Brew {
-			CPU, GPU, APU
-		};
+  public:
+    ~Caffe();
+    inline static Caffe& Get() {
+      if (!singleton_.get()) {
+        singleton_.reset(new Caffe());
+      }
+      return *singleton_;
+    }
+    enum Brew {
+      CPU, GPU, APU
+    };
 
-		// This random number generator facade hides boost and CUDA rng
-		// implementation from one another (for cross-platform compatibility).
-		class RNG {
-			public:
-				RNG();
-				explicit RNG(unsigned int seed);
-				explicit RNG(const RNG&);
-				RNG& operator=(const RNG&);
-				void* generator();
-				private:
-				class Generator;
-				shared_ptr<Generator> generator_;
-		};
+    // This random number generator facade hides boost and CUDA rng
+    // implementation from one another (for cross-platform compatibility).
+    class RNG {
+      public:
+        RNG();
+        explicit RNG(unsigned int seed);
+        explicit RNG(const RNG&);
+        RNG& operator=(const RNG&);
+        void* generator();
+      private:
+        class Generator;
+        shared_ptr<Generator> generator_;
+    };
 
-		// Getters for boost rng, curand, and cublas handles
-		inline static RNG& rng_stream() {
-			if (!Get().random_generator_) {
-				Get().random_generator_.reset(new RNG());
-			}
-			return *(Get().random_generator_);
-		}
+    // Getters for boost rng, curand, and cublas handles
+    inline static RNG& rng_stream() {
+      if (!Get().random_generator_) {
+        Get().random_generator_.reset(new RNG());
+      }
+      return *(Get().random_generator_);
+    }
 #ifndef CPU_ONLY
-		//inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
-		//inline static curandGenerator_t curand_generator() {
-		//  return Get().curand_generator_;
-		//}
+    //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
+    //inline static curandGenerator_t curand_generator() {
+    //  return Get().curand_generator_;
+    //}
 #endif
 
-		// Returns the mode: running on CPU or GPU.
-		inline static Brew mode() {
-			return Get().mode_;
-		}
-		// The setters for the variables
-		// Sets the mode. It is recommended that you don't change the mode halfway
-		// into the program since that may cause allocation of pinned memory being
-		// freed in a non-pinned way, which may cause problems - I haven't verified
-		// it personally but better to note it here in the header file.
-		inline static void set_mode(Brew mode) {
-			Get().mode_ = mode;
-		}
-		// Sets the random seed of both boost and curand
-		static void set_random_seed(const unsigned int seed);
-		// Sets the device. Since we have cublas and curand stuff, set device also
-		// requires us to reset those values.
-		static void SetDevice(const int device_id);
-		// Prints the current GPU status.
-		static void DeviceQuery();
+    // Returns the mode: running on CPU or GPU.
+    inline static Brew mode() {
+      return Get().mode_;
+    }
+    // The setters for the variables
+    // Sets the mode. It is recommended that you don't change the mode halfway
+    // into the program since that may cause allocation of pinned memory being
+    // freed in a non-pinned way, which may cause problems - I haven't verified
+    // it personally but better to note it here in the header file.
+    inline static void set_mode(Brew mode) {
+      Get().mode_ = mode;
+    }
+    // Sets the random seed of both boost and curand
+    static void set_random_seed(const unsigned int seed);
+    // Sets the device. Since we have cublas and curand stuff, set device also
+    // requires us to reset those values.
+    static void SetDevice(const int device_id);
+    // Prints the current GPU status.
+    static void DeviceQuery();
 
-	protected:
+  protected:
 #ifndef CPU_ONLY
-		//cublasHandle_t cublas_handle_;
-		//curandGenerator_t curand_generator_;
+    //cublasHandle_t cublas_handle_;
+    //curandGenerator_t curand_generator_;
 #endif
-		shared_ptr<RNG> random_generator_;
+    shared_ptr<RNG> random_generator_;
 
-		Brew mode_;
-		static shared_ptr<Caffe> singleton_;
+    Brew mode_;
+    static shared_ptr<Caffe> singleton_;
 
-	private:
-		// The private constructor to avoid duplicate instantiation.
-		Caffe();
+  private:
+    // The private constructor to avoid duplicate instantiation.
+    Caffe();
 
-	DISABLE_COPY_AND_ASSIGN(Caffe);
+  DISABLE_COPY_AND_ASSIGN(Caffe);
 };
 
 }  // namespace caffe
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index d892b5b5..ab796286 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -27,56 +27,55 @@ namespace caffe {
  */
 template <typename Dtype>
 class ArgMaxLayer: public Layer<Dtype> {
-	public:
-		/**
-		 * @param param provides ArgMaxParameter argmax_param,
-		 *     with ArgMaxLayer options:
-		 *   - top_k (\b optional uint, default 1).
-		 *     the number @f$ K @f$ of maximal items to output.
-		 *   - out_max_val (\b optional bool, default false).
-		 *     if set, output a vector of pairs (max_ind, max_val) for each image.
-		 */
-		explicit ArgMaxLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "ArgMax";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val
-		 *      @f$ (N \times 2 \times K \times 1) @f$
-		 *      the computed outputs @f$
-		 *       y_n = \arg\max\limits_i x_{ni}
-		 *      @f$ (for @f$ K = 1 @f$).
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		/// @brief Not implemented (non-differentiable function)
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-			NOT_IMPLEMENTED;
-		}
-		bool out_max_val_;
-		size_t top_k_;
+  public:
+    /**
+     * @param param provides ArgMaxParameter argmax_param,
+     *     with ArgMaxLayer options:
+     *   - top_k (\b optional uint, default 1).
+     *     the number @f$ K @f$ of maximal items to output.
+     *   - out_max_val (\b optional bool, default false).
+     *     if set, output a vector of pairs (max_ind, max_val) for each image.
+     */
+    explicit ArgMaxLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "ArgMax";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val
+     *      @f$ (N \times 2 \times K \times 1) @f$
+     *      the computed outputs @f$
+     *       y_n = \arg\max\limits_i x_{ni}
+     *      @f$ (for @f$ K = 1 @f$).
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    /// @brief Not implemented (non-differentiable function)
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+      NOT_IMPLEMENTED;
+    }
+    bool out_max_val_;
+    size_t top_k_;
 };
 
 /**
@@ -85,79 +84,78 @@ class ArgMaxLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class ConcatLayer: public Layer<Dtype> {
-	public:
-		explicit ConcatLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Concat";
-		}
-		virtual inline int MinBottomBlobs() const {
-			return 2;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 2+)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x_1 @f$
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x_2 @f$
-		 *   -# ...
-		 *   - K @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x_K @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
-		 *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
-		 *      the concatenated output @f$
-		 *        y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}]
-		 *      @f$
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the concatenate inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *        respect to the outputs
-		 *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
-		 *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to concatenated outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length K), into which the top gradient
-		 *        @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the
-		 *        inputs @f$
-		 *        \left[ \begin{array}{cccc}
-		 *          \frac{\partial E}{\partial x_1} &
-		 *          \frac{\partial E}{\partial x_2} &
-		 *          ... &
-		 *          \frac{\partial E}{\partial x_K}
-		 *        \end{array} \right] =
-		 *        \frac{\partial E}{\partial y}
-		 *        @f$
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int count_;
-		int num_concats_;
-		int concat_input_size_;
-		int concat_axis_;
+  public:
+    explicit ConcatLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Concat";
+    }
+    virtual inline int MinBottomBlobs() const {
+      return 2;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 2+)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x_1 @f$
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x_2 @f$
+     *   -# ...
+     *   - K @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x_K @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
+     *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
+     *      the concatenated output @f$
+     *        y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}]
+     *      @f$
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the concatenate inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *        respect to the outputs
+     *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
+     *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to concatenated outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length K), into which the top gradient
+     *        @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the
+     *        inputs @f$
+     *        \left[ \begin{array}{cccc}
+     *          \frac{\partial E}{\partial x_1} &
+     *          \frac{\partial E}{\partial x_2} &
+     *          ... &
+     *          \frac{\partial E}{\partial x_K}
+     *        \end{array} \right] =
+     *        \frac{\partial E}{\partial y}
+     *        @f$
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    int count_;
+    int num_concats_;
+    int concat_input_size_;
+    int concat_axis_;
 };
 
 /**
@@ -168,41 +166,40 @@ class ConcatLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class EltwiseLayer: public Layer<Dtype> {
-	public:
-		explicit EltwiseLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Eltwise";
-		}
-		virtual inline int MinBottomBlobs() const {
-			return 2;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		EltwiseParameter_EltwiseOp op_;
-		vector<Dtype> coeffs_;
-		Blob<int> max_idx_;
-
-		bool stable_prod_grad_;
+  public:
+    explicit EltwiseLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Eltwise";
+    }
+    virtual inline int MinBottomBlobs() const {
+      return 2;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    EltwiseParameter_EltwiseOp op_;
+    vector<Dtype> coeffs_;
+    Blob<int> max_idx_;
+
+    bool stable_prod_grad_;
 };
 
 /**
@@ -213,67 +210,66 @@ class EltwiseLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class FilterLayer: public Layer<Dtype> {
-	public:
-		explicit FilterLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Filter";
-		}
-		virtual inline int MinBottomBlobs() const {
-			return 2;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 2+)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs to be filtered @f$ x_1 @f$
-		 *   -# ...
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs to be filtered @f$ x_K @f$
-		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-		 *      the selector blob
-		 * @param top output Blob vector (length 1+)
-		 *   -# @f$ (S \times C \times H \times W) @f$ ()
-		 *        the filtered output @f$ x_1 @f$
-		 *        where S is the number of items
-		 *        that haven't been filtered
-		 *      @f$ (S \times C \times H \times W) @f$
-		 *        the filtered output @f$ x_K @f$
-		 *        where S is the number of items
-		 *        that haven't been filtered
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the forwarded inputs.
-		 *
-		 * @param top output Blob vector (length 1+), providing the error gradient with
-		 *        respect to the outputs
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 2+), into which the top error
-		 *        gradient is copied
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		bool first_reshape_;
-		vector<int> indices_to_forward_;
+  public:
+    explicit FilterLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Filter";
+    }
+    virtual inline int MinBottomBlobs() const {
+      return 2;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 2+)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs to be filtered @f$ x_1 @f$
+     *   -# ...
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs to be filtered @f$ x_K @f$
+     *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+     *      the selector blob
+     * @param top output Blob vector (length 1+)
+     *   -# @f$ (S \times C \times H \times W) @f$ ()
+     *        the filtered output @f$ x_1 @f$
+     *        where S is the number of items
+     *        that haven't been filtered
+     *      @f$ (S \times C \times H \times W) @f$
+     *        the filtered output @f$ x_K @f$
+     *        where S is the number of items
+     *        that haven't been filtered
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the forwarded inputs.
+     *
+     * @param top output Blob vector (length 1+), providing the error gradient with
+     *        respect to the outputs
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 2+), into which the top error
+     *        gradient is copied
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    bool first_reshape_;
+    vector<int> indices_to_forward_;
 };
 
 /**
@@ -288,47 +284,46 @@ class FilterLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class FlattenLayer: public Layer<Dtype> {
-	public:
-		explicit FlattenLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Flatten";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 2+)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times CHW \times 1 \times 1) @f$
-		 *      the outputs -- i.e., the (virtually) copied, flattened inputs
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the concatenate inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *        respect to the outputs
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length K), into which the top error
-		 *        gradient is (virtually) copied
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    explicit FlattenLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Flatten";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 2+)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times CHW \times 1 \times 1) @f$
+     *      the outputs -- i.e., the (virtually) copied, flattened inputs
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the concatenate inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *        respect to the outputs
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length K), into which the top error
+     *        gradient is (virtually) copied
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -339,41 +334,40 @@ class FlattenLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class InnerProductLayer: public Layer<Dtype> {
-	public:
-		explicit InnerProductLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "InnerProduct";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int M_;
-		int K_;
-		int N_;
-		bool bias_term_;
-		Blob<Dtype> bias_multiplier_;
+  public:
+    explicit InnerProductLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "InnerProduct";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    int M_;
+    int K_;
+    int N_;
+    bool bias_term_;
+    Blob<Dtype> bias_multiplier_;
 };
 
 /**
@@ -383,39 +377,38 @@ class InnerProductLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class MVNLayer: public Layer<Dtype> {
-	public:
-		explicit MVNLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "MVN";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		Blob<Dtype> mean_, variance_, temp_;
-
-		/// sum_multiplier is used to carry out sum using BLAS
-		Blob<Dtype> sum_multiplier_;
-		Dtype eps_;
+  public:
+    explicit MVNLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "MVN";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    Blob<Dtype> mean_, variance_, temp_;
+
+    /// sum_multiplier is used to carry out sum using BLAS
+    Blob<Dtype> sum_multiplier_;
+    Dtype eps_;
 };
 
 /*
@@ -426,48 +419,47 @@ class MVNLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class ReshapeLayer: public Layer<Dtype> {
-	public:
-		explicit ReshapeLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Reshape";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-		}
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-		}
-
-		/// @brief vector of axes indices whose dimensions we'll copy from the bottom
-		vector<int> copy_axes_;
-		/// @brief the index of the axis whose dimension we infer, or -1 if none
-		int inferred_axis_;
-		/// @brief the product of the "constant" output dimensions
-		int constant_count_;
+  public:
+    explicit ReshapeLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Reshape";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+    }
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+    }
+
+    /// @brief vector of axes indices whose dimensions we'll copy from the bottom
+    vector<int> copy_axes_;
+    /// @brief the index of the axis whose dimension we infer, or -1 if none
+    int inferred_axis_;
+    /// @brief the product of the "constant" output dimensions
+    int constant_count_;
 };
 
 /**
@@ -479,48 +471,47 @@ class ReshapeLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class ReductionLayer: public Layer<Dtype> {
-	public:
-		explicit ReductionLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Reduction";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		/// @brief the reduction operation performed by the layer
-		ReductionParameter_ReductionOp op_;
-		/// @brief a scalar coefficient applied to all outputs
-		Dtype coeff_;
-		/// @brief the index of the first input axis to reduce
-		int axis_;
-		/// @brief the number of reductions performed
-		int num_;
-		/// @brief the input size of each reduction
-		int dim_;
-		/// @brief a helper Blob used for summation (op_ == SUM)
-		Blob<Dtype> sum_multiplier_;
+  public:
+    explicit ReductionLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Reduction";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    /// @brief the reduction operation performed by the layer
+    ReductionParameter_ReductionOp op_;
+    /// @brief a scalar coefficient applied to all outputs
+    Dtype coeff_;
+    /// @brief the index of the first input axis to reduce
+    int axis_;
+    /// @brief the number of reductions performed
+    int num_;
+    /// @brief the input size of each reduction
+    int dim_;
+    /// @brief a helper Blob used for summation (op_ == SUM)
+    Blob<Dtype> sum_multiplier_;
 };
 
 /**
@@ -529,37 +520,36 @@ class ReductionLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class SilenceLayer: public Layer<Dtype> {
-	public:
-		explicit SilenceLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-
-		virtual inline const char* type() const {
-			return "Silence";
-		}
-		virtual inline int MinBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 0;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-		// We can't define Forward_gpu here, since STUB_GPU will provide
-		// its own definition for CPU_ONLY mode.
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    explicit SilenceLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+
+    virtual inline const char* type() const {
+      return "Silence";
+    }
+    virtual inline int MinBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 0;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+    // We can't define Forward_gpu here, since STUB_GPU will provide
+    // its own definition for CPU_ONLY mode.
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -569,42 +559,41 @@ class SilenceLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class SoftmaxLayer: public Layer<Dtype> {
-	public:
-		explicit SoftmaxLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		~SoftmaxLayer();
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Softmax";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int outer_num_;
-		int inner_num_;
-		int softmax_axis_;
-		/// sum_multiplier is used to carry out sum using BLAS
-		Blob<Dtype> sum_multiplier_;
-		/// scale is an intermediate Blob to hold temporary results.
-		Blob<Dtype> scale_;
+  public:
+    explicit SoftmaxLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    ~SoftmaxLayer();
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Softmax";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    int outer_num_;
+    int inner_num_;
+    int softmax_axis_;
+    /// sum_multiplier is used to carry out sum using BLAS
+    Blob<Dtype> sum_multiplier_;
+    /// scale is an intermediate Blob to hold temporary results.
+    Blob<Dtype> scale_;
 };
 
 #ifdef USE_CUDNN
@@ -614,25 +603,25 @@ class SoftmaxLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
-	public:
-	explicit CuDNNSoftmaxLayer(const LayerParameter& param)
-	: SoftmaxLayer<Dtype>(param), handles_setup_(false) {}
-	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual ~CuDNNSoftmaxLayer();
-
-	protected:
-	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-	bool handles_setup_;
-	cudnnHandle_t handle_;
-	cudnnTensorDescriptor_t bottom_desc_;
-	cudnnTensorDescriptor_t top_desc_;
+  public:
+  explicit CuDNNSoftmaxLayer(const LayerParameter& param)
+  : SoftmaxLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNSoftmaxLayer();
+
+  protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
 };
 #endif
 
@@ -644,36 +633,35 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
  */
 template <typename Dtype>
 class SplitLayer: public Layer<Dtype> {
-	public:
-		explicit SplitLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Split";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int count_;
-		cl_kernel gpu_add_kernel;
+  public:
+    explicit SplitLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Split";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    int count_;
+    cl_kernel gpu_add_kernel;
 };
 
 /**
@@ -684,41 +672,40 @@ class SplitLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class SliceLayer: public Layer<Dtype> {
-	public:
-		explicit SliceLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Slice";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 2;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int count_;
-		int num_slices_;
-		int slice_size_;
-		int slice_axis_;
-		vector<int> slice_point_;
+  public:
+    explicit SliceLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Slice";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 2;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    int count_;
+    int num_slices_;
+    int slice_size_;
+    int slice_axis_;
+    vector<int> slice_point_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp
index e93c4fe8..d4f526b3 100644
--- a/include/caffe/data_layers.hpp
+++ b/include/caffe/data_layers.hpp
@@ -26,96 +26,94 @@ namespace caffe {
  */
 template <typename Dtype>
 class BaseDataLayer: public Layer<Dtype> {
-	public:
-		explicit BaseDataLayer(const LayerParameter& param);
-		// LayerSetUp: implements common data layer setup functionality, and calls
-		// DataLayerSetUp to do special data layer setup for individual layer types.
-		// This method may not be overridden except by the BasePrefetchingDataLayer.
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-		// Data layers have no bottoms, so reshaping is trivial.
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-		}
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-		}
-
-	protected:
-		TransformationParameter transform_param_;
-		shared_ptr<DataTransformer<Dtype> > data_transformer_;
-		bool output_labels_;
+  public:
+    explicit BaseDataLayer(const LayerParameter& param);
+    // LayerSetUp: implements common data layer setup functionality, and calls
+    // DataLayerSetUp to do special data layer setup for individual layer types.
+    // This method may not be overridden except by the BasePrefetchingDataLayer.
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+    // Data layers have no bottoms, so reshaping is trivial.
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+    }
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+    }
+
+  protected:
+    TransformationParameter transform_param_;
+    shared_ptr<DataTransformer<Dtype> > data_transformer_;
+    bool output_labels_;
 };
 
 template <typename Dtype>
-class BasePrefetchingDataLayer:
-		public BaseDataLayer<Dtype>, public InternalThread {
-	public:
-		explicit BasePrefetchingDataLayer(const LayerParameter& param)
-		:
-				BaseDataLayer<Dtype>(param) {
-		}
-		// LayerSetUp: implements common data layer setup functionality, and calls
-		// DataLayerSetUp to do special data layer setup for individual layer types.
-		// This method may not be overridden.
-		void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual void CreatePrefetchThread();
-		virtual void JoinPrefetchThread();
-		// The thread's function
-		virtual void InternalThreadEntry() {
-		}
-
-	protected:
-		Blob<Dtype> prefetch_data_;
-		Blob<Dtype> prefetch_label_;
-		Blob<Dtype> transformed_data_;
+class BasePrefetchingDataLayer: public BaseDataLayer<Dtype>,
+    public InternalThread {
+  public:
+    explicit BasePrefetchingDataLayer(const LayerParameter& param)
+        : BaseDataLayer<Dtype>(param) {
+    }
+    // LayerSetUp: implements common data layer setup functionality, and calls
+    // DataLayerSetUp to do special data layer setup for individual layer types.
+    // This method may not be overridden.
+    void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual void CreatePrefetchThread();
+    virtual void JoinPrefetchThread();
+    // The thread's function
+    virtual void InternalThreadEntry() {
+    }
+
+  protected:
+    Blob<Dtype> prefetch_data_;
+    Blob<Dtype> prefetch_label_;
+    Blob<Dtype> transformed_data_;
 };
 
 template <typename Dtype>
 class DataLayer: public BasePrefetchingDataLayer<Dtype> {
-	public:
-		explicit DataLayer(const LayerParameter& param)
-		:
-				BasePrefetchingDataLayer<Dtype>(param) {
-		}
-		virtual ~DataLayer();
-		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Data";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 0;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-		virtual inline int MaxTopBlobs() const {
-			return 2;
-		}
-
-	protected:
-		virtual void InternalThreadEntry();
-
-		shared_ptr<db::DB> db_;
-		shared_ptr<db::Cursor> cursor_;
+  public:
+    explicit DataLayer(const LayerParameter& param)
+        : BasePrefetchingDataLayer<Dtype>(param) {
+    }
+    virtual ~DataLayer();
+    virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Data";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 0;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+    virtual inline int MaxTopBlobs() const {
+      return 2;
+    }
+
+  protected:
+    virtual void InternalThreadEntry();
+
+    shared_ptr<db::DB> db_;
+    shared_ptr<db::Cursor> cursor_;
 };
 
 /**
@@ -125,42 +123,41 @@ class DataLayer: public BasePrefetchingDataLayer<Dtype> {
  */
 template <typename Dtype>
 class DummyDataLayer: public Layer<Dtype> {
-	public:
-		explicit DummyDataLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		// Data layers have no bottoms, so reshaping is trivial.
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-
-		virtual inline const char* type() const {
-			return "DummyData";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 0;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-		}
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-		}
-
-		vector<shared_ptr<Filler<Dtype> > > fillers_;
-		vector<bool> refill_;
+  public:
+    explicit DummyDataLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    // Data layers have no bottoms, so reshaping is trivial.
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+
+    virtual inline const char* type() const {
+      return "DummyData";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 0;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+    }
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+    }
+
+    vector<shared_ptr<Filler<Dtype> > > fillers_;
+    vector<bool> refill_;
 };
 
 /**
@@ -170,51 +167,50 @@ class DummyDataLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class HDF5DataLayer: public Layer<Dtype> {
-	public:
-		explicit HDF5DataLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual ~HDF5DataLayer();
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		// Data layers have no bottoms, so reshaping is trivial.
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-
-		virtual inline const char* type() const {
-			return "HDF5Data";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 0;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-		}
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-		}
-		virtual void LoadHDF5FileData(const char* filename);
-
-		std::vector<std::string> hdf_filenames_;
-		unsigned int num_files_;
-		unsigned int current_file_;
-		hsize_t current_row_;
-		std::vector<shared_ptr<Blob<Dtype> > > hdf_blobs_;
-		std::vector<unsigned int> data_permutation_;
-		std::vector<unsigned int> file_permutation_;
+  public:
+    explicit HDF5DataLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual ~HDF5DataLayer();
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    // Data layers have no bottoms, so reshaping is trivial.
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+
+    virtual inline const char* type() const {
+      return "HDF5Data";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 0;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+    }
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+    }
+    virtual void LoadHDF5FileData(const char* filename);
+
+    std::vector<std::string> hdf_filenames_;
+    unsigned int num_files_;
+    unsigned int current_file_;
+    hsize_t current_row_;
+    std::vector<shared_ptr<Blob<Dtype> > > hdf_blobs_;
+    std::vector<unsigned int> data_permutation_;
+    std::vector<unsigned int> file_permutation_;
 };
 
 /**
@@ -224,50 +220,49 @@ class HDF5DataLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class HDF5OutputLayer: public Layer<Dtype> {
-	public:
-		explicit HDF5OutputLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param), file_opened_(false) {
-		}
-		virtual ~HDF5OutputLayer();
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		// Data layers have no bottoms, so reshaping is trivial.
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
-
-		virtual inline const char* type() const {
-			return "HDF5Output";
-		}
-		// TODO: no limit on the number of blobs
-		virtual inline int ExactNumBottomBlobs() const {
-			return 2;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 0;
-		}
-
-		inline std::string file_name() const {
-			return file_name_;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void SaveBlobs();
-
-		bool file_opened_;
-		std::string file_name_;
-		hid_t file_id_;
-		Blob<Dtype> data_blob_;
-		Blob<Dtype> label_blob_;
+  public:
+    explicit HDF5OutputLayer(const LayerParameter& param)
+        : Layer<Dtype>(param), file_opened_(false) {
+    }
+    virtual ~HDF5OutputLayer();
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    // Data layers have no bottoms, so reshaping is trivial.
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
+
+    virtual inline const char* type() const {
+      return "HDF5Output";
+    }
+    // TODO: no limit on the number of blobs
+    virtual inline int ExactNumBottomBlobs() const {
+      return 2;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 0;
+    }
+
+    inline std::string file_name() const {
+      return file_name_;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void SaveBlobs();
+
+    bool file_opened_;
+    std::string file_name_;
+    hid_t file_id_;
+    Blob<Dtype> data_blob_;
+    Blob<Dtype> label_blob_;
 };
 
 /**
@@ -277,32 +272,31 @@ class HDF5OutputLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class ImageDataLayer: public BasePrefetchingDataLayer<Dtype> {
-	public:
-		explicit ImageDataLayer(const LayerParameter& param)
-		:
-				BasePrefetchingDataLayer<Dtype>(param) {
-		}
-		virtual ~ImageDataLayer();
-		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "ImageData";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 0;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 2;
-		}
-
-	protected:
-		shared_ptr<Caffe::RNG> prefetch_rng_;
-		virtual void ShuffleImages();
-		virtual void InternalThreadEntry();
-
-		vector<std::pair<std::string, int> > lines_;
-		int lines_id_;
+  public:
+    explicit ImageDataLayer(const LayerParameter& param)
+        : BasePrefetchingDataLayer<Dtype>(param) {
+    }
+    virtual ~ImageDataLayer();
+    virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "ImageData";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 0;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 2;
+    }
+
+  protected:
+    shared_ptr<Caffe::RNG> prefetch_rng_;
+    virtual void ShuffleImages();
+    virtual void InternalThreadEntry();
+
+    vector<std::pair<std::string, int> > lines_;
+    int lines_id_;
 };
 
 /**
@@ -312,58 +306,57 @@ class ImageDataLayer: public BasePrefetchingDataLayer<Dtype> {
  */
 template <typename Dtype>
 class MemoryDataLayer: public BaseDataLayer<Dtype> {
-	public:
-		explicit MemoryDataLayer(const LayerParameter& param)
-		:
-				BaseDataLayer<Dtype>(param), has_new_data_(false) {
-		}
-		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "MemoryData";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 0;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 2;
-		}
-
-		virtual void AddDatumVector(const vector<Datum>& datum_vector);
-		virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
-				const vector<int>& labels);
-
-		// Reset should accept const pointers, but can't, because the memory
-		//  will be given to Blob, which is mutable
-		void Reset(Dtype* data, Dtype* label, int n);
-		void set_batch_size(int new_size);
-
-		int batch_size() {
-			return batch_size_;
-		}
-		int channels() {
-			return channels_;
-		}
-		int height() {
-			return height_;
-		}
-		int width() {
-			return width_;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		int batch_size_, channels_, height_, width_, size_;
-		Dtype* data_;
-		Dtype* labels_;
-		int n_;
-		size_t pos_;
-		Blob<Dtype> added_data_;
-		Blob<Dtype> added_label_;
-		bool has_new_data_;
+  public:
+    explicit MemoryDataLayer(const LayerParameter& param)
+        : BaseDataLayer<Dtype>(param), has_new_data_(false) {
+    }
+    virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "MemoryData";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 0;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 2;
+    }
+
+    virtual void AddDatumVector(const vector<Datum>& datum_vector);
+    virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
+        const vector<int>& labels);
+
+    // Reset should accept const pointers, but can't, because the memory
+    //  will be given to Blob, which is mutable
+    void Reset(Dtype* data, Dtype* label, int n);
+    void set_batch_size(int new_size);
+
+    int batch_size() {
+      return batch_size_;
+    }
+    int channels() {
+      return channels_;
+    }
+    int height() {
+      return height_;
+    }
+    int width() {
+      return width_;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    int batch_size_, channels_, height_, width_, size_;
+    Dtype* data_;
+    Dtype* labels_;
+    int n_;
+    size_t pos_;
+    Blob<Dtype> added_data_;
+    Blob<Dtype> added_label_;
+    bool has_new_data_;
 };
 
 /**
@@ -374,42 +367,41 @@ class MemoryDataLayer: public BaseDataLayer<Dtype> {
  */
 template <typename Dtype>
 class WindowDataLayer: public BasePrefetchingDataLayer<Dtype> {
-	public:
-		explicit WindowDataLayer(const LayerParameter& param)
-		:
-				BasePrefetchingDataLayer<Dtype>(param) {
-		}
-		virtual ~WindowDataLayer();
-		virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "WindowData";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 0;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 2;
-		}
-
-	protected:
-		virtual unsigned int PrefetchRand();
-		virtual void InternalThreadEntry();
-
-		shared_ptr<Caffe::RNG> prefetch_rng_;
-		vector<std::pair<std::string, vector<int> > > image_database_;
-		enum WindowField {
-			IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM
-		};
-		vector<vector<float> > fg_windows_;
-		vector<vector<float> > bg_windows_;
-		Blob<Dtype> data_mean_;
-		vector<Dtype> mean_values_;
-		bool has_mean_file_;
-		bool has_mean_values_;
-		bool cache_images_;
-		vector<std::pair<std::string, Datum> > image_database_cache_;
+  public:
+    explicit WindowDataLayer(const LayerParameter& param)
+        : BasePrefetchingDataLayer<Dtype>(param) {
+    }
+    virtual ~WindowDataLayer();
+    virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "WindowData";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 0;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 2;
+    }
+
+  protected:
+    virtual unsigned int PrefetchRand();
+    virtual void InternalThreadEntry();
+
+    shared_ptr<Caffe::RNG> prefetch_rng_;
+    vector<std::pair<std::string, vector<int> > > image_database_;
+    enum WindowField {
+      IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM
+    };
+    vector<vector<float> > fg_windows_;
+    vector<vector<float> > bg_windows_;
+    Blob<Dtype> data_mean_;
+    vector<Dtype> mean_values_;
+    bool has_mean_file_;
+    bool has_mean_values_;
+    bool cache_images_;
+    vector<std::pair<std::string, Datum> > image_database_cache_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp
index c283a244..daa4eee0 100644
--- a/include/caffe/data_transformer.hpp
+++ b/include/caffe/data_transformer.hpp
@@ -15,134 +15,134 @@ namespace caffe {
  */
 template <typename Dtype>
 class DataTransformer {
-	public:
-		explicit DataTransformer(const TransformationParameter& param, Phase phase);
-		virtual ~DataTransformer() {
-		}
-
-		/**
-		 * @brief Initialize the Random number generations if needed by the
-		 *    transformation.
-		 */
-		void InitRand();
-
-		/**
-		 * @brief Applies the transformation defined in the data layer's
-		 * transform_param block to the data.
-		 *
-		 * @param datum
-		 *    Datum containing the data to be transformed.
-		 * @param transformed_blob
-		 *    This is destination blob. It can be part of top blob's data if
-		 *    set_cpu_data() is used. See data_layer.cpp for an example.
-		 */
-		void Transform(const Datum& datum, Blob<Dtype>* transformed_blob);
-
-		/**
-		 * @brief Applies the transformation defined in the data layer's
-		 * transform_param block to a vector of Datum.
-		 *
-		 * @param datum_vector
-		 *    A vector of Datum containing the data to be transformed.
-		 * @param transformed_blob
-		 *    This is destination blob. It can be part of top blob's data if
-		 *    set_cpu_data() is used. See memory_layer.cpp for an example.
-		 */
-		void Transform(const vector<Datum> & datum_vector,
-				Blob<Dtype>* transformed_blob);
-
-		/**
-		 * @brief Applies the transformation defined in the data layer's
-		 * transform_param block to a vector of Mat.
-		 *
-		 * @param mat_vector
-		 *    A vector of Mat containing the data to be transformed.
-		 * @param transformed_blob
-		 *    This is destination blob. It can be part of top blob's data if
-		 *    set_cpu_data() is used. See memory_layer.cpp for an example.
-		 */
-		void Transform(const vector<cv::Mat> & mat_vector,
-				Blob<Dtype>* transformed_blob);
-
-		/**
-		 * @brief Applies the transformation defined in the data layer's
-		 * transform_param block to a cv::Mat
-		 *
-		 * @param cv_img
-		 *    cv::Mat containing the data to be transformed.
-		 * @param transformed_blob
-		 *    This is destination blob. It can be part of top blob's data if
-		 *    set_cpu_data() is used. See image_data_layer.cpp for an example.
-		 */
-		void Transform(const cv::Mat& cv_img, Blob<Dtype>* transformed_blob);
-
-		/**
-		 * @brief Applies the same transformation defined in the data layer's
-		 * transform_param block to all the num images in a input_blob.
-		 *
-		 * @param input_blob
-		 *    A Blob containing the data to be transformed. It applies the same
-		 *    transformation to all the num images in the blob.
-		 * @param transformed_blob
-		 *    This is destination blob, it will contain as many images as the
-		 *    input blob. It can be part of top blob's data.
-		 */
-		void Transform(Blob<Dtype>* input_blob, Blob<Dtype>* transformed_blob);
-
-		/**
-		 * @brief Infers the shape of transformed_blob will have when
-		 *    the transformation is applied to the data.
-		 *
-		 * @param datum
-		 *    Datum containing the data to be transformed.
-		 */
-		vector<int> InferBlobShape(const Datum& datum);
-		/**
-		 * @brief Infers the shape of transformed_blob will have when
-		 *    the transformation is applied to the data.
-		 *    It uses the first element to infer the shape of the blob.
-		 *
-		 * @param datum_vector
-		 *    A vector of Datum containing the data to be transformed.
-		 */
-		vector<int> InferBlobShape(const vector<Datum> & datum_vector);
-		/**
-		 * @brief Infers the shape of transformed_blob will have when
-		 *    the transformation is applied to the data.
-		 *    It uses the first element to infer the shape of the blob.
-		 *
-		 * @param mat_vector
-		 *    A vector of Mat containing the data to be transformed.
-		 */
-		vector<int> InferBlobShape(const vector<cv::Mat> & mat_vector);
-		/**
-		 * @brief Infers the shape of transformed_blob will have when
-		 *    the transformation is applied to the data.
-		 *
-		 * @param cv_img
-		 *    cv::Mat containing the data to be transformed.
-		 */
-		vector<int> InferBlobShape(const cv::Mat& cv_img);
-
-	protected:
-		/**
-		 * @brief Generates a random integer from Uniform({0, 1, ..., n-1}).
-		 *
-		 * @param n
-		 *    The upperbound (exclusive) value of the random number.
-		 * @return
-		 *    A uniformly random integer value from ({0, 1, ..., n-1}).
-		 */
-		virtual int Rand(int n);
-
-		void Transform(const Datum& datum, Dtype* transformed_data);
-		// Tranformation parameters
-		TransformationParameter param_;
-
-		shared_ptr<Caffe::RNG> rng_;
-		Phase phase_;
-		Blob<Dtype> data_mean_;
-		vector<Dtype> mean_values_;
+  public:
+    explicit DataTransformer(const TransformationParameter& param, Phase phase);
+    virtual ~DataTransformer() {
+    }
+
+    /**
+     * @brief Initialize the Random number generations if needed by the
+     *    transformation.
+     */
+    void InitRand();
+
+    /**
+     * @brief Applies the transformation defined in the data layer's
+     * transform_param block to the data.
+     *
+     * @param datum
+     *    Datum containing the data to be transformed.
+     * @param transformed_blob
+     *    This is destination blob. It can be part of top blob's data if
+     *    set_cpu_data() is used. See data_layer.cpp for an example.
+     */
+    void Transform(const Datum& datum, Blob<Dtype>* transformed_blob);
+
+    /**
+     * @brief Applies the transformation defined in the data layer's
+     * transform_param block to a vector of Datum.
+     *
+     * @param datum_vector
+     *    A vector of Datum containing the data to be transformed.
+     * @param transformed_blob
+     *    This is destination blob. It can be part of top blob's data if
+     *    set_cpu_data() is used. See memory_layer.cpp for an example.
+     */
+    void Transform(const vector<Datum> & datum_vector,
+        Blob<Dtype>* transformed_blob);
+
+    /**
+     * @brief Applies the transformation defined in the data layer's
+     * transform_param block to a vector of Mat.
+     *
+     * @param mat_vector
+     *    A vector of Mat containing the data to be transformed.
+     * @param transformed_blob
+     *    This is destination blob. It can be part of top blob's data if
+     *    set_cpu_data() is used. See memory_layer.cpp for an example.
+     */
+    void Transform(const vector<cv::Mat> & mat_vector,
+        Blob<Dtype>* transformed_blob);
+
+    /**
+     * @brief Applies the transformation defined in the data layer's
+     * transform_param block to a cv::Mat
+     *
+     * @param cv_img
+     *    cv::Mat containing the data to be transformed.
+     * @param transformed_blob
+     *    This is destination blob. It can be part of top blob's data if
+     *    set_cpu_data() is used. See image_data_layer.cpp for an example.
+     */
+    void Transform(const cv::Mat& cv_img, Blob<Dtype>* transformed_blob);
+
+    /**
+     * @brief Applies the same transformation defined in the data layer's
+     * transform_param block to all the num images in a input_blob.
+     *
+     * @param input_blob
+     *    A Blob containing the data to be transformed. It applies the same
+     *    transformation to all the num images in the blob.
+     * @param transformed_blob
+     *    This is destination blob, it will contain as many images as the
+     *    input blob. It can be part of top blob's data.
+     */
+    void Transform(Blob<Dtype>* input_blob, Blob<Dtype>* transformed_blob);
+
+    /**
+     * @brief Infers the shape of transformed_blob will have when
+     *    the transformation is applied to the data.
+     *
+     * @param datum
+     *    Datum containing the data to be transformed.
+     */
+    vector<int> InferBlobShape(const Datum& datum);
+    /**
+     * @brief Infers the shape of transformed_blob will have when
+     *    the transformation is applied to the data.
+     *    It uses the first element to infer the shape of the blob.
+     *
+     * @param datum_vector
+     *    A vector of Datum containing the data to be transformed.
+     */
+    vector<int> InferBlobShape(const vector<Datum> & datum_vector);
+    /**
+     * @brief Infers the shape of transformed_blob will have when
+     *    the transformation is applied to the data.
+     *    It uses the first element to infer the shape of the blob.
+     *
+     * @param mat_vector
+     *    A vector of Mat containing the data to be transformed.
+     */
+    vector<int> InferBlobShape(const vector<cv::Mat> & mat_vector);
+    /**
+     * @brief Infers the shape of transformed_blob will have when
+     *    the transformation is applied to the data.
+     *
+     * @param cv_img
+     *    cv::Mat containing the data to be transformed.
+     */
+    vector<int> InferBlobShape(const cv::Mat& cv_img);
+
+  protected:
+    /**
+     * @brief Generates a random integer from Uniform({0, 1, ..., n-1}).
+     *
+     * @param n
+     *    The upperbound (exclusive) value of the random number.
+     * @return
+     *    A uniformly random integer value from ({0, 1, ..., n-1}).
+     */
+    virtual int Rand(int n);
+
+    void Transform(const Datum& datum, Dtype* transformed_data);
+    // Tranformation parameters
+    TransformationParameter param_;
+
+    shared_ptr<Caffe::RNG> rng_;
+    Phase phase_;
+    Blob<Dtype> data_mean_;
+    vector<Dtype> mean_values_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 2d71b333..1d9fa6fe 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -33,51 +33,50 @@
 namespace caffe {
 
 class Device {
-	public:
-		Device()
-		:
-				numPlatforms(0), numDevices(0), device_id(INT_MIN) {
-		}
-		~Device();
-		cl_uint numPlatforms;
-		cl_platform_id * platformIDs;
-		char platformName[64];
-		char openclVersion[64];
-		cl_uint numDevices;
-		cl_device_id * DeviceIDs;
+  public:
+    Device()
+        : numPlatforms(0), numDevices(0), device_id(INT_MIN) {
+    }
+    ~Device();
+    cl_uint numPlatforms;
+    cl_platform_id * platformIDs;
+    char platformName[64];
+    char openclVersion[64];
+    cl_uint numDevices;
+    cl_device_id * DeviceIDs;
 
-		cl_context Context;
-		cl_command_queue CommandQueue;
-		cl_command_queue CommandQueue_helper;
-		cl_program Program;
-		cl_device_id * pDevices;
-		int device_id;
+    cl_context Context;
+    cl_command_queue CommandQueue;
+    cl_command_queue CommandQueue_helper;
+    cl_program Program;
+    cl_device_id * pDevices;
+    int device_id;
 
-		clblasOrder col;
-		clblasOrder row;
-		std::map<std::string, cl_kernel> Kernels;
+    clblasOrder col;
+    clblasOrder row;
+    std::map<std::string, cl_kernel> Kernels;
 
-		cl_int Init(int device_id = -1);
-		cl_int ConvertToString(std::string pFileName, std::string &Str);
-		void DisplayPlatformInfo();
-		void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
+    cl_int Init(int device_id = -1);
+    cl_int ConvertToString(std::string pFileName, std::string &Str);
+    void DisplayPlatformInfo();
+    void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str);
 
-		void GetDeviceInfo();
-		void DeviceQuery();
-		int GetDevice() {
-			return device_id;
-		}
-		;
-		void BuildProgram(std::string kernel_dir);
+    void GetDeviceInfo();
+    void DeviceQuery();
+    int GetDevice() {
+      return device_id;
+    }
+    ;
+    void BuildProgram(std::string kernel_dir);
 
-		template <typename T>
-		void DisplayDeviceInfo(cl_device_id id, cl_device_info name,
-				std::string str);
-		template <typename T>
-		void appendBitfield(T info, T value, std::string name, std::string &str);
+    template <typename T>
+    void DisplayDeviceInfo(cl_device_id id, cl_device_info name,
+        std::string str);
+    template <typename T>
+    void appendBitfield(T info, T value, std::string name, std::string &str);
 
-		cl_kernel GetKernel(std::string kernel_name);
-		void ReleaseKernels();
+    cl_kernel GetKernel(std::string kernel_name);
+    void ReleaseKernels();
 };
 extern std::string buildOption;
 extern Device amdDevice;
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index c431dc94..ab9d6b39 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -18,92 +18,88 @@ namespace caffe {
 /// @brief Fills a Blob with constant or randomly-generated data.
 template <typename Dtype>
 class Filler {
-	public:
-		explicit Filler(const FillerParameter& param)
-		:
-				filler_param_(param) {
-		}
-		virtual ~Filler() {
-		}
-		virtual void Fill(Blob<Dtype>* blob) = 0;
-		protected:
-		FillerParameter filler_param_;
+  public:
+    explicit Filler(const FillerParameter& param)
+        : filler_param_(param) {
+    }
+    virtual ~Filler() {
+    }
+    virtual void Fill(Blob<Dtype>* blob) = 0;
+  protected:
+    FillerParameter filler_param_;
 };
 // class Filler
 
 /// @brief Fills a Blob with constant values @f$ x = 0 @f$.
 template <typename Dtype>
 class ConstantFiller: public Filler<Dtype> {
-	public:
-		explicit ConstantFiller(const FillerParameter& param)
-		:
-				Filler<Dtype>(param) {
-		}
-		virtual void Fill(Blob<Dtype>* blob) {
-			Dtype* data = blob->mutable_cpu_data();
-			const int count = blob->count();
-			const Dtype value = this->filler_param_.value();
-			CHECK(count);
-			for (int i = 0; i < count; ++i) {
-				data[i] = value;
-			}
-			CHECK_EQ(this->filler_param_.sparse(), -1)
-					<< "Sparsity not supported by this Filler.";
-		}
+  public:
+    explicit ConstantFiller(const FillerParameter& param)
+        : Filler<Dtype>(param) {
+    }
+    virtual void Fill(Blob<Dtype>* blob) {
+      Dtype* data = blob->mutable_cpu_data();
+      const int count = blob->count();
+      const Dtype value = this->filler_param_.value();
+      CHECK(count);
+      for (int i = 0; i < count; ++i) {
+        data[i] = value;
+      }
+      CHECK_EQ(this->filler_param_.sparse(), -1)
+          << "Sparsity not supported by this Filler.";
+    }
 };
 
 /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$.
 template <typename Dtype>
 class UniformFiller: public Filler<Dtype> {
-	public:
-		explicit UniformFiller(const FillerParameter& param)
-		:
-				Filler<Dtype>(param) {
-		}
-		virtual void Fill(Blob<Dtype>* blob) {
-			CHECK(blob->count());
-			caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
-					Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
-			CHECK_EQ(this->filler_param_.sparse(), -1)
-					<< "Sparsity not supported by this Filler.";
-		}
+  public:
+    explicit UniformFiller(const FillerParameter& param)
+        : Filler<Dtype>(param) {
+    }
+    virtual void Fill(Blob<Dtype>* blob) {
+      CHECK(blob->count());
+      caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
+          Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
+      CHECK_EQ(this->filler_param_.sparse(), -1)
+          << "Sparsity not supported by this Filler.";
+    }
 };
 
 /// @brief Fills a Blob with Gaussian-distributed values @f$ x = a @f$.
 template <typename Dtype>
 class GaussianFiller: public Filler<Dtype> {
-	public:
-		explicit GaussianFiller(const FillerParameter& param)
-		:
-				Filler<Dtype>(param) {
-		}
-		virtual void Fill(Blob<Dtype>* blob) {
-			Dtype* data = blob->mutable_cpu_data();
-			CHECK(blob->count());
-			caffe_rng_gaussian<Dtype>(blob->count(),
-					Dtype(this->filler_param_.mean()),
-					Dtype(this->filler_param_.std()), blob->mutable_cpu_data());
-			int sparse = this->filler_param_.sparse();
-			CHECK_GE(sparse, -1);
-			if (sparse >= 0) {
-				// Sparse initialization is implemented for "weight" blobs; i.e. matrices.
-				// These have num == channels == 1; width is number of inputs; height is
-				// number of outputs.  The 'sparse' variable specifies the mean number
-				// of non-zero input weights for a given output.
-				CHECK_GE(blob->num_axes(), 1);
-				const int num_outputs = blob->shape(0);
-				Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs);
-				rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int)));
-				int* mask = reinterpret_cast<int*>(rand_vec_->mutable_cpu_data());
-				caffe_rng_bernoulli(blob->count(), non_zero_probability, mask);
-				for (int i = 0; i < blob->count(); ++i) {
-					data[i] *= mask[i];
-				}
-			}
-		}
+  public:
+    explicit GaussianFiller(const FillerParameter& param)
+        : Filler<Dtype>(param) {
+    }
+    virtual void Fill(Blob<Dtype>* blob) {
+      Dtype* data = blob->mutable_cpu_data();
+      CHECK(blob->count());
+      caffe_rng_gaussian<Dtype>(blob->count(),
+          Dtype(this->filler_param_.mean()), Dtype(this->filler_param_.std()),
+          blob->mutable_cpu_data());
+      int sparse = this->filler_param_.sparse();
+      CHECK_GE(sparse, -1);
+      if (sparse >= 0) {
+        // Sparse initialization is implemented for "weight" blobs; i.e. matrices.
+        // These have num == channels == 1; width is number of inputs; height is
+        // number of outputs.  The 'sparse' variable specifies the mean number
+        // of non-zero input weights for a given output.
+        CHECK_GE(blob->num_axes(), 1);
+        const int num_outputs = blob->shape(0);
+        Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs);
+        rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int)));
+        int* mask = reinterpret_cast<int*>(rand_vec_->mutable_cpu_data());
+        caffe_rng_bernoulli(blob->count(), non_zero_probability, mask);
+        for (int i = 0; i < blob->count(); ++i) {
+          data[i] *= mask[i];
+        }
+      }
+    }
 
-	protected:
-		shared_ptr<SyncedMemory> rand_vec_;
+  protected:
+    shared_ptr<SyncedMemory> rand_vec_;
 };
 
 /** @brief Fills a Blob with values @f$ x \in [0, 1] @f$
@@ -111,31 +107,30 @@ class GaussianFiller: public Filler<Dtype> {
  */
 template <typename Dtype>
 class PositiveUnitballFiller: public Filler<Dtype> {
-	public:
-		explicit PositiveUnitballFiller(const FillerParameter& param)
-		:
-				Filler<Dtype>(param) {
-		}
-		virtual void Fill(Blob<Dtype>* blob) {
-			Dtype* data = blob->mutable_cpu_data();
-			DCHECK(blob->count());
-			caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
-			// We expect the filler to not be called very frequently, so we will
-			// just use a simple implementation
-			int dim = blob->count() / blob->num();
-			CHECK(dim);
-			for (int i = 0; i < blob->num(); ++i) {
-				Dtype sum = 0;
-				for (int j = 0; j < dim; ++j) {
-					sum += data[i * dim + j];
-				}
-				for (int j = 0; j < dim; ++j) {
-					data[i * dim + j] /= sum;
-				}
-			}
-			CHECK_EQ(this->filler_param_.sparse(), -1)
-					<< "Sparsity not supported by this Filler.";
-		}
+  public:
+    explicit PositiveUnitballFiller(const FillerParameter& param)
+        : Filler<Dtype>(param) {
+    }
+    virtual void Fill(Blob<Dtype>* blob) {
+      Dtype* data = blob->mutable_cpu_data();
+      DCHECK(blob->count());
+      caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
+      // We expect the filler to not be called very frequently, so we will
+      // just use a simple implementation
+      int dim = blob->count() / blob->num();
+      CHECK(dim);
+      for (int i = 0; i < blob->num(); ++i) {
+        Dtype sum = 0;
+        for (int j = 0; j < dim; ++j) {
+          sum += data[i * dim + j];
+        }
+        for (int j = 0; j < dim; ++j) {
+          data[i * dim + j] /= sum;
+        }
+      }
+      CHECK_EQ(this->filler_param_.sparse(), -1)
+          << "Sparsity not supported by this Filler.";
+    }
 };
 
 /**
@@ -156,29 +151,28 @@ class PositiveUnitballFiller: public Filler<Dtype> {
  */
 template <typename Dtype>
 class XavierFiller: public Filler<Dtype> {
-	public:
-		explicit XavierFiller(const FillerParameter& param)
-		:
-				Filler<Dtype>(param) {
-		}
-		virtual void Fill(Blob<Dtype>* blob) {
-			CHECK(blob->count());
-			int fan_in = blob->count() / blob->num();
-			int fan_out = blob->count() / blob->channels();
-			Dtype n = fan_in;  // default to fan_in
-			if (this->filler_param_.variance_norm() ==
-					FillerParameter_VarianceNorm_AVERAGE) {
-				n = (fan_in + fan_out) / Dtype(2);
-			} else if (this->filler_param_.variance_norm() ==
-					FillerParameter_VarianceNorm_FAN_OUT) {
-				n = fan_out;
-			}
-			Dtype scale = sqrt(Dtype(3) / n);
-			caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
-					blob->mutable_cpu_data());
-			CHECK_EQ(this->filler_param_.sparse(), -1)
-					<< "Sparsity not supported by this Filler.";
-		}
+  public:
+    explicit XavierFiller(const FillerParameter& param)
+        : Filler<Dtype>(param) {
+    }
+    virtual void Fill(Blob<Dtype>* blob) {
+      CHECK(blob->count());
+      int fan_in = blob->count() / blob->num();
+      int fan_out = blob->count() / blob->channels();
+      Dtype n = fan_in;  // default to fan_in
+      if (this->filler_param_.variance_norm()
+          == FillerParameter_VarianceNorm_AVERAGE) {
+        n = (fan_in + fan_out) / Dtype(2);
+      } else if (this->filler_param_.variance_norm()
+          == FillerParameter_VarianceNorm_FAN_OUT) {
+        n = fan_out;
+      }
+      Dtype scale = sqrt(Dtype(3) / n);
+      caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
+          blob->mutable_cpu_data());
+      CHECK_EQ(this->filler_param_.sparse(), -1)
+          << "Sparsity not supported by this Filler.";
+    }
 };
 
 /**
@@ -200,29 +194,28 @@ class XavierFiller: public Filler<Dtype> {
  */
 template <typename Dtype>
 class MSRAFiller: public Filler<Dtype> {
-	public:
-		explicit MSRAFiller(const FillerParameter& param)
-		:
-				Filler<Dtype>(param) {
-		}
-		virtual void Fill(Blob<Dtype>* blob) {
-			CHECK(blob->count());
-			int fan_in = blob->count() / blob->num();
-			int fan_out = blob->count() / blob->channels();
-			Dtype n = fan_in;  // default to fan_in
-			if (this->filler_param_.variance_norm() ==
-					FillerParameter_VarianceNorm_AVERAGE) {
-				n = (fan_in + fan_out) / Dtype(2);
-			} else if (this->filler_param_.variance_norm() ==
-					FillerParameter_VarianceNorm_FAN_OUT) {
-				n = fan_out;
-			}
-			Dtype std = sqrt(Dtype(2) / n);
-			caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
-					blob->mutable_cpu_data());
-			CHECK_EQ(this->filler_param_.sparse(), -1)
-					<< "Sparsity not supported by this Filler.";
-		}
+  public:
+    explicit MSRAFiller(const FillerParameter& param)
+        : Filler<Dtype>(param) {
+    }
+    virtual void Fill(Blob<Dtype>* blob) {
+      CHECK(blob->count());
+      int fan_in = blob->count() / blob->num();
+      int fan_out = blob->count() / blob->channels();
+      Dtype n = fan_in;  // default to fan_in
+      if (this->filler_param_.variance_norm()
+          == FillerParameter_VarianceNorm_AVERAGE) {
+        n = (fan_in + fan_out) / Dtype(2);
+      } else if (this->filler_param_.variance_norm()
+          == FillerParameter_VarianceNorm_FAN_OUT) {
+        n = fan_out;
+      }
+      Dtype std = sqrt(Dtype(2) / n);
+      caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
+          blob->mutable_cpu_data());
+      CHECK_EQ(this->filler_param_.sparse(), -1)
+          << "Sparsity not supported by this Filler.";
+    }
 };
 
 /*!
@@ -260,25 +253,24 @@ class MSRAFiller: public Filler<Dtype> {
  */
 template <typename Dtype>
 class BilinearFiller: public Filler<Dtype> {
-	public:
-		explicit BilinearFiller(const FillerParameter& param)
-		:
-				Filler<Dtype>(param) {
-		}
-		virtual void Fill(Blob<Dtype>* blob) {
-			CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
-			CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
-			Dtype* data = blob->mutable_cpu_data();
-			int f = ceil(blob->width() / 2.);
-			float c = (2 * f - 1 - f % 2) / (2. * f);
-			for (int i = 0; i < blob->count(); ++i) {
-				float x = i % blob->width();
-				float y = (i / blob->width()) % blob->height();
-				data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
-			}
-			CHECK_EQ(this->filler_param_.sparse(), -1)
-					<< "Sparsity not supported by this Filler.";
-		}
+  public:
+    explicit BilinearFiller(const FillerParameter& param)
+        : Filler<Dtype>(param) {
+    }
+    virtual void Fill(Blob<Dtype>* blob) {
+      CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
+      CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
+      Dtype* data = blob->mutable_cpu_data();
+      int f = ceil(blob->width() / 2.);
+      float c = (2 * f - 1 - f % 2) / (2. * f);
+      for (int i = 0; i < blob->count(); ++i) {
+        float x = i % blob->width();
+        float y = (i / blob->width()) % blob->height();
+        data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
+      }
+      CHECK_EQ(this->filler_param_.sparse(), -1)
+          << "Sparsity not supported by this Filler.";
+    }
 };
 
 /**
@@ -289,25 +281,25 @@ class BilinearFiller: public Filler<Dtype> {
  */
 template <typename Dtype>
 Filler<Dtype>* GetFiller(const FillerParameter& param) {
-	const std::string& type = param.type();
-	if (type == "constant") {
-		return new ConstantFiller<Dtype>(param);
-	} else if (type == "gaussian") {
-		return new GaussianFiller<Dtype>(param);
-	} else if (type == "positive_unitball") {
-		return new PositiveUnitballFiller<Dtype>(param);
-	} else if (type == "uniform") {
-		return new UniformFiller<Dtype>(param);
-	} else if (type == "xavier") {
-		return new XavierFiller<Dtype>(param);
-	} else if (type == "msra") {
-		return new MSRAFiller<Dtype>(param);
-	} else if (type == "bilinear") {
-		return new BilinearFiller<Dtype>(param);
-	} else {
-		CHECK(false) << "Unknown filler name: " << param.type();
-	}
-	return (Filler<Dtype>*) (NULL);
+  const std::string& type = param.type();
+  if (type == "constant") {
+    return new ConstantFiller<Dtype>(param);
+  } else if (type == "gaussian") {
+    return new GaussianFiller<Dtype>(param);
+  } else if (type == "positive_unitball") {
+    return new PositiveUnitballFiller<Dtype>(param);
+  } else if (type == "uniform") {
+    return new UniformFiller<Dtype>(param);
+  } else if (type == "xavier") {
+    return new XavierFiller<Dtype>(param);
+  } else if (type == "msra") {
+    return new MSRAFiller<Dtype>(param);
+  } else if (type == "bilinear") {
+    return new BilinearFiller<Dtype>(param);
+  } else {
+    CHECK(false) << "Unknown filler name: " << param.type();
+  }
+  return (Filler<Dtype>*) (NULL);
 }
 
 }  // namespace caffe
diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp
index 677deea4..dd8ae8bf 100644
--- a/include/caffe/internal_thread.hpp
+++ b/include/caffe/internal_thread.hpp
@@ -19,28 +19,27 @@ namespace caffe {
  * by reimplementing the virutal function InternalThreadEntry.
  */
 class InternalThread {
-	public:
-		InternalThread()
-		:
-				thread_() {
-		}
-		virtual ~InternalThread();
+  public:
+    InternalThread()
+        : thread_() {
+    }
+    virtual ~InternalThread();
 
-		/** Returns true if the thread was successfully started. **/
-		bool StartInternalThread();
+    /** Returns true if the thread was successfully started. **/
+    bool StartInternalThread();
 
-		/** Will not return until the internal thread has exited. */
-		bool WaitForInternalThreadToExit();
+    /** Will not return until the internal thread has exited. */
+    bool WaitForInternalThreadToExit();
 
-		bool is_started() const;
+    bool is_started() const;
 
-	protected:
-		/* Implement this method in your subclass
-		 with the code you want your thread to run. */
-		virtual void InternalThreadEntry() {
-		}
+  protected:
+    /* Implement this method in your subclass
+     with the code you want your thread to run. */
+    virtual void InternalThreadEntry() {
+    }
 
-		shared_ptr<boost::thread> thread_;
+    shared_ptr<boost::thread> thread_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 5651e814..c346ede1 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -25,403 +25,400 @@ namespace caffe {
  */
 template <typename Dtype>
 class Layer {
-	public:
-		/**
-		 * You should not implement your own constructor. Any set up code should go
-		 * to SetUp(), where the dimensions of the bottom blobs are provided to the
-		 * layer.
-		 */
-		explicit Layer(const LayerParameter& param)
-		:
-				layer_param_(param) {
-			// Set phase and copy blobs (if there are any).
-			phase_ = param.phase();
-			if (layer_param_.blobs_size() > 0) {
-				blobs_.resize(layer_param_.blobs_size());
-				for (int i = 0; i < layer_param_.blobs_size(); ++i) {
-					blobs_[i].reset(new Blob<Dtype>());
-					blobs_[i]->FromProto(layer_param_.blobs(i));
-				}
-			}
-		}
-		virtual ~Layer() {
-		}
+  public:
+    /**
+     * You should not implement your own constructor. Any set up code should go
+     * to SetUp(), where the dimensions of the bottom blobs are provided to the
+     * layer.
+     */
+    explicit Layer(const LayerParameter& param)
+        : layer_param_(param) {
+      // Set phase and copy blobs (if there are any).
+      phase_ = param.phase();
+      if (layer_param_.blobs_size() > 0) {
+        blobs_.resize(layer_param_.blobs_size());
+        for (int i = 0; i < layer_param_.blobs_size(); ++i) {
+          blobs_[i].reset(new Blob<Dtype>());
+          blobs_[i]->FromProto(layer_param_.blobs(i));
+        }
+      }
+    }
+    virtual ~Layer() {
+    }
 
-		/**
-		 * @brief Implements common layer setup functionality.
-		 *
-		 * @param bottom the preshaped input blobs
-		 * @param top
-		 *     the allocated but unshaped output blobs, to be shaped by Reshape
-		 *
-		 * Checks that the number of bottom and top blobs is correct.
-		 * Calls LayerSetUp to do special layer setup for individual layer types,
-		 * followed by Reshape to set up sizes of top blobs and internal buffers.
-		 * Sets up the loss weight multiplier blobs for any non-zero loss weights.
-		 * This method may not be overridden.
-		 */
-		void SetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-			CheckBlobCounts(bottom, top);
-			LayerSetUp(bottom, top);
-			Reshape(bottom, top);
-			SetLossWeights(top);
-		}
+    /**
+     * @brief Implements common layer setup functionality.
+     *
+     * @param bottom the preshaped input blobs
+     * @param top
+     *     the allocated but unshaped output blobs, to be shaped by Reshape
+     *
+     * Checks that the number of bottom and top blobs is correct.
+     * Calls LayerSetUp to do special layer setup for individual layer types,
+     * followed by Reshape to set up sizes of top blobs and internal buffers.
+     * Sets up the loss weight multiplier blobs for any non-zero loss weights.
+     * This method may not be overridden.
+     */
+    void SetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+      CheckBlobCounts(bottom, top);
+      LayerSetUp(bottom, top);
+      Reshape(bottom, top);
+      SetLossWeights(top);
+    }
 
-		/**
-		 * @brief Does layer-specific setup: your layer should implement this function
-		 *        as well as Reshape.
-		 *
-		 * @param bottom
-		 *     the preshaped input blobs, whose data fields store the input data for
-		 *     this layer
-		 * @param top
-		 *     the allocated but unshaped output blobs
-		 *
-		 * This method should do one-time layer specific setup. This includes reading
-		 * and processing relevent parameters from the <code>layer_param_</code>.
-		 * Setting up the shapes of top blobs and internal buffers should be done in
-		 * <code>Reshape</code>, which will be called before the forward pass to
-		 * adjust the top blob sizes.
-		 */
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-		}
+    /**
+     * @brief Does layer-specific setup: your layer should implement this function
+     *        as well as Reshape.
+     *
+     * @param bottom
+     *     the preshaped input blobs, whose data fields store the input data for
+     *     this layer
+     * @param top
+     *     the allocated but unshaped output blobs
+     *
+     * This method should do one-time layer specific setup. This includes reading
+     * and processing relevent parameters from the <code>layer_param_</code>.
+     * Setting up the shapes of top blobs and internal buffers should be done in
+     * <code>Reshape</code>, which will be called before the forward pass to
+     * adjust the top blob sizes.
+     */
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+    }
 
-		/**
-		 * @brief Adjust the shapes of top blobs and internal buffers to accomodate
-		 *        the shapes of the bottom blobs.
-		 *
-		 * @param bottom the input blobs, with the requested input shapes
-		 * @param top the top blobs, which should be reshaped as needed
-		 *
-		 * This method should reshape top blobs as needed according to the shapes
-		 * of the bottom (input) blobs, as well as reshaping any internal buffers
-		 * and making any other necessary adjustments so that the layer can
-		 * accomodate the bottom blobs.
-		 */
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) = 0;
+    /**
+     * @brief Adjust the shapes of top blobs and internal buffers to accomodate
+     *        the shapes of the bottom blobs.
+     *
+     * @param bottom the input blobs, with the requested input shapes
+     * @param top the top blobs, which should be reshaped as needed
+     *
+     * This method should reshape top blobs as needed according to the shapes
+     * of the bottom (input) blobs, as well as reshaping any internal buffers
+     * and making any other necessary adjustments so that the layer can
+     * accomodate the bottom blobs.
+     */
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) = 0;
 
-		/**
-		 * @brief Given the bottom blobs, compute the top blobs and the loss.
-		 *
-		 * @param bottom
-		 *     the input blobs, whose data fields store the input data for this layer
-		 * @param top
-		 *     the preshaped output blobs, whose data fields will store this layers'
-		 *     outputs
-		 * \return The total loss from the layer.
-		 *
-		 * The Forward wrapper calls the relevant device wrapper function
-		 * (Forward_cpu or Forward_gpu) to compute the top blob values given the
-		 * bottom blobs.  If the layer has any non-zero loss_weights, the wrapper
-		 * then computes and returns the loss.
-		 *
-		 * Your layer should implement Forward_cpu and (optionally) Forward_gpu.
-		 */
-		inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
+    /**
+     * @brief Given the bottom blobs, compute the top blobs and the loss.
+     *
+     * @param bottom
+     *     the input blobs, whose data fields store the input data for this layer
+     * @param top
+     *     the preshaped output blobs, whose data fields will store this layers'
+     *     outputs
+     * \return The total loss from the layer.
+     *
+     * The Forward wrapper calls the relevant device wrapper function
+     * (Forward_cpu or Forward_gpu) to compute the top blob values given the
+     * bottom blobs.  If the layer has any non-zero loss_weights, the wrapper
+     * then computes and returns the loss.
+     *
+     * Your layer should implement Forward_cpu and (optionally) Forward_gpu.
+     */
+    inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
 
-		/**
-		 * @brief Given the top blob error gradients, compute the bottom blob error
-		 *        gradients.
-		 *
-		 * @param top
-		 *     the output blobs, whose diff fields store the gradient of the error
-		 *     with respect to themselves
-		 * @param propagate_down
-		 *     a vector with equal length to bottom, with each index indicating
-		 *     whether to propagate the error gradients down to the bottom blob at
-		 *     the corresponding index
-		 * @param bottom
-		 *     the input blobs, whose diff fields will store the gradient of the error
-		 *     with respect to themselves after Backward is run
-		 *
-		 * The Backward wrapper calls the relevant device wrapper function
-		 * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the
-		 * top blob diffs.
-		 *
-		 * Your layer should implement Backward_cpu and (optionally) Backward_gpu.
-		 */
-		inline void Backward(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom);
+    /**
+     * @brief Given the top blob error gradients, compute the bottom blob error
+     *        gradients.
+     *
+     * @param top
+     *     the output blobs, whose diff fields store the gradient of the error
+     *     with respect to themselves
+     * @param propagate_down
+     *     a vector with equal length to bottom, with each index indicating
+     *     whether to propagate the error gradients down to the bottom blob at
+     *     the corresponding index
+     * @param bottom
+     *     the input blobs, whose diff fields will store the gradient of the error
+     *     with respect to themselves after Backward is run
+     *
+     * The Backward wrapper calls the relevant device wrapper function
+     * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the
+     * top blob diffs.
+     *
+     * Your layer should implement Backward_cpu and (optionally) Backward_gpu.
+     */
+    inline void Backward(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
-		/**
-		 * @brief Returns the vector of learnable parameter blobs.
-		 */
-		vector<shared_ptr<Blob<Dtype> > >& blobs() {
-			return blobs_;
-		}
+    /**
+     * @brief Returns the vector of learnable parameter blobs.
+     */
+    vector<shared_ptr<Blob<Dtype> > >& blobs() {
+      return blobs_;
+    }
 
-		/**
-		 * @brief Returns the layer parameter.
-		 */
-		const LayerParameter& layer_param() const {
-			return layer_param_;
-		}
+    /**
+     * @brief Returns the layer parameter.
+     */
+    const LayerParameter& layer_param() const {
+      return layer_param_;
+    }
 
-		/**
-		 * @brief Writes the layer parameter to a protocol buffer
-		 */
-		virtual void ToProto(LayerParameter* param, bool write_diff = false);
+    /**
+     * @brief Writes the layer parameter to a protocol buffer
+     */
+    virtual void ToProto(LayerParameter* param, bool write_diff = false);
 
-		/**
-		 * @brief Returns the scalar loss associated with a top blob at a given index.
-		 */
-		inline Dtype loss(const int top_index) const {
-			return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0);
-		}
+    /**
+     * @brief Returns the scalar loss associated with a top blob at a given index.
+     */
+    inline Dtype loss(const int top_index) const {
+      return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0);
+    }
 
-		/**
-		 * @brief Sets the loss associated with a top blob at a given index.
-		 */
-		inline void set_loss(const int top_index, const Dtype value) {
-			if (loss_.size() <= top_index) {
-				loss_.resize(top_index + 1, Dtype(0));
-			}
-			loss_[top_index] = value;
-		}
+    /**
+     * @brief Sets the loss associated with a top blob at a given index.
+     */
+    inline void set_loss(const int top_index, const Dtype value) {
+      if (loss_.size() <= top_index) {
+        loss_.resize(top_index + 1, Dtype(0));
+      }
+      loss_[top_index] = value;
+    }
 
-		/**
-		 * @brief Returns the layer type.
-		 */
-		virtual inline const char* type() const {
-			return "";
-		}
+    /**
+     * @brief Returns the layer type.
+     */
+    virtual inline const char* type() const {
+      return "";
+    }
 
-		/**
-		 * @brief Returns the exact number of bottom blobs required by the layer,
-		 *        or -1 if no exact number is required.
-		 *
-		 * This method should be overridden to return a non-negative value if your
-		 * layer expects some exact number of bottom blobs.
-		 */
-		virtual inline int ExactNumBottomBlobs() const {
-			return -1;
-		}
-		/**
-		 * @brief Returns the minimum number of bottom blobs required by the layer,
-		 *        or -1 if no minimum number is required.
-		 *
-		 * This method should be overridden to return a non-negative value if your
-		 * layer expects some minimum number of bottom blobs.
-		 */
-		virtual inline int MinBottomBlobs() const {
-			return -1;
-		}
-		/**
-		 * @brief Returns the maximum number of bottom blobs required by the layer,
-		 *        or -1 if no maximum number is required.
-		 *
-		 * This method should be overridden to return a non-negative value if your
-		 * layer expects some maximum number of bottom blobs.
-		 */
-		virtual inline int MaxBottomBlobs() const {
-			return -1;
-		}
-		/**
-		 * @brief Returns the exact number of top blobs required by the layer,
-		 *        or -1 if no exact number is required.
-		 *
-		 * This method should be overridden to return a non-negative value if your
-		 * layer expects some exact number of top blobs.
-		 */
-		virtual inline int ExactNumTopBlobs() const {
-			return -1;
-		}
-		/**
-		 * @brief Returns the minimum number of top blobs required by the layer,
-		 *        or -1 if no minimum number is required.
-		 *
-		 * This method should be overridden to return a non-negative value if your
-		 * layer expects some minimum number of top blobs.
-		 */
-		virtual inline int MinTopBlobs() const {
-			return -1;
-		}
-		/**
-		 * @brief Returns the maximum number of top blobs required by the layer,
-		 *        or -1 if no maximum number is required.
-		 *
-		 * This method should be overridden to return a non-negative value if your
-		 * layer expects some maximum number of top blobs.
-		 */
-		virtual inline int MaxTopBlobs() const {
-			return -1;
-		}
-		/**
-		 * @brief Returns true if the layer requires an equal number of bottom and
-		 *        top blobs.
-		 *
-		 * This method should be overridden to return true if your layer expects an
-		 * equal number of bottom and top blobs.
-		 */
-		virtual inline bool EqualNumBottomTopBlobs() const {
-			return false;
-		}
+    /**
+     * @brief Returns the exact number of bottom blobs required by the layer,
+     *        or -1 if no exact number is required.
+     *
+     * This method should be overridden to return a non-negative value if your
+     * layer expects some exact number of bottom blobs.
+     */
+    virtual inline int ExactNumBottomBlobs() const {
+      return -1;
+    }
+    /**
+     * @brief Returns the minimum number of bottom blobs required by the layer,
+     *        or -1 if no minimum number is required.
+     *
+     * This method should be overridden to return a non-negative value if your
+     * layer expects some minimum number of bottom blobs.
+     */
+    virtual inline int MinBottomBlobs() const {
+      return -1;
+    }
+    /**
+     * @brief Returns the maximum number of bottom blobs required by the layer,
+     *        or -1 if no maximum number is required.
+     *
+     * This method should be overridden to return a non-negative value if your
+     * layer expects some maximum number of bottom blobs.
+     */
+    virtual inline int MaxBottomBlobs() const {
+      return -1;
+    }
+    /**
+     * @brief Returns the exact number of top blobs required by the layer,
+     *        or -1 if no exact number is required.
+     *
+     * This method should be overridden to return a non-negative value if your
+     * layer expects some exact number of top blobs.
+     */
+    virtual inline int ExactNumTopBlobs() const {
+      return -1;
+    }
+    /**
+     * @brief Returns the minimum number of top blobs required by the layer,
+     *        or -1 if no minimum number is required.
+     *
+     * This method should be overridden to return a non-negative value if your
+     * layer expects some minimum number of top blobs.
+     */
+    virtual inline int MinTopBlobs() const {
+      return -1;
+    }
+    /**
+     * @brief Returns the maximum number of top blobs required by the layer,
+     *        or -1 if no maximum number is required.
+     *
+     * This method should be overridden to return a non-negative value if your
+     * layer expects some maximum number of top blobs.
+     */
+    virtual inline int MaxTopBlobs() const {
+      return -1;
+    }
+    /**
+     * @brief Returns true if the layer requires an equal number of bottom and
+     *        top blobs.
+     *
+     * This method should be overridden to return true if your layer expects an
+     * equal number of bottom and top blobs.
+     */
+    virtual inline bool EqualNumBottomTopBlobs() const {
+      return false;
+    }
 
-		/**
-		 * @brief Return whether "anonymous" top blobs are created automatically
-		 *        by the layer.
-		 *
-		 * If this method returns true, Net::Init will create enough "anonymous" top
-		 * blobs to fulfill the requirement specified by ExactNumTopBlobs() or
-		 * MinTopBlobs().
-		 */
-		virtual inline bool AutoTopBlobs() const {
-			return false;
-		}
+    /**
+     * @brief Return whether "anonymous" top blobs are created automatically
+     *        by the layer.
+     *
+     * If this method returns true, Net::Init will create enough "anonymous" top
+     * blobs to fulfill the requirement specified by ExactNumTopBlobs() or
+     * MinTopBlobs().
+     */
+    virtual inline bool AutoTopBlobs() const {
+      return false;
+    }
 
-		/**
-		 * @brief Return whether to allow force_backward for a given bottom blob
-		 *        index.
-		 *
-		 * If AllowForceBackward(i) == false, we will ignore the force_backward
-		 * setting and backpropagate to blob i only if it needs gradient information
-		 * (as is done when force_backward == false).
-		 */
-		virtual inline bool AllowForceBackward(const int bottom_index) const {
-			return true;
-		}
+    /**
+     * @brief Return whether to allow force_backward for a given bottom blob
+     *        index.
+     *
+     * If AllowForceBackward(i) == false, we will ignore the force_backward
+     * setting and backpropagate to blob i only if it needs gradient information
+     * (as is done when force_backward == false).
+     */
+    virtual inline bool AllowForceBackward(const int bottom_index) const {
+      return true;
+    }
 
-		/**
-		 * @brief Specifies whether the layer should compute gradients w.r.t. a
-		 *        parameter at a particular index given by param_id.
-		 *
-		 * You can safely ignore false values and always compute gradients
-		 * for all parameters, but possibly with wasteful computation.
-		 */
-		inline bool param_propagate_down(const int param_id) {
-			return
-			(param_propagate_down_.size() > param_id) ?
-					param_propagate_down_[param_id] : false;
-		}
-		/**
-		 * @brief Sets whether the layer should compute gradients w.r.t. a
-		 *        parameter at a particular index given by param_id.
-		 */
-		inline void set_param_propagate_down(const int param_id, const bool value) {
-			if (param_propagate_down_.size() <= param_id) {
-				param_propagate_down_.resize(param_id + 1, true);
-			}
-			param_propagate_down_[param_id] = value;
-		}
+    /**
+     * @brief Specifies whether the layer should compute gradients w.r.t. a
+     *        parameter at a particular index given by param_id.
+     *
+     * You can safely ignore false values and always compute gradients
+     * for all parameters, but possibly with wasteful computation.
+     */
+    inline bool param_propagate_down(const int param_id) {
+      return
+          (param_propagate_down_.size() > param_id) ?
+              param_propagate_down_[param_id] : false;
+    }
+    /**
+     * @brief Sets whether the layer should compute gradients w.r.t. a
+     *        parameter at a particular index given by param_id.
+     */
+    inline void set_param_propagate_down(const int param_id, const bool value) {
+      if (param_propagate_down_.size() <= param_id) {
+        param_propagate_down_.resize(param_id + 1, true);
+      }
+      param_propagate_down_[param_id] = value;
+    }
 
-	protected:
-		/** The protobuf that stores the layer parameters */
-		LayerParameter layer_param_;
-		/** The phase: TRAIN or TEST */
-		Phase phase_;
-		/** The vector that stores the learnable parameters as a set of blobs. */
-		vector<shared_ptr<Blob<Dtype> > > blobs_;
-		/** Vector indicating whether to compute the diff of each param blob. */
-		vector<bool> param_propagate_down_;
+  protected:
+    /** The protobuf that stores the layer parameters */
+    LayerParameter layer_param_;
+    /** The phase: TRAIN or TEST */
+    Phase phase_;
+    /** The vector that stores the learnable parameters as a set of blobs. */
+    vector<shared_ptr<Blob<Dtype> > > blobs_;
+    /** Vector indicating whether to compute the diff of each param blob. */
+    vector<bool> param_propagate_down_;
 
-		/** The vector that indicates whether each top blob has a non-zero weight in
-		 *  the objective function. */
-		vector<Dtype> loss_;
+    /** The vector that indicates whether each top blob has a non-zero weight in
+     *  the objective function. */
+    vector<Dtype> loss_;
 
-		/** @brief Using the CPU device, compute the layer output. */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) = 0;
-		/**
-		 * @brief Using the GPU device, compute the layer output.
-		 *        Fall back to Forward_cpu() if unavailable.
-		 */
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-			// LOG(WARNING) << "Using CPU code as backup.";
-			return Forward_cpu(bottom, top);
-		}
+    /** @brief Using the CPU device, compute the layer output. */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) = 0;
+    /**
+     * @brief Using the GPU device, compute the layer output.
+     *        Fall back to Forward_cpu() if unavailable.
+     */
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+      // LOG(WARNING) << "Using CPU code as backup.";
+      return Forward_cpu(bottom, top);
+    }
 
-		/**
-		 * @brief Using the CPU device, compute the gradients for any parameters and
-		 *        for the bottom blobs if propagate_down is true.
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) = 0;
-		/**
-		 * @brief Using the GPU device, compute the gradients for any parameters and
-		 *        for the bottom blobs if propagate_down is true.
-		 *        Fall back to Backward_cpu() if unavailable.
-		 */
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-			// LOG(WARNING) << "Using CPU code as backup.";
-			Backward_cpu(top, propagate_down, bottom);
-		}
+    /**
+     * @brief Using the CPU device, compute the gradients for any parameters and
+     *        for the bottom blobs if propagate_down is true.
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) = 0;
+    /**
+     * @brief Using the GPU device, compute the gradients for any parameters and
+     *        for the bottom blobs if propagate_down is true.
+     *        Fall back to Backward_cpu() if unavailable.
+     */
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+      // LOG(WARNING) << "Using CPU code as backup.";
+      Backward_cpu(top, propagate_down, bottom);
+    }
 
-		/**
-		 * Called by the parent Layer's SetUp to check that the number of bottom
-		 * and top Blobs provided as input match the expected numbers specified by
-		 * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions.
-		 */
-		virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-			if (ExactNumBottomBlobs() >= 0) {
-				CHECK_EQ(ExactNumBottomBlobs(), bottom.size())
-						<< type() << " Layer takes " << ExactNumBottomBlobs()
-						<< " bottom blob(s) as input.";
-			}
-			if (MinBottomBlobs() >= 0) {
-				CHECK_LE(MinBottomBlobs(), bottom.size())
-						<< type() << " Layer takes at least " << MinBottomBlobs()
-						<< " bottom blob(s) as input.";
-			}
-			if (MaxBottomBlobs() >= 0) {
-				CHECK_GE(MaxBottomBlobs(), bottom.size())
-						<< type() << " Layer takes at most " << MaxBottomBlobs()
-						<< " bottom blob(s) as input.";
-			}
-			if (ExactNumTopBlobs() >= 0) {
-				CHECK_EQ(ExactNumTopBlobs(), top.size())
-						<< type() << " Layer produces " << ExactNumTopBlobs()
-						<< " top blob(s) as output.";
-			}
-			if (MinTopBlobs() >= 0) {
-				CHECK_LE(MinTopBlobs(), top.size())
-						<< type() << " Layer produces at least " << MinTopBlobs()
-						<< " top blob(s) as output.";
-			}
-			if (MaxTopBlobs() >= 0) {
-				CHECK_GE(MaxTopBlobs(), top.size())
-						<< type() << " Layer produces at most " << MaxTopBlobs()
-						<< " top blob(s) as output.";
-			}
-			if (EqualNumBottomTopBlobs()) {
-				CHECK_EQ(bottom.size(), top.size())
-						<< type() << " Layer produces one top blob as output for each "
-						<< "bottom blob input.";
-			}
-		}
+    /**
+     * Called by the parent Layer's SetUp to check that the number of bottom
+     * and top Blobs provided as input match the expected numbers specified by
+     * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions.
+     */
+    virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+      if (ExactNumBottomBlobs() >= 0) {
+        CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) << type()
+            << " Layer takes " << ExactNumBottomBlobs()
+            << " bottom blob(s) as input.";
+      }
+      if (MinBottomBlobs() >= 0) {
+        CHECK_LE(MinBottomBlobs(), bottom.size()) << type()
+            << " Layer takes at least " << MinBottomBlobs()
+            << " bottom blob(s) as input.";
+      }
+      if (MaxBottomBlobs() >= 0) {
+        CHECK_GE(MaxBottomBlobs(), bottom.size()) << type()
+            << " Layer takes at most " << MaxBottomBlobs()
+            << " bottom blob(s) as input.";
+      }
+      if (ExactNumTopBlobs() >= 0) {
+        CHECK_EQ(ExactNumTopBlobs(), top.size()) << type() << " Layer produces "
+            << ExactNumTopBlobs() << " top blob(s) as output.";
+      }
+      if (MinTopBlobs() >= 0) {
+        CHECK_LE(MinTopBlobs(), top.size()) << type()
+            << " Layer produces at least " << MinTopBlobs()
+            << " top blob(s) as output.";
+      }
+      if (MaxTopBlobs() >= 0) {
+        CHECK_GE(MaxTopBlobs(), top.size()) << type()
+            << " Layer produces at most " << MaxTopBlobs()
+            << " top blob(s) as output.";
+      }
+      if (EqualNumBottomTopBlobs()) {
+        CHECK_EQ(bottom.size(), top.size()) << type()
+            << " Layer produces one top blob as output for each "
+            << "bottom blob input.";
+      }
+    }
 
-		/**
-		 * Called by SetUp to initialize the weights associated with any top blobs in
-		 * the loss function. Store non-zero loss weights in the diff blob.
-		 */
-		inline void SetLossWeights(const vector<Blob<Dtype>*>& top) {
-			const int num_loss_weights = layer_param_.loss_weight_size();
-			if (num_loss_weights) {
-				CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
-						"unspecified or specified once per top blob.";
-				for (int top_id = 0; top_id < top.size(); ++top_id) {
-					const Dtype loss_weight = layer_param_.loss_weight(top_id);
-					if (loss_weight == Dtype(0)) {
-						continue;
-					}
-					this->set_loss(top_id, loss_weight);
-					const int count = top[top_id]->count();
-					Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff();
-					caffe_set(count, loss_weight, loss_multiplier);
-				}
-			}
-		}
+    /**
+     * Called by SetUp to initialize the weights associated with any top blobs in
+     * the loss function. Store non-zero loss weights in the diff blob.
+     */
+    inline void SetLossWeights(const vector<Blob<Dtype>*>& top) {
+      const int num_loss_weights = layer_param_.loss_weight_size();
+      if (num_loss_weights) {
+        CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
+            "unspecified or specified once per top blob.";
+        for (int top_id = 0; top_id < top.size(); ++top_id) {
+          const Dtype loss_weight = layer_param_.loss_weight(top_id);
+          if (loss_weight == Dtype(0)) {
+            continue;
+          }
+          this->set_loss(top_id, loss_weight);
+          const int count = top[top_id]->count();
+          Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff();
+          caffe_set(count, loss_weight, loss_multiplier);
+        }
+      }
+    }
 
-		DISABLE_COPY_AND_ASSIGN (Layer);
+    DISABLE_COPY_AND_ASSIGN (Layer);
 };
 // class Layer
 
@@ -430,69 +427,68 @@ class Layer {
 // functions.
 template <typename Dtype>
 inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	Dtype loss = 0;
-	Reshape(bottom, top);
-	switch (Caffe::mode()) {
-		case Caffe::CPU:
-			Forward_cpu(bottom, top);
-			for (int top_id = 0; top_id < top.size(); ++top_id) {
-				if (!this->loss(top_id)) {
-					continue;
-				}
-				const int count = top[top_id]->count();
-				const Dtype* data = top[top_id]->cpu_data();
-				const Dtype* loss_weights = top[top_id]->cpu_diff();
-				loss += caffe_cpu_dot(count, data, loss_weights);
-			}
-			break;
-		case Caffe::GPU:
-			Forward_gpu(bottom, top);
+    const vector<Blob<Dtype>*>& top) {
+  Dtype loss = 0;
+  Reshape(bottom, top);
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    Forward_cpu(bottom, top);
+    for (int top_id = 0; top_id < top.size(); ++top_id) {
+      if (!this->loss(top_id)) {
+        continue;
+      }
+      const int count = top[top_id]->count();
+      const Dtype* data = top[top_id]->cpu_data();
+      const Dtype* loss_weights = top[top_id]->cpu_diff();
+      loss += caffe_cpu_dot(count, data, loss_weights);
+    }
+    break;
+  case Caffe::GPU:
+    Forward_gpu(bottom, top);
 #ifndef CPU_ONLY
-			for (int top_id = 0; top_id < top.size(); ++top_id) {
-				if (!this->loss(top_id)) {
-					continue;
-				}
-				const int count = top[top_id]->count();
-				const Dtype* data = top[top_id]->gpu_data();
-				const Dtype* loss_weights = top[top_id]->gpu_diff();
-				Dtype blob_loss = 0;
-				caffe_gpu_dot(count, data, loss_weights, &blob_loss);
-				loss += blob_loss;
-			}
+    for (int top_id = 0; top_id < top.size(); ++top_id) {
+      if (!this->loss(top_id)) {
+        continue;
+      }
+      const int count = top[top_id]->count();
+      const Dtype* data = top[top_id]->gpu_data();
+      const Dtype* loss_weights = top[top_id]->gpu_diff();
+      Dtype blob_loss = 0;
+      caffe_gpu_dot(count, data, loss_weights, &blob_loss);
+      loss += blob_loss;
+    }
 #endif
-			break;
-		default:
-			LOG(FATAL) << "Unknown caffe mode.";
-	}
-	return loss;
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+  }
+  return loss;
 }
 
 template <typename Dtype>
 inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	switch (Caffe::mode()) {
-		case Caffe::CPU:
-			Backward_cpu(top, propagate_down, bottom);
-			break;
-		case Caffe::GPU:
-			Backward_gpu(top, propagate_down, bottom);
-			break;
-		default:
-			LOG(FATAL) << "Unknown caffe mode.";
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    Backward_cpu(top, propagate_down, bottom);
+    break;
+  case Caffe::GPU:
+    Backward_gpu(top, propagate_down, bottom);
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+  }
 }
 
 // Serialize LayerParameter to protocol buffer
 template <typename Dtype>
 void Layer<Dtype>::ToProto(LayerParameter* param, bool write_diff) {
-	param->Clear();
-	param->CopyFrom(layer_param_);
-	param->clear_blobs();
-	for (int i = 0; i < blobs_.size(); ++i) {
-		blobs_[i]->ToProto(param->add_blobs(), write_diff);
-	}
+  param->Clear();
+  param->CopyFrom(layer_param_);
+  param->clear_blobs();
+  for (int i = 0; i < blobs_.size(); ++i) {
+    blobs_[i]->ToProto(param->add_blobs(), write_diff);
+  }
 }
 
 }  // namespace caffe
diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp
index b64b9eb2..6da8d315 100644
--- a/include/caffe/layer_factory.hpp
+++ b/include/caffe/layer_factory.hpp
@@ -52,61 +52,61 @@ class Layer;
 
 template <typename Dtype>
 class LayerRegistry {
-	public:
-		typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
-		typedef std::map<string, Creator> CreatorRegistry;
-
-		static CreatorRegistry& Registry() {
-			static CreatorRegistry* g_registry_ = new CreatorRegistry();
-			return *g_registry_;
-		}
-
-		// Adds a creator.
-		static void AddCreator(const string& type, Creator creator) {
-			CreatorRegistry& registry = Registry();
-			CHECK_EQ(registry.count(type), 0)
-					<< "Layer type " << type << " already registered.";
-			registry[type] = creator;
-		}
-
-		// Get a layer using a LayerParameter.
-		static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
-			LOG(INFO) << "Creating layer " << param.name();
-			const string& type = param.type();
-			CreatorRegistry& registry = Registry();
-			CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
-					<< " (known types: " << LayerTypeList() << ")";
-			return registry[type](param);
-		}
-
-	private:
-		// Layer registry should never be instantiated - everything is done with its
-		// static variables.
-		LayerRegistry() {
-		}
-
-		static string LayerTypeList() {
-			CreatorRegistry& registry = Registry();
-			string layer_types;
-			for (typename CreatorRegistry::iterator iter = registry.begin();
-					iter != registry.end(); ++iter) {
-				if (iter != registry.begin()) {
-					layer_types += ", ";
-				}
-				layer_types += iter->first;
-			}
-			return layer_types;
-		}
+  public:
+    typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
+    typedef std::map<string, Creator> CreatorRegistry;
+
+    static CreatorRegistry& Registry() {
+      static CreatorRegistry* g_registry_ = new CreatorRegistry();
+      return *g_registry_;
+    }
+
+    // Adds a creator.
+    static void AddCreator(const string& type, Creator creator) {
+      CreatorRegistry& registry = Registry();
+      CHECK_EQ(registry.count(type), 0) << "Layer type " << type
+          << " already registered.";
+      registry[type] = creator;
+    }
+
+    // Get a layer using a LayerParameter.
+    static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
+      LOG(INFO) << "Creating layer " << param.name();
+      const string& type = param.type();
+      CreatorRegistry& registry = Registry();
+      CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
+          << " (known types: " << LayerTypeList() << ")";
+      return registry[type](param);
+    }
+
+  private:
+    // Layer registry should never be instantiated - everything is done with its
+    // static variables.
+    LayerRegistry() {
+    }
+
+    static string LayerTypeList() {
+      CreatorRegistry& registry = Registry();
+      string layer_types;
+      for (typename CreatorRegistry::iterator iter = registry.begin();
+          iter != registry.end(); ++iter) {
+        if (iter != registry.begin()) {
+          layer_types += ", ";
+        }
+        layer_types += iter->first;
+      }
+      return layer_types;
+    }
 };
 
 template <typename Dtype>
 class LayerRegisterer {
-	public:
-		LayerRegisterer(const string& type,
-				shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
-			// LOG(INFO) << "Registering layer type: " << type;
-			LayerRegistry<Dtype>::AddCreator(type, creator);
-		}
+  public:
+    LayerRegisterer(const string& type,
+        shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
+      // LOG(INFO) << "Registering layer type: " << type;
+      LayerRegistry<Dtype>::AddCreator(type, creator);
+    }
 };
 
 #define REGISTER_LAYER_CREATOR(type, creator)                                  \
diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index 766645b5..431bd8ea 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -21,81 +21,80 @@ const float kLOG_THRESHOLD = 1e-20;
  */
 template <typename Dtype>
 class AccuracyLayer: public Layer<Dtype> {
-	public:
-		/**
-		 * @param param provides AccuracyParameter accuracy_param,
-		 *     with AccuracyLayer options:
-		 *   - top_k (\b optional, default 1).
-		 *     Sets the maximum rank @f$ k @f$ at which a prediction is considered
-		 *     correct.  For example, if @f$ k = 5 @f$, a prediction is counted
-		 *     correct if the correct label is among the top 5 predicted labels.
-		 */
-		explicit AccuracyLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Accuracy";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 2;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the predictions @f$ x @f$, a Blob with values in
-		 *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
-		 *      the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted
-		 *      label @f$ \hat{l}_n @f$ given by its maximal index:
-		 *      @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$
-		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-		 *      the labels @f$ l @f$, an integer-valued Blob with values
-		 *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
-		 *      indicating the correct class label among the @f$ K @f$ classes
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-		 *      the computed accuracy: @f$
-		 *        \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \}
-		 *      @f$, where @f$
-		 *      \delta\{\mathrm{condition}\} = \left\{
-		 *         \begin{array}{lr}
-		 *            1 & \mbox{if condition} \\
+  public:
+    /**
+     * @param param provides AccuracyParameter accuracy_param,
+     *     with AccuracyLayer options:
+     *   - top_k (\b optional, default 1).
+     *     Sets the maximum rank @f$ k @f$ at which a prediction is considered
+     *     correct.  For example, if @f$ k = 5 @f$, a prediction is counted
+     *     correct if the correct label is among the top 5 predicted labels.
+     */
+    explicit AccuracyLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Accuracy";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 2;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the predictions @f$ x @f$, a Blob with values in
+     *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+     *      the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted
+     *      label @f$ \hat{l}_n @f$ given by its maximal index:
+     *      @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$
+     *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+     *      the labels @f$ l @f$, an integer-valued Blob with values
+     *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+     *      indicating the correct class label among the @f$ K @f$ classes
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+     *      the computed accuracy: @f$
+     *        \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \}
+     *      @f$, where @f$
+     *      \delta\{\mathrm{condition}\} = \left\{
+     *         \begin{array}{lr}
+     *            1 & \mbox{if condition} \\
    *            0 & \mbox{otherwise}
-		 *         \end{array} \right.
-		 *      @f$
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-			for (int i = 0; i < propagate_down.size(); ++i) {
-				if (propagate_down[i]) {
-					NOT_IMPLEMENTED;
-				}
-			}
-		}
-
-		int label_axis_, outer_num_, inner_num_;
-
-		int top_k_;
-
-		/// Whether to ignore instances with a certain label.
-		bool has_ignore_label_;
-		/// The label indicating that an instance should be ignored.
-		int ignore_label_;
+     *         \end{array} \right.
+     *      @f$
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+      for (int i = 0; i < propagate_down.size(); ++i) {
+        if (propagate_down[i]) {
+          NOT_IMPLEMENTED;
+        }
+      }
+    }
+
+    int label_axis_, outer_num_, inner_num_;
+
+    int top_k_;
+
+    /// Whether to ignore instances with a certain label.
+    bool has_ignore_label_;
+    /// The label indicating that an instance should be ignored.
+    int ignore_label_;
 };
 
 /**
@@ -108,39 +107,38 @@ class AccuracyLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class LossLayer: public Layer<Dtype> {
-	public:
-		explicit LossLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(
-				const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(
-				const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-
-		virtual inline int ExactNumBottomBlobs() const {
-			return 2;
-		}
-
-		/**
-		 * @brief For convenience and backwards compatibility, instruct the Net to
-		 *        automatically allocate a single top Blob for LossLayers, into which
-		 *        they output their singleton loss, (even if the user didn't specify
-		 *        one in the prototxt, etc.).
-		 */
-		virtual inline bool AutoTopBlobs() const {
-			return true;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-		/**
-		 * We usually cannot backpropagate to the labels; ignore force_backward for
-		 * these inputs.
-		 */
-		virtual inline bool AllowForceBackward(const int bottom_index) const {
-			return bottom_index != 1;
-		}
+  public:
+    explicit LossLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline int ExactNumBottomBlobs() const {
+      return 2;
+    }
+
+    /**
+     * @brief For convenience and backwards compatibility, instruct the Net to
+     *        automatically allocate a single top Blob for LossLayers, into which
+     *        they output their singleton loss, (even if the user didn't specify
+     *        one in the prototxt, etc.).
+     */
+    virtual inline bool AutoTopBlobs() const {
+      return true;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+    /**
+     * We usually cannot backpropagate to the labels; ignore force_backward for
+     * these inputs.
+     */
+    virtual inline bool AllowForceBackward(const int bottom_index) const {
+      return bottom_index != 1;
+    }
 };
 
 /**
@@ -169,69 +167,68 @@ class LossLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class ContrastiveLossLayer: public LossLayer<Dtype> {
-	public:
-		explicit ContrastiveLossLayer(const LayerParameter& param)
-		:
-				LossLayer<Dtype>(param), diff_() {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline int ExactNumBottomBlobs() const {
-			return 3;
-		}
-		virtual inline const char* type() const {
-			return "ContrastiveLoss";
-		}
-		/**
-		 * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate
-		 * to the first two inputs.
-		 */
-		virtual inline bool AllowForceBackward(const int bottom_index) const {
-			return bottom_index != 2;
-		}
-
-	protected:
-		/// @copydoc ContrastiveLossLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the Contrastive error gradient w.r.t. the inputs.
-		 *
-		 * Computes the gradients with respect to the two input vectors (bottom[0] and
-		 * bottom[1]), but not the similarity label (bottom[2]).
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
-		 *      @f$\ell_i@f$ in the overall Net loss
-		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
-		 *      other layer of the Net.)
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times 1 \times 1) @f$
-		 *      the features @f$a@f$; Backward fills their diff with
-		 *      gradients if propagate_down[0]
-		 *   -# @f$ (N \times C \times 1 \times 1) @f$
-		 *      the features @f$b@f$; Backward fills their diff with gradients if
-		 *      propagate_down[1]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		Blob<Dtype> diff_;  // cached for backward pass
-		Blob<Dtype> dist_sq_;  // cached for backward pass
-		Blob<Dtype> diff_sq_;  // tmp storage for gpu forward pass
-		Blob<Dtype> summer_vec_;  // tmp storage for gpu forward pass
+  public:
+    explicit ContrastiveLossLayer(const LayerParameter& param)
+        : LossLayer<Dtype>(param), diff_() {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline int ExactNumBottomBlobs() const {
+      return 3;
+    }
+    virtual inline const char* type() const {
+      return "ContrastiveLoss";
+    }
+    /**
+     * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate
+     * to the first two inputs.
+     */
+    virtual inline bool AllowForceBackward(const int bottom_index) const {
+      return bottom_index != 2;
+    }
+
+  protected:
+    /// @copydoc ContrastiveLossLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the Contrastive error gradient w.r.t. the inputs.
+     *
+     * Computes the gradients with respect to the two input vectors (bottom[0] and
+     * bottom[1]), but not the similarity label (bottom[2]).
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+     *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+     *      as @f$ \lambda @f$ is the coefficient of this layer's output
+     *      @f$\ell_i@f$ in the overall Net loss
+     *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+     *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+     *      (*Assuming that this top Blob is not used as a bottom (input) by any
+     *      other layer of the Net.)
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times 1 \times 1) @f$
+     *      the features @f$a@f$; Backward fills their diff with
+     *      gradients if propagate_down[0]
+     *   -# @f$ (N \times C \times 1 \times 1) @f$
+     *      the features @f$b@f$; Backward fills their diff with gradients if
+     *      propagate_down[1]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    Blob<Dtype> diff_;  // cached for backward pass
+    Blob<Dtype> dist_sq_;  // cached for backward pass
+    Blob<Dtype> diff_sq_;  // tmp storage for gpu forward pass
+    Blob<Dtype> summer_vec_;  // tmp storage for gpu forward pass
 };
 
 /**
@@ -262,71 +259,70 @@ class ContrastiveLossLayer: public LossLayer<Dtype> {
  */
 template <typename Dtype>
 class EuclideanLossLayer: public LossLayer<Dtype> {
-	public:
-		explicit EuclideanLossLayer(const LayerParameter& param)
-		:
-				LossLayer<Dtype>(param), diff_() {
-		}
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "EuclideanLoss";
-		}
-		/**
-		 * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate
-		 * to both inputs -- override to return true and always allow force_backward.
-		 */
-		virtual inline bool AllowForceBackward(const int bottom_index) const {
-			return true;
-		}
-
-	protected:
-		/// @copydoc EuclideanLossLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the Euclidean error gradient w.r.t. the inputs.
-		 *
-		 * Unlike other children of LossLayer, EuclideanLossLayer \b can compute
-		 * gradients with respect to the label inputs bottom[1] (but still only will
-		 * if propagate_down[1] is set, due to being produced by learnable parameters
-		 * or if force_backward is set). In fact, this layer is "commutative" -- the
-		 * result is the same regardless of the order of the two bottoms.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
-		 *      @f$\ell_i@f$ in the overall Net loss
-		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
-		 *      other layer of the Net.)
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the predictions @f$\hat{y}@f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial \hat{y}} =
-		 *            \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n)
-		 *      @f$ if propagate_down[0]
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the targets @f$y@f$; Backward fills their diff with gradients
-		 *      @f$ \frac{\partial E}{\partial y} =
-		 *          \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n)
-		 *      @f$ if propagate_down[1]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		Blob<Dtype> diff_;
+  public:
+    explicit EuclideanLossLayer(const LayerParameter& param)
+        : LossLayer<Dtype>(param), diff_() {
+    }
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "EuclideanLoss";
+    }
+    /**
+     * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate
+     * to both inputs -- override to return true and always allow force_backward.
+     */
+    virtual inline bool AllowForceBackward(const int bottom_index) const {
+      return true;
+    }
+
+  protected:
+    /// @copydoc EuclideanLossLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the Euclidean error gradient w.r.t. the inputs.
+     *
+     * Unlike other children of LossLayer, EuclideanLossLayer \b can compute
+     * gradients with respect to the label inputs bottom[1] (but still only will
+     * if propagate_down[1] is set, due to being produced by learnable parameters
+     * or if force_backward is set). In fact, this layer is "commutative" -- the
+     * result is the same regardless of the order of the two bottoms.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+     *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+     *      as @f$ \lambda @f$ is the coefficient of this layer's output
+     *      @f$\ell_i@f$ in the overall Net loss
+     *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+     *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+     *      (*Assuming that this top Blob is not used as a bottom (input) by any
+     *      other layer of the Net.)
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the predictions @f$\hat{y}@f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial \hat{y}} =
+     *            \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n)
+     *      @f$ if propagate_down[0]
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the targets @f$y@f$; Backward fills their diff with gradients
+     *      @f$ \frac{\partial E}{\partial y} =
+     *          \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n)
+     *      @f$ if propagate_down[1]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    Blob<Dtype> diff_;
 };
 
 /**
@@ -374,50 +370,49 @@ class EuclideanLossLayer: public LossLayer<Dtype> {
  */
 template <typename Dtype>
 class HingeLossLayer: public LossLayer<Dtype> {
-	public:
-		explicit HingeLossLayer(const LayerParameter& param)
-		:
-				LossLayer<Dtype>(param) {
-		}
-
-		virtual inline const char* type() const {
-			return "HingeLoss";
-		}
-
-	protected:
-		/// @copydoc HingeLossLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the hinge loss error gradient w.r.t. the predictions.
-		 *
-		 * Gradients cannot be computed with respect to the label inputs (bottom[1]),
-		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-		 * if propagate_down[1] is set.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
-		 *      @f$\ell_i@f$ in the overall Net loss
-		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
-		 *      other layer of the Net.)
-		 * @param propagate_down see Layer::Backward.
-		 *      propagate_down[1] must be false as we can't compute gradients with
-		 *      respect to the labels.
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the predictions @f$t@f$; Backward computes diff
-		 *      @f$ \frac{\partial E}{\partial t} @f$
-		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-		 *      the labels -- ignored as we can't compute their error gradients
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    explicit HingeLossLayer(const LayerParameter& param)
+        : LossLayer<Dtype>(param) {
+    }
+
+    virtual inline const char* type() const {
+      return "HingeLoss";
+    }
+
+  protected:
+    /// @copydoc HingeLossLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the hinge loss error gradient w.r.t. the predictions.
+     *
+     * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+     * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+     * if propagate_down[1] is set.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+     *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+     *      as @f$ \lambda @f$ is the coefficient of this layer's output
+     *      @f$\ell_i@f$ in the overall Net loss
+     *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+     *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+     *      (*Assuming that this top Blob is not used as a bottom (input) by any
+     *      other layer of the Net.)
+     * @param propagate_down see Layer::Backward.
+     *      propagate_down[1] must be false as we can't compute gradients with
+     *      respect to the labels.
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the predictions @f$t@f$; Backward computes diff
+     *      @f$ \frac{\partial E}{\partial t} @f$
+     *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+     *      the labels -- ignored as we can't compute their error gradients
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -454,74 +449,73 @@ class HingeLossLayer: public LossLayer<Dtype> {
  */
 template <typename Dtype>
 class InfogainLossLayer: public LossLayer<Dtype> {
-	public:
-		explicit InfogainLossLayer(const LayerParameter& param)
-		:
-				LossLayer<Dtype>(param), infogain_() {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		// InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should
-		// be the infogain matrix.  (Otherwise the infogain matrix is loaded from a
-		// file specified by LayerParameter.)
-		virtual inline int ExactNumBottomBlobs() const {
-			return -1;
-		}
-		virtual inline int MinBottomBlobs() const {
-			return 2;
-		}
-		virtual inline int MaxBottomBlobs() const {
-			return 3;
-		}
-
-		virtual inline const char* type() const {
-			return "InfogainLoss";
-		}
-
-	protected:
-		/// @copydoc InfogainLossLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the infogain loss error gradient w.r.t. the predictions.
-		 *
-		 * Gradients cannot be computed with respect to the label inputs (bottom[1]),
-		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-		 * if propagate_down[1] is set. (The same applies to the infogain matrix, if
-		 * provided as bottom[2] rather than in the layer_param.)
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient
-		 *      with respect to the outputs
-		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
-		 *      @f$\ell_i@f$ in the overall Net loss
-		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
-		 *      other layer of the Net.)
-		 * @param propagate_down see Layer::Backward.
-		 *      propagate_down[1] must be false as we can't compute gradients with
-		 *      respect to the labels (similarly for propagate_down[2] and the
-		 *      infogain matrix, if provided as bottom[2])
-		 * @param bottom input Blob vector (length 2-3)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the predictions @f$ \hat{p} @f$; Backward computes diff
-		 *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
-		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-		 *      the labels -- ignored as we can't compute their error gradients
-		 *   -# @f$ (1 \times 1 \times K \times K) @f$
-		 *      (\b optional) the information gain matrix -- ignored as its error
-		 *      gradient computation is not implemented.
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		Blob<Dtype> infogain_;
+  public:
+    explicit InfogainLossLayer(const LayerParameter& param)
+        : LossLayer<Dtype>(param), infogain_() {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should
+    // be the infogain matrix.  (Otherwise the infogain matrix is loaded from a
+    // file specified by LayerParameter.)
+    virtual inline int ExactNumBottomBlobs() const {
+      return -1;
+    }
+    virtual inline int MinBottomBlobs() const {
+      return 2;
+    }
+    virtual inline int MaxBottomBlobs() const {
+      return 3;
+    }
+
+    virtual inline const char* type() const {
+      return "InfogainLoss";
+    }
+
+  protected:
+    /// @copydoc InfogainLossLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the infogain loss error gradient w.r.t. the predictions.
+     *
+     * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+     * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+     * if propagate_down[1] is set. (The same applies to the infogain matrix, if
+     * provided as bottom[2] rather than in the layer_param.)
+     *
+     * @param top output Blob vector (length 1), providing the error gradient
+     *      with respect to the outputs
+     *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+     *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+     *      as @f$ \lambda @f$ is the coefficient of this layer's output
+     *      @f$\ell_i@f$ in the overall Net loss
+     *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+     *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+     *      (*Assuming that this top Blob is not used as a bottom (input) by any
+     *      other layer of the Net.)
+     * @param propagate_down see Layer::Backward.
+     *      propagate_down[1] must be false as we can't compute gradients with
+     *      respect to the labels (similarly for propagate_down[2] and the
+     *      infogain matrix, if provided as bottom[2])
+     * @param bottom input Blob vector (length 2-3)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the predictions @f$ \hat{p} @f$; Backward computes diff
+     *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
+     *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+     *      the labels -- ignored as we can't compute their error gradients
+     *   -# @f$ (1 \times 1 \times K \times K) @f$
+     *      (\b optional) the information gain matrix -- ignored as its error
+     *      gradient computation is not implemented.
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    Blob<Dtype> infogain_;
 };
 
 /**
@@ -555,53 +549,52 @@ class InfogainLossLayer: public LossLayer<Dtype> {
  */
 template <typename Dtype>
 class MultinomialLogisticLossLayer: public LossLayer<Dtype> {
-	public:
-		explicit MultinomialLogisticLossLayer(const LayerParameter& param)
-		:
-				LossLayer<Dtype>(param) {
-		}
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "MultinomialLogisticLoss";
-		}
-
-	protected:
-		/// @copydoc MultinomialLogisticLossLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the multinomial logistic loss error gradient w.r.t. the
-		 *        predictions.
-		 *
-		 * Gradients cannot be computed with respect to the label inputs (bottom[1]),
-		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-		 * if propagate_down[1] is set.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
-		 *      @f$\ell_i@f$ in the overall Net loss
-		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
-		 *      other layer of the Net.)
-		 * @param propagate_down see Layer::Backward.
-		 *      propagate_down[1] must be false as we can't compute gradients with
-		 *      respect to the labels.
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the predictions @f$ \hat{p} @f$; Backward computes diff
-		 *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
-		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-		 *      the labels -- ignored as we can't compute their error gradients
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    explicit MultinomialLogisticLossLayer(const LayerParameter& param)
+        : LossLayer<Dtype>(param) {
+    }
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "MultinomialLogisticLoss";
+    }
+
+  protected:
+    /// @copydoc MultinomialLogisticLossLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the multinomial logistic loss error gradient w.r.t. the
+     *        predictions.
+     *
+     * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+     * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+     * if propagate_down[1] is set.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+     *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+     *      as @f$ \lambda @f$ is the coefficient of this layer's output
+     *      @f$\ell_i@f$ in the overall Net loss
+     *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+     *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+     *      (*Assuming that this top Blob is not used as a bottom (input) by any
+     *      other layer of the Net.)
+     * @param propagate_down see Layer::Backward.
+     *      propagate_down[1] must be false as we can't compute gradients with
+     *      respect to the labels.
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the predictions @f$ \hat{p} @f$; Backward computes diff
+     *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
+     *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+     *      the labels -- ignored as we can't compute their error gradients
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -635,70 +628,68 @@ class MultinomialLogisticLossLayer: public LossLayer<Dtype> {
  */
 template <typename Dtype>
 class SigmoidCrossEntropyLossLayer: public LossLayer<Dtype> {
-	public:
-		explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param)
-		:
-				LossLayer<Dtype>(param),
-						sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
-						sigmoid_output_(new Blob<Dtype>()) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "SigmoidCrossEntropyLoss";
-		}
-
-	protected:
-		/// @copydoc SigmoidCrossEntropyLossLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the
-		 *        predictions.
-		 *
-		 * Gradients cannot be computed with respect to the target inputs (bottom[1]),
-		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-		 * if propagate_down[1] is set.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
-		 *      @f$\ell_i@f$ in the overall Net loss
-		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
-		 *      other layer of the Net.)
-		 * @param propagate_down see Layer::Backward.
-		 *      propagate_down[1] must be false as gradient computation with respect
-		 *      to the targets is not implemented.
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the predictions @f$x@f$; Backward computes diff
-		 *      @f$ \frac{\partial E}{\partial x} =
-		 *          \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n)
-		 *      @f$
-		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-		 *      the labels -- ignored as we can't compute their error gradients
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		/// The internal SigmoidLayer used to map predictions to probabilities.
-		shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
-		/// sigmoid_output stores the output of the SigmoidLayer.
-		shared_ptr<Blob<Dtype> > sigmoid_output_;
-		/// bottom vector holder to call the underlying SigmoidLayer::Forward
-		vector<Blob<Dtype>*> sigmoid_bottom_vec_;
-		/// top vector holder to call the underlying SigmoidLayer::Forward
-		vector<Blob<Dtype>*> sigmoid_top_vec_;
+  public:
+    explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param)
+        : LossLayer<Dtype>(param), sigmoid_layer_(
+            new SigmoidLayer<Dtype>(param)), sigmoid_output_(new Blob<Dtype>()) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "SigmoidCrossEntropyLoss";
+    }
+
+  protected:
+    /// @copydoc SigmoidCrossEntropyLossLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the
+     *        predictions.
+     *
+     * Gradients cannot be computed with respect to the target inputs (bottom[1]),
+     * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+     * if propagate_down[1] is set.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+     *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+     *      as @f$ \lambda @f$ is the coefficient of this layer's output
+     *      @f$\ell_i@f$ in the overall Net loss
+     *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+     *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+     *      (*Assuming that this top Blob is not used as a bottom (input) by any
+     *      other layer of the Net.)
+     * @param propagate_down see Layer::Backward.
+     *      propagate_down[1] must be false as gradient computation with respect
+     *      to the targets is not implemented.
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the predictions @f$x@f$; Backward computes diff
+     *      @f$ \frac{\partial E}{\partial x} =
+     *          \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n)
+     *      @f$
+     *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+     *      the labels -- ignored as we can't compute their error gradients
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    /// The internal SigmoidLayer used to map predictions to probabilities.
+    shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
+    /// sigmoid_output stores the output of the SigmoidLayer.
+    shared_ptr<Blob<Dtype> > sigmoid_output_;
+    /// bottom vector holder to call the underlying SigmoidLayer::Forward
+    vector<Blob<Dtype>*> sigmoid_bottom_vec_;
+    /// top vector holder to call the underlying SigmoidLayer::Forward
+    vector<Blob<Dtype>*> sigmoid_top_vec_;
 };
 
 // Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer.
@@ -735,100 +726,99 @@ template <typename Dtype> class SoftmaxLayer;
  */
 template <typename Dtype>
 class SoftmaxWithLossLayer: public LossLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides LossParameter loss_param, with options:
-		 *  - ignore_label (optional)
-		 *    Specify a label value that should be ignored when computing the loss.
-		 *  - normalize (optional, default true)
-		 *    If true, the loss is normalized by the number of (nonignored) labels
-		 *    present; otherwise the loss is simply summed over spatial locations.
-		 */
-		explicit SoftmaxWithLossLayer(const LayerParameter& param)
-		:
-				LossLayer<Dtype>(param) {
-		}
-		~SoftmaxWithLossLayer();
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "SoftmaxWithLoss";
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return -1;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-		virtual inline int MaxTopBlobs() const {
-			return 2;
-		}
-
-	protected:
-		/// @copydoc SoftmaxWithLossLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		/**
-		 * @brief Computes the softmax loss error gradient w.r.t. the predictions.
-		 *
-		 * Gradients cannot be computed with respect to the label inputs (bottom[1]),
-		 * so this method ignores bottom[1] and requires !propagate_down[1], crashing
-		 * if propagate_down[1] is set.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
-		 *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
-		 *      as @f$ \lambda @f$ is the coefficient of this layer's output
-		 *      @f$\ell_i@f$ in the overall Net loss
-		 *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
-		 *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
-		 *      (*Assuming that this top Blob is not used as a bottom (input) by any
-		 *      other layer of the Net.)
-		 * @param propagate_down see Layer::Backward.
-		 *      propagate_down[1] must be false as we can't compute gradients with
-		 *      respect to the labels.
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the predictions @f$ x @f$; Backward computes diff
-		 *      @f$ \frac{\partial E}{\partial x} @f$
-		 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
-		 *      the labels -- ignored as we can't compute their error gradients
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		void ocl_setup();
-
-		/// The internal SoftmaxLayer used to map predictions to a distribution.
-		shared_ptr<Layer<Dtype> > softmax_layer_;
-		/// prob stores the output probability predictions from the SoftmaxLayer.
-		Blob<Dtype> prob_;
-		/// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
-		vector<Blob<Dtype>*> softmax_bottom_vec_;
-		/// top vector holder used in call to the underlying SoftmaxLayer::Forward
-		vector<Blob<Dtype>*> softmax_top_vec_;
-		/// Whether to ignore instances with a certain label.
-		bool has_ignore_label_;
-		/// The label indicating that an instance should be ignored.
-		int ignore_label_;
-		/// Whether to normalize the loss by the total number of values present
-		/// (otherwise just by the batch size).
-		bool normalize_;
-
-		int softmax_axis_, outer_num_, inner_num_;
-
-	protected:
-		cl_kernel diff_kernel, scal_kernel, softmax_kernel;
-		cl_mem d_loss;
-		cl_kernel softmax_loss_fp_kernel;
-		cl_kernel softmax_loss_bp_kernel;
+  public:
+    /**
+     * @param param provides LossParameter loss_param, with options:
+     *  - ignore_label (optional)
+     *    Specify a label value that should be ignored when computing the loss.
+     *  - normalize (optional, default true)
+     *    If true, the loss is normalized by the number of (nonignored) labels
+     *    present; otherwise the loss is simply summed over spatial locations.
+     */
+    explicit SoftmaxWithLossLayer(const LayerParameter& param)
+        : LossLayer<Dtype>(param) {
+    }
+    ~SoftmaxWithLossLayer();
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "SoftmaxWithLoss";
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return -1;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+    virtual inline int MaxTopBlobs() const {
+      return 2;
+    }
+
+  protected:
+    /// @copydoc SoftmaxWithLossLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    /**
+     * @brief Computes the softmax loss error gradient w.r.t. the predictions.
+     *
+     * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+     * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+     * if propagate_down[1] is set.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+     *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+     *      as @f$ \lambda @f$ is the coefficient of this layer's output
+     *      @f$\ell_i@f$ in the overall Net loss
+     *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+     *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+     *      (*Assuming that this top Blob is not used as a bottom (input) by any
+     *      other layer of the Net.)
+     * @param propagate_down see Layer::Backward.
+     *      propagate_down[1] must be false as we can't compute gradients with
+     *      respect to the labels.
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the predictions @f$ x @f$; Backward computes diff
+     *      @f$ \frac{\partial E}{\partial x} @f$
+     *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+     *      the labels -- ignored as we can't compute their error gradients
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    void ocl_setup();
+
+    /// The internal SoftmaxLayer used to map predictions to a distribution.
+    shared_ptr<Layer<Dtype> > softmax_layer_;
+    /// prob stores the output probability predictions from the SoftmaxLayer.
+    Blob<Dtype> prob_;
+    /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
+    vector<Blob<Dtype>*> softmax_bottom_vec_;
+    /// top vector holder used in call to the underlying SoftmaxLayer::Forward
+    vector<Blob<Dtype>*> softmax_top_vec_;
+    /// Whether to ignore instances with a certain label.
+    bool has_ignore_label_;
+    /// The label indicating that an instance should be ignored.
+    int ignore_label_;
+    /// Whether to normalize the loss by the total number of values present
+    /// (otherwise just by the batch size).
+    bool normalize_;
+
+    int softmax_axis_, outer_num_, inner_num_;
+
+  protected:
+    cl_kernel diff_kernel, scal_kernel, softmax_kernel;
+    cl_mem d_loss;
+    cl_kernel softmax_loss_fp_kernel;
+    cl_kernel softmax_loss_bp_kernel;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 2fe273f5..bbd61b88 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -22,264 +22,264 @@ namespace caffe {
  */
 template <typename Dtype>
 class Net {
-	public:
-		explicit Net(const NetParameter& param);
-		explicit Net(const string& param_file, Phase phase);
-		virtual ~Net() {
-		}
+  public:
+    explicit Net(const NetParameter& param);
+    explicit Net(const string& param_file, Phase phase);
+    virtual ~Net() {
+    }
 
-		/// @brief Initialize a network with a NetParameter.
-		void Init(const NetParameter& param);
+    /// @brief Initialize a network with a NetParameter.
+    void Init(const NetParameter& param);
 
-		/**
-		 * @brief Run Forward with the input Blob%s already fed separately.
-		 *
-		 * You can get the input blobs using input_blobs().
-		 */
-		const vector<Blob<Dtype>*>& ForwardPrefilled(Dtype* loss = NULL);
+    /**
+     * @brief Run Forward with the input Blob%s already fed separately.
+     *
+     * You can get the input blobs using input_blobs().
+     */
+    const vector<Blob<Dtype>*>& ForwardPrefilled(Dtype* loss = NULL);
 
-		/**
-		 * The From and To variants of Forward and Backward operate on the
-		 * (topological) ordering by which the net is specified. For general DAG
-		 * networks, note that (1) computing from one layer to another might entail
-		 * extra computation on unrelated branches, and (2) computation starting in
-		 * the middle may be incorrect if all of the layers of a fan-in are not
-		 * included.
-		 */
-		Dtype ForwardFromTo(int start, int end);
-		Dtype ForwardFrom(int start);
-		Dtype ForwardTo(int end);
-		/// @brief Run forward using a set of bottom blobs, and return the result.
-		const vector<Blob<Dtype>*>& Forward(const vector<Blob<Dtype>*> & bottom,
-				Dtype* loss = NULL);
-		/**
-		 * @brief Run forward using a serialized BlobProtoVector and return the
-		 *        result as a serialized BlobProtoVector
-		 */
-		string Forward(const string& input_blob_protos, Dtype* loss = NULL);
+    /**
+     * The From and To variants of Forward and Backward operate on the
+     * (topological) ordering by which the net is specified. For general DAG
+     * networks, note that (1) computing from one layer to another might entail
+     * extra computation on unrelated branches, and (2) computation starting in
+     * the middle may be incorrect if all of the layers of a fan-in are not
+     * included.
+     */
+    Dtype ForwardFromTo(int start, int end);
+    Dtype ForwardFrom(int start);
+    Dtype ForwardTo(int end);
+    /// @brief Run forward using a set of bottom blobs, and return the result.
+    const vector<Blob<Dtype>*>& Forward(const vector<Blob<Dtype>*> & bottom,
+        Dtype* loss = NULL);
+    /**
+     * @brief Run forward using a serialized BlobProtoVector and return the
+     *        result as a serialized BlobProtoVector
+     */
+    string Forward(const string& input_blob_protos, Dtype* loss = NULL);
 
-		/**
-		 * The network backward should take no input and output, since it solely
-		 * computes the gradient w.r.t the parameters, and the data has already been
-		 * provided during the forward pass.
-		 */
-		void Backward();
-		void BackwardFromTo(int start, int end);
-		void BackwardFrom(int start);
-		void BackwardTo(int end);
+    /**
+     * The network backward should take no input and output, since it solely
+     * computes the gradient w.r.t the parameters, and the data has already been
+     * provided during the forward pass.
+     */
+    void Backward();
+    void BackwardFromTo(int start, int end);
+    void BackwardFrom(int start);
+    void BackwardTo(int end);
 
-		/**
-		 * @brief Reshape all layers from bottom to top.
-		 *
-		 * This is useful to propagate changes to layer sizes without running
-		 * a forward pass, e.g. to compute output feature size.
-		 */
-		void Reshape();
+    /**
+     * @brief Reshape all layers from bottom to top.
+     *
+     * This is useful to propagate changes to layer sizes without running
+     * a forward pass, e.g. to compute output feature size.
+     */
+    void Reshape();
 
-		Dtype ForwardBackward(const vector<Blob<Dtype>*> & bottom) {
-			Dtype loss;
-			Forward(bottom, &loss);
-			Backward();
-			return loss;
-		}
+    Dtype ForwardBackward(const vector<Blob<Dtype>*> & bottom) {
+      Dtype loss;
+      Forward(bottom, &loss);
+      Backward();
+      return loss;
+    }
 
-		/// @brief Updates the network weights based on the diff values computed.
-		void Update();
+    /// @brief Updates the network weights based on the diff values computed.
+    void Update();
 
-		/**
-		 * @brief For an already initialized net, implicitly copies (i.e., using no
-		 *        additional memory) the pre-trained layers from another Net.
-		 */
-		void ShareTrainedLayersWith(const Net* other);
-		// For an already initialized net, CopyTrainedLayersFrom() copies the already
-		// trained layers from another net parameter instance.
-		/**
-		 * @brief For an already initialized net, copies the pre-trained layers from
-		 *        another Net.
-		 */
-		void CopyTrainedLayersFrom(const NetParameter& param);
-		void CopyTrainedLayersFrom(const string trained_filename);
-		/// @brief Writes the net to a proto.
-		void ToProto(NetParameter* param, bool write_diff = false) const;
+    /**
+     * @brief For an already initialized net, implicitly copies (i.e., using no
+     *        additional memory) the pre-trained layers from another Net.
+     */
+    void ShareTrainedLayersWith(const Net* other);
+    // For an already initialized net, CopyTrainedLayersFrom() copies the already
+    // trained layers from another net parameter instance.
+    /**
+     * @brief For an already initialized net, copies the pre-trained layers from
+     *        another Net.
+     */
+    void CopyTrainedLayersFrom(const NetParameter& param);
+    void CopyTrainedLayersFrom(const string trained_filename);
+    /// @brief Writes the net to a proto.
+    void ToProto(NetParameter* param, bool write_diff = false) const;
 
-		/// @brief returns the network name.
-		inline const string& name() const {
-			return name_;
-		}
-		/// @brief returns the layer names
-		inline const vector<string>& layer_names() const {
-			return layer_names_;
-		}
-		/// @brief returns the blob names
-		inline const vector<string>& blob_names() const {
-			return blob_names_;
-		}
-		/// @brief returns the blobs
-		inline const vector<shared_ptr<Blob<Dtype> > >& blobs() const {
-			return blobs_;
-		}
-		/// @brief returns the layers
-		inline const vector<shared_ptr<Layer<Dtype> > >& layers() const {
-			return layers_;
-		}
-		/// @brief returns the phase: TRAIN or TEST
-		inline Phase phase() const {
-			return phase_;
-		}
-		/**
-		 * @brief returns the bottom vecs for each layer -- usually you won't
-		 *        need this unless you do per-layer checks such as gradients.
-		 */
-		inline const vector<vector<Blob<Dtype>*> >& bottom_vecs() const {
-			return bottom_vecs_;
-		}
-		/**
-		 * @brief returns the top vecs for each layer -- usually you won't
-		 *        need this unless you do per-layer checks such as gradients.
-		 */
-		inline const vector<vector<Blob<Dtype>*> >& top_vecs() const {
-			return top_vecs_;
-		}
-		inline const vector<vector<bool> >& bottom_need_backward() const {
-			return bottom_need_backward_;
-		}
-		inline const vector<Dtype>& blob_loss_weights() const {
-			return blob_loss_weights_;
-		}
-		inline const vector<bool>& layer_need_backward() const {
-			return layer_need_backward_;
-		}
-		/// @brief returns the parameters
-		inline const vector<shared_ptr<Blob<Dtype> > >& params() const {
-			return params_;
-		}
-		/// @brief returns the parameter learning rate multipliers
-		inline const vector<float>& params_lr() const {
-			return params_lr_;
-		}
-		inline const vector<float>& params_weight_decay() const {
-			return params_weight_decay_;
-		}
-		const map<string, int>& param_names_index() const {
-			return param_names_index_;
-		}
-		inline const vector<int>& param_owners() const {
-			return param_owners_;
-		}
-		/// @brief Input and output blob numbers
-		inline int num_inputs() const {
-			return net_input_blobs_.size();
-		}
-		inline int num_outputs() const {
-			return net_output_blobs_.size();
-		}
-		inline const vector<Blob<Dtype>*>& input_blobs() const {
-			return net_input_blobs_;
-		}
-		inline const vector<Blob<Dtype>*>& output_blobs() const {
-			return net_output_blobs_;
-		}
-		inline const vector<int>& input_blob_indices() const {
-			return net_input_blob_indices_;
-		}
-		inline const vector<int>& output_blob_indices() const {
-			return net_output_blob_indices_;
-		}
-		bool has_blob(const string& blob_name) const;
-		const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name) const;
-		bool has_layer(const string& layer_name) const;
-		const shared_ptr<Layer<Dtype> > layer_by_name(
-				const string& layer_name) const;
+    /// @brief returns the network name.
+    inline const string& name() const {
+      return name_;
+    }
+    /// @brief returns the layer names
+    inline const vector<string>& layer_names() const {
+      return layer_names_;
+    }
+    /// @brief returns the blob names
+    inline const vector<string>& blob_names() const {
+      return blob_names_;
+    }
+    /// @brief returns the blobs
+    inline const vector<shared_ptr<Blob<Dtype> > >& blobs() const {
+      return blobs_;
+    }
+    /// @brief returns the layers
+    inline const vector<shared_ptr<Layer<Dtype> > >& layers() const {
+      return layers_;
+    }
+    /// @brief returns the phase: TRAIN or TEST
+    inline Phase phase() const {
+      return phase_;
+    }
+    /**
+     * @brief returns the bottom vecs for each layer -- usually you won't
+     *        need this unless you do per-layer checks such as gradients.
+     */
+    inline const vector<vector<Blob<Dtype>*> >& bottom_vecs() const {
+      return bottom_vecs_;
+    }
+    /**
+     * @brief returns the top vecs for each layer -- usually you won't
+     *        need this unless you do per-layer checks such as gradients.
+     */
+    inline const vector<vector<Blob<Dtype>*> >& top_vecs() const {
+      return top_vecs_;
+    }
+    inline const vector<vector<bool> >& bottom_need_backward() const {
+      return bottom_need_backward_;
+    }
+    inline const vector<Dtype>& blob_loss_weights() const {
+      return blob_loss_weights_;
+    }
+    inline const vector<bool>& layer_need_backward() const {
+      return layer_need_backward_;
+    }
+    /// @brief returns the parameters
+    inline const vector<shared_ptr<Blob<Dtype> > >& params() const {
+      return params_;
+    }
+    /// @brief returns the parameter learning rate multipliers
+    inline const vector<float>& params_lr() const {
+      return params_lr_;
+    }
+    inline const vector<float>& params_weight_decay() const {
+      return params_weight_decay_;
+    }
+    const map<string, int>& param_names_index() const {
+      return param_names_index_;
+    }
+    inline const vector<int>& param_owners() const {
+      return param_owners_;
+    }
+    /// @brief Input and output blob numbers
+    inline int num_inputs() const {
+      return net_input_blobs_.size();
+    }
+    inline int num_outputs() const {
+      return net_output_blobs_.size();
+    }
+    inline const vector<Blob<Dtype>*>& input_blobs() const {
+      return net_input_blobs_;
+    }
+    inline const vector<Blob<Dtype>*>& output_blobs() const {
+      return net_output_blobs_;
+    }
+    inline const vector<int>& input_blob_indices() const {
+      return net_input_blob_indices_;
+    }
+    inline const vector<int>& output_blob_indices() const {
+      return net_output_blob_indices_;
+    }
+    bool has_blob(const string& blob_name) const;
+    const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name) const;
+    bool has_layer(const string& layer_name) const;
+    const shared_ptr<Layer<Dtype> > layer_by_name(
+        const string& layer_name) const;
 
-		void set_debug_info(const bool value) {
-			debug_info_ = value;
-		}
+    void set_debug_info(const bool value) {
+      debug_info_ = value;
+    }
 
-		// Helpers for Init.
-		/**
-		 * @brief Remove layers that the user specified should be excluded given the current
-		 *        phase, level, and stage.
-		 */
-		static void FilterNet(const NetParameter& param,
-				NetParameter* param_filtered);
-		/// @brief return whether NetState state meets NetStateRule rule
-		static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
-				const string& layer_name);
+    // Helpers for Init.
+    /**
+     * @brief Remove layers that the user specified should be excluded given the current
+     *        phase, level, and stage.
+     */
+    static void FilterNet(const NetParameter& param,
+        NetParameter* param_filtered);
+    /// @brief return whether NetState state meets NetStateRule rule
+    static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
+        const string& layer_name);
 
-	protected:
-		// Helpers for Init.
-		/// @brief Append a new input or top blob to the net.
-		void AppendTop(const NetParameter& param, const int layer_id,
-				const int top_id, set<string>* available_blobs,
-				map<string, int>* blob_name_to_idx);
-		/// @brief Append a new bottom blob to the net.
-		int AppendBottom(const NetParameter& param, const int layer_id,
-				const int bottom_id, set<string>* available_blobs,
-				map<string, int>* blob_name_to_idx);
-		/// @brief Append a new parameter blob to the net.
-		void AppendParam(const NetParameter& param, const int layer_id,
-				const int param_id);
+  protected:
+    // Helpers for Init.
+    /// @brief Append a new input or top blob to the net.
+    void AppendTop(const NetParameter& param, const int layer_id,
+        const int top_id, set<string>* available_blobs,
+        map<string, int>* blob_name_to_idx);
+    /// @brief Append a new bottom blob to the net.
+    int AppendBottom(const NetParameter& param, const int layer_id,
+        const int bottom_id, set<string>* available_blobs,
+        map<string, int>* blob_name_to_idx);
+    /// @brief Append a new parameter blob to the net.
+    void AppendParam(const NetParameter& param, const int layer_id,
+        const int param_id);
 
-		/// @brief Helper for displaying debug info in Forward about input Blobs.
-		void InputDebugInfo(const int layer_id);
-		/// @brief Helper for displaying debug info in Forward.
-		void ForwardDebugInfo(const int layer_id);
-		/// @brief Helper for displaying debug info in Backward.
-		void BackwardDebugInfo(const int layer_id);
-		/// @brief Helper for displaying debug info in Update.
-		void UpdateDebugInfo(const int param_id);
+    /// @brief Helper for displaying debug info in Forward about input Blobs.
+    void InputDebugInfo(const int layer_id);
+    /// @brief Helper for displaying debug info in Forward.
+    void ForwardDebugInfo(const int layer_id);
+    /// @brief Helper for displaying debug info in Backward.
+    void BackwardDebugInfo(const int layer_id);
+    /// @brief Helper for displaying debug info in Update.
+    void UpdateDebugInfo(const int param_id);
 
-		/// @brief Get misc parameters, e.g. the LR multiplier and weight decay.
-		void GetLearningRateAndWeightDecay();
+    /// @brief Get misc parameters, e.g. the LR multiplier and weight decay.
+    void GetLearningRateAndWeightDecay();
 
-		/// @brief The network name
-		string name_;
-		/// @brief The phase: TRAIN or TEST
-		Phase phase_;
-		/// @brief Individual layers in the net
-		vector<shared_ptr<Layer<Dtype> > > layers_;
-		vector<string> layer_names_;
-		map<string, int> layer_names_index_;
-		vector<bool> layer_need_backward_;
-		/// @brief the blobs storing intermediate results between the layer.
-		vector<shared_ptr<Blob<Dtype> > > blobs_;
-		vector<string> blob_names_;
-		map<string, int> blob_names_index_;
-		vector<bool> blob_need_backward_;
-		/// bottom_vecs stores the vectors containing the input for each layer.
-		/// They don't actually host the blobs (blobs_ does), so we simply store
-		/// pointers.
-		vector<vector<Blob<Dtype>*> > bottom_vecs_;
-		vector<vector<int> > bottom_id_vecs_;
-		vector<vector<bool> > bottom_need_backward_;
-		/// top_vecs stores the vectors containing the output for each layer
-		vector<vector<Blob<Dtype>*> > top_vecs_;
-		vector<vector<int> > top_id_vecs_;
-		/// Vector of weight in the loss (or objective) function of each net blob,
-		/// indexed by blob_id.
-		vector<Dtype> blob_loss_weights_;
-		vector<vector<int> > param_id_vecs_;
-		vector<int> param_owners_;
-		vector<string> param_display_names_;
-		vector<pair<int, int> > param_layer_indices_;
-		map<string, int> param_names_index_;
-		/// blob indices for the input and the output of the net
-		vector<int> net_input_blob_indices_;
-		vector<int> net_output_blob_indices_;
-		vector<Blob<Dtype>*> net_input_blobs_;
-		vector<Blob<Dtype>*> net_output_blobs_;
-		/// The parameters in the network.
-		vector<shared_ptr<Blob<Dtype> > > params_;
-		/// the learning rate multipliers
-		vector<float> params_lr_;
-		/// the weight decay multipliers
-		vector<float> params_weight_decay_;
-		/// The bytes of memory used by this net
-		size_t memory_used_;
-		/// Whether to compute and display debug info for the net.
-		bool debug_info_;
+    /// @brief The network name
+    string name_;
+    /// @brief The phase: TRAIN or TEST
+    Phase phase_;
+    /// @brief Individual layers in the net
+    vector<shared_ptr<Layer<Dtype> > > layers_;
+    vector<string> layer_names_;
+    map<string, int> layer_names_index_;
+    vector<bool> layer_need_backward_;
+    /// @brief the blobs storing intermediate results between the layer.
+    vector<shared_ptr<Blob<Dtype> > > blobs_;
+    vector<string> blob_names_;
+    map<string, int> blob_names_index_;
+    vector<bool> blob_need_backward_;
+    /// bottom_vecs stores the vectors containing the input for each layer.
+    /// They don't actually host the blobs (blobs_ does), so we simply store
+    /// pointers.
+    vector<vector<Blob<Dtype>*> > bottom_vecs_;
+    vector<vector<int> > bottom_id_vecs_;
+    vector<vector<bool> > bottom_need_backward_;
+    /// top_vecs stores the vectors containing the output for each layer
+    vector<vector<Blob<Dtype>*> > top_vecs_;
+    vector<vector<int> > top_id_vecs_;
+    /// Vector of weight in the loss (or objective) function of each net blob,
+    /// indexed by blob_id.
+    vector<Dtype> blob_loss_weights_;
+    vector<vector<int> > param_id_vecs_;
+    vector<int> param_owners_;
+    vector<string> param_display_names_;
+    vector<pair<int, int> > param_layer_indices_;
+    map<string, int> param_names_index_;
+    /// blob indices for the input and the output of the net
+    vector<int> net_input_blob_indices_;
+    vector<int> net_output_blob_indices_;
+    vector<Blob<Dtype>*> net_input_blobs_;
+    vector<Blob<Dtype>*> net_output_blobs_;
+    /// The parameters in the network.
+    vector<shared_ptr<Blob<Dtype> > > params_;
+    /// the learning rate multipliers
+    vector<float> params_lr_;
+    /// the weight decay multipliers
+    vector<float> params_weight_decay_;
+    /// The bytes of memory used by this net
+    size_t memory_used_;
+    /// Whether to compute and display debug info for the net.
+    bool debug_info_;
 
-		DISABLE_COPY_AND_ASSIGN (Net);
+    DISABLE_COPY_AND_ASSIGN (Net);
 };
 
 }  // namespace caffe
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index 89b6c481..54267f12 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -24,20 +24,19 @@ namespace caffe {
  */
 template <typename Dtype>
 class NeuronLayer: public Layer<Dtype> {
-	public:
-		explicit NeuronLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
+  public:
+    explicit NeuronLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
 };
 
 /**
@@ -52,52 +51,51 @@ class NeuronLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class AbsValLayer: public NeuronLayer<Dtype> {
-	public:
-		explicit AbsValLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "AbsVal";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		/// @copydoc AbsValLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the absolute value inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial x} =
-		 *            \mathrm{sign}(x) \frac{\partial E}{\partial y}
-		 *      @f$ if propagate_down[0]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    explicit AbsValLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "AbsVal";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    /// @copydoc AbsValLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the absolute value inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial x} =
+     *            \mathrm{sign}(x) \frac{\partial E}{\partial y}
+     *      @f$ if propagate_down[0]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -119,43 +117,42 @@ class AbsValLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class BNLLLayer: public NeuronLayer<Dtype> {
-	public:
-		explicit BNLLLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-
-		virtual inline const char* type() const {
-			return "BNLL";
-		}
-
-	protected:
-		/// @copydoc BNLLLayer
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the BNLL inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 2)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial x}
-		 *      @f$ if propagate_down[0]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    explicit BNLLLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+
+    virtual inline const char* type() const {
+      return "BNLL";
+    }
+
+  protected:
+    /// @copydoc BNLLLayer
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the BNLL inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 2)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial x}
+     *      @f$ if propagate_down[0]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -171,65 +168,64 @@ class BNLLLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class DropoutLayer: public NeuronLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides DropoutParameter dropout_param,
-		 *     with DropoutLayer options:
-		 *   - dropout_ratio (\b optional, default 0.5).
-		 *     Sets the probability @f$ p @f$ that any given unit is dropped.
-		 */
-		explicit DropoutLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Dropout";
-		}
-		virtual ~DropoutLayer();
-		void ocl_setup(int bottom_count);
-		cl_mem MaskMem;
-		cl_kernel ocl_Kernel_Fwd;
-		cl_kernel ocl_Kernel_Bwd;
-		cl_kernel rng_kernel;
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the computed outputs. At training time, we have @f$
-		 *      y_{\mbox{train}} = \left\{
-		 *         \begin{array}{ll}
-		 *            \frac{x}{1 - p} & \mbox{if } u > p \\
+  public:
+    /**
+     * @param param provides DropoutParameter dropout_param,
+     *     with DropoutLayer options:
+     *   - dropout_ratio (\b optional, default 0.5).
+     *     Sets the probability @f$ p @f$ that any given unit is dropped.
+     */
+    explicit DropoutLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Dropout";
+    }
+    virtual ~DropoutLayer();
+    void ocl_setup(int bottom_count);
+    cl_mem MaskMem;
+    cl_kernel ocl_Kernel_Fwd;
+    cl_kernel ocl_Kernel_Bwd;
+    cl_kernel rng_kernel;
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the computed outputs. At training time, we have @f$
+     *      y_{\mbox{train}} = \left\{
+     *         \begin{array}{ll}
+     *            \frac{x}{1 - p} & \mbox{if } u > p \\
    *            0 & \mbox{otherwise}
-		 *         \end{array} \right.
-		 *      @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each
-		 *      input at each iteration. At test time, we simply have
-		 *      @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$.
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		/// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$
-		Blob<unsigned int> rand_vec_;
-		/// the probability @f$ p @f$ of dropping any input
-		Dtype threshold_;
-		/// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$
-		Dtype scale_;
-		unsigned int uint_thres_;
+     *         \end{array} \right.
+     *      @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each
+     *      input at each iteration. At test time, we simply have
+     *      @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$.
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$
+    Blob<unsigned int> rand_vec_;
+    /// the probability @f$ p @f$ of dropping any input
+    Dtype threshold_;
+    /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$
+    Dtype scale_;
+    unsigned int uint_thres_;
 };
 
 /**
@@ -239,65 +235,64 @@ class DropoutLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class ExpLayer: public NeuronLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides ExpParameter exp_param,
-		 *     with ExpLayer options:
-		 *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
-		 *   - shift (\b optional, default 0) the shift @f$ \beta @f$
-		 *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
-		 *         the base @f$ \gamma @f$
-		 */
-		explicit ExpLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Exp";
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the computed outputs @f$
-		 *        y = \gamma ^ {\alpha x + \beta}
-		 *      @f$
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the exp inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial x} =
-		 *            \frac{\partial E}{\partial y} y \alpha \log_e(gamma)
-		 *      @f$ if propagate_down[0]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		Dtype inner_scale_, outer_scale_;
+  public:
+    /**
+     * @param param provides ExpParameter exp_param,
+     *     with ExpLayer options:
+     *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+     *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+     *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
+     *         the base @f$ \gamma @f$
+     */
+    explicit ExpLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Exp";
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the computed outputs @f$
+     *        y = \gamma ^ {\alpha x + \beta}
+     *      @f$
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the exp inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial x} =
+     *            \frac{\partial E}{\partial y} y \alpha \log_e(gamma)
+     *      @f$ if propagate_down[0]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    Dtype inner_scale_, outer_scale_;
 };
 
 /**
@@ -307,67 +302,66 @@ class ExpLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class LogLayer: public NeuronLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides LogParameter log_param,
-		 *     with LogLayer options:
-		 *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
-		 *   - shift (\b optional, default 0) the shift @f$ \beta @f$
-		 *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
-		 *         the base @f$ \gamma @f$
-		 */
-		explicit LogLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Log";
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the computed outputs @f$
-		 *        y = log_{\gamma}(\alpha x + \beta)
-		 *      @f$
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the exp inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial x} =
-		 *            \frac{\partial E}{\partial y} y \alpha \log_e(gamma)
-		 *      @f$ if propagate_down[0]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		Dtype base_scale_;
-		Dtype input_scale_, input_shift_;
-		Dtype backward_num_scale_;
+  public:
+    /**
+     * @param param provides LogParameter log_param,
+     *     with LogLayer options:
+     *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+     *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+     *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
+     *         the base @f$ \gamma @f$
+     */
+    explicit LogLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Log";
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the computed outputs @f$
+     *        y = log_{\gamma}(\alpha x + \beta)
+     *      @f$
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the exp inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial x} =
+     *            \frac{\partial E}{\partial y} y \alpha \log_e(gamma)
+     *      @f$ if propagate_down[0]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    Dtype base_scale_;
+    Dtype input_scale_, input_shift_;
+    Dtype backward_num_scale_;
 };
 
 /**
@@ -377,74 +371,73 @@ class LogLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class PowerLayer: public NeuronLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides PowerParameter power_param,
-		 *     with PowerLayer options:
-		 *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
-		 *   - shift (\b optional, default 0) the shift @f$ \beta @f$
-		 *   - power (\b optional, default 1) the power @f$ \gamma @f$
-		 */
-		explicit PowerLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Power";
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the computed outputs @f$
-		 *        y = (\alpha x + \beta) ^ \gamma
-		 *      @f$
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the power inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial x} =
-		 *            \frac{\partial E}{\partial y}
-		 *            \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} =
-		 *            \frac{\partial E}{\partial y}
-		 *            \frac{\alpha \gamma y}{\alpha x + \beta}
-		 *      @f$ if propagate_down[0]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		/// @brief @f$ \gamma @f$ from layer_param_.power_param()
-		Dtype power_;
-		/// @brief @f$ \alpha @f$ from layer_param_.power_param()
-		Dtype scale_;
-		/// @brief @f$ \beta @f$ from layer_param_.power_param()
-		Dtype shift_;
-		/// @brief Result of @f$ \alpha \gamma @f$
-		Dtype diff_scale_;
+  public:
+    /**
+     * @param param provides PowerParameter power_param,
+     *     with PowerLayer options:
+     *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+     *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+     *   - power (\b optional, default 1) the power @f$ \gamma @f$
+     */
+    explicit PowerLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Power";
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the computed outputs @f$
+     *        y = (\alpha x + \beta) ^ \gamma
+     *      @f$
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the power inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial x} =
+     *            \frac{\partial E}{\partial y}
+     *            \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} =
+     *            \frac{\partial E}{\partial y}
+     *            \frac{\alpha \gamma y}{\alpha x + \beta}
+     *      @f$ if propagate_down[0]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    /// @brief @f$ \gamma @f$ from layer_param_.power_param()
+    Dtype power_;
+    /// @brief @f$ \alpha @f$ from layer_param_.power_param()
+    Dtype scale_;
+    /// @brief @f$ \beta @f$ from layer_param_.power_param()
+    Dtype shift_;
+    /// @brief Result of @f$ \alpha \gamma @f$
+    Dtype diff_scale_;
 };
 
 /**
@@ -453,70 +446,69 @@ class PowerLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class ReLULayer: public NeuronLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides ReLUParameter relu_param,
-		 *     with ReLULayer options:
-		 *   - negative_slope (\b optional, default 0).
-		 *     the value @f$ \nu @f$ by which negative values are multiplied.
-		 */
-		explicit ReLULayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-		virtual inline const char* type() const {
-			return "ReLU";
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the computed outputs @f$
-		 *        y = \max(0, x)
-		 *      @f$ by default.  If a non-zero negative_slope @f$ \nu @f$ is provided,
-		 *      the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$.
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the ReLU inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial x} = \left\{
-		 *        \begin{array}{lr}
-		 *            0 & \mathrm{if} \; x \le 0 \\
+  public:
+    /**
+     * @param param provides ReLUParameter relu_param,
+     *     with ReLULayer options:
+     *   - negative_slope (\b optional, default 0).
+     *     the value @f$ \nu @f$ by which negative values are multiplied.
+     */
+    explicit ReLULayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+    virtual inline const char* type() const {
+      return "ReLU";
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the computed outputs @f$
+     *        y = \max(0, x)
+     *      @f$ by default.  If a non-zero negative_slope @f$ \nu @f$ is provided,
+     *      the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$.
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the ReLU inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial x} = \left\{
+     *        \begin{array}{lr}
+     *            0 & \mathrm{if} \; x \le 0 \\
    *            \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0
-		 *        \end{array} \right.
-		 *      @f$ if propagate_down[0], by default.
-		 *      If a non-zero negative_slope @f$ \nu @f$ is provided,
-		 *      the computed gradients are @f$
-		 *        \frac{\partial E}{\partial x} = \left\{
-		 *        \begin{array}{lr}
-		 *            \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\
+     *        \end{array} \right.
+     *      @f$ if propagate_down[0], by default.
+     *      If a non-zero negative_slope @f$ \nu @f$ is provided,
+     *      the computed gradients are @f$
+     *        \frac{\partial E}{\partial x} = \left\{
+     *        \begin{array}{lr}
+     *            \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\
    *            \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0
-		 *        \end{array} \right.
-		 *      @f$.
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+     *        \end{array} \right.
+     *      @f$.
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -525,25 +517,25 @@ class ReLULayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNReLULayer : public ReLULayer<Dtype> {
-	public:
-	explicit CuDNNReLULayer(const LayerParameter& param)
-	: ReLULayer<Dtype>(param), handles_setup_(false) {}
-	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual ~CuDNNReLULayer();
-
-	protected:
-	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-	bool handles_setup_;
-	cudnnHandle_t handle_;
-	cudnnTensorDescriptor_t bottom_desc_;
-	cudnnTensorDescriptor_t top_desc_;
+  public:
+  explicit CuDNNReLULayer(const LayerParameter& param)
+  : ReLULayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNReLULayer();
+
+  protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
 };
 #endif
 
@@ -557,53 +549,52 @@ class CuDNNReLULayer : public ReLULayer<Dtype> {
  */
 template <typename Dtype>
 class SigmoidLayer: public NeuronLayer<Dtype> {
-	public:
-		explicit SigmoidLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-
-		virtual inline const char* type() const {
-			return "Sigmoid";
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the computed outputs @f$
-		 *        y = (1 + \exp(-x))^{-1}
-		 *      @f$
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the sigmoid inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial x}
-		 *            = \frac{\partial E}{\partial y} y (1 - y)
-		 *      @f$ if propagate_down[0]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    explicit SigmoidLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+
+    virtual inline const char* type() const {
+      return "Sigmoid";
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the computed outputs @f$
+     *        y = (1 + \exp(-x))^{-1}
+     *      @f$
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the sigmoid inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial x}
+     *            = \frac{\partial E}{\partial y} y (1 - y)
+     *      @f$ if propagate_down[0]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -612,25 +603,25 @@ class SigmoidLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
-	public:
-	explicit CuDNNSigmoidLayer(const LayerParameter& param)
-	: SigmoidLayer<Dtype>(param), handles_setup_(false) {}
-	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual ~CuDNNSigmoidLayer();
-
-	protected:
-	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-	bool handles_setup_;
-	cudnnHandle_t handle_;
-	cudnnTensorDescriptor_t bottom_desc_;
-	cudnnTensorDescriptor_t top_desc_;
+  public:
+  explicit CuDNNSigmoidLayer(const LayerParameter& param)
+  : SigmoidLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNSigmoidLayer();
+
+  protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
 };
 #endif
 
@@ -644,55 +635,54 @@ class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
  */
 template <typename Dtype>
 class TanHLayer: public NeuronLayer<Dtype> {
-	public:
-		explicit TanHLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-
-		virtual inline const char* type() const {
-			return "TanH";
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the computed outputs @f$
-		 *        y = \frac{\exp(2x) - 1}{\exp(2x) + 1}
-		 *      @f$
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the sigmoid inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$; Backward fills their diff with
-		 *      gradients @f$
-		 *        \frac{\partial E}{\partial x}
-		 *            = \frac{\partial E}{\partial y}
-		 *              \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right)
-		 *            = \frac{\partial E}{\partial y} (1 - y^2)
-		 *      @f$ if propagate_down[0]
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    explicit TanHLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+
+    virtual inline const char* type() const {
+      return "TanH";
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the computed outputs @f$
+     *        y = \frac{\exp(2x) - 1}{\exp(2x) + 1}
+     *      @f$
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the sigmoid inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$; Backward fills their diff with
+     *      gradients @f$
+     *        \frac{\partial E}{\partial x}
+     *            = \frac{\partial E}{\partial y}
+     *              \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right)
+     *            = \frac{\partial E}{\partial y} (1 - y^2)
+     *      @f$ if propagate_down[0]
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 #ifdef USE_CUDNN
@@ -701,25 +691,25 @@ class TanHLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNTanHLayer : public TanHLayer<Dtype> {
-	public:
-	explicit CuDNNTanHLayer(const LayerParameter& param)
-	: TanHLayer<Dtype>(param), handles_setup_(false) {}
-	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual ~CuDNNTanHLayer();
-
-	protected:
-	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-	bool handles_setup_;
-	cudnnHandle_t handle_;
-	cudnnTensorDescriptor_t bottom_desc_;
-	cudnnTensorDescriptor_t top_desc_;
+  public:
+  explicit CuDNNTanHLayer(const LayerParameter& param)
+  : TanHLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNTanHLayer();
+
+  protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
 };
 #endif
 
@@ -729,51 +719,50 @@ class CuDNNTanHLayer : public TanHLayer<Dtype> {
  */
 template <typename Dtype>
 class ThresholdLayer: public NeuronLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides ThresholdParameter threshold_param,
-		 *     with ThresholdLayer options:
-		 *   - threshold (\b optional, default 0).
-		 *     the threshold value @f$ t @f$ to which the input values are compared.
-		 */
-		explicit ThresholdLayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Threshold";
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times H \times W) @f$
-		 *      the computed outputs @f$
-		 *       y = \left\{
-		 *       \begin{array}{lr}
-		 *         0 & \mathrm{if} \; x \le t \\
+  public:
+    /**
+     * @param param provides ThresholdParameter threshold_param,
+     *     with ThresholdLayer options:
+     *   - threshold (\b optional, default 0).
+     *     the threshold value @f$ t @f$ to which the input values are compared.
+     */
+    explicit ThresholdLayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Threshold";
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times H \times W) @f$
+     *      the computed outputs @f$
+     *       y = \left\{
+     *       \begin{array}{lr}
+     *         0 & \mathrm{if} \; x \le t \\
    *         1 & \mathrm{if} \; x > t
-		 *       \end{array} \right.
-		 *      @f$
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		/// @brief Not implemented (non-differentiable function)
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-			NOT_IMPLEMENTED;
-		}
-
-		Dtype threshold_;
+     *       \end{array} \right.
+     *      @f$
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    /// @brief Not implemented (non-differentiable function)
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+      NOT_IMPLEMENTED;
+    }
+
+    Dtype threshold_;
 };
 
 /**
@@ -786,83 +775,82 @@ class ThresholdLayer: public NeuronLayer<Dtype> {
  */
 template <typename Dtype>
 class PReLULayer: public NeuronLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides PReLUParameter prelu_param,
-		 *     with PReLULayer options:
-		 *   - filler (\b optional, FillerParameter,
-		 *     default {'type': constant 'value':0.25}).
-		 *   - channel_shared (\b optional, default false).
-		 *     negative slopes are shared across channels.
-		 */
-		explicit PReLULayer(const LayerParameter& param)
-		:
-				NeuronLayer<Dtype>(param) {
-		}
-
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "PReLU";
-		}
-
-	protected:
-		/**
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times ...) @f$
-		 *      the inputs @f$ x @f$
-		 * @param top output Blob vector (length 1)
-		 *   -# @f$ (N \times C \times ...) @f$
-		 *      the computed outputs for each channel @f$i@f$ @f$
-		 *        y_i = \max(0, x_i) + a_i \min(0, x_i)
-		 *      @f$.
-		 */
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		/**
-		 * @brief Computes the error gradient w.r.t. the PReLU inputs.
-		 *
-		 * @param top output Blob vector (length 1), providing the error gradient with
-		 *      respect to the outputs
-		 *   -# @f$ (N \times C \times ...) @f$
-		 *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
-		 *      with respect to computed outputs @f$ y @f$
-		 * @param propagate_down see Layer::Backward.
-		 * @param bottom input Blob vector (length 1)
-		 *   -# @f$ (N \times C \times ...) @f$
-		 *      the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their
-		 *      diff with gradients @f$
-		 *        \frac{\partial E}{\partial x_i} = \left\{
-		 *        \begin{array}{lr}
-		 *            a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
+  public:
+    /**
+     * @param param provides PReLUParameter prelu_param,
+     *     with PReLULayer options:
+     *   - filler (\b optional, FillerParameter,
+     *     default {'type': constant 'value':0.25}).
+     *   - channel_shared (\b optional, default false).
+     *     negative slopes are shared across channels.
+     */
+    explicit PReLULayer(const LayerParameter& param)
+        : NeuronLayer<Dtype>(param) {
+    }
+
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "PReLU";
+    }
+
+  protected:
+    /**
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times ...) @f$
+     *      the inputs @f$ x @f$
+     * @param top output Blob vector (length 1)
+     *   -# @f$ (N \times C \times ...) @f$
+     *      the computed outputs for each channel @f$i@f$ @f$
+     *        y_i = \max(0, x_i) + a_i \min(0, x_i)
+     *      @f$.
+     */
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    /**
+     * @brief Computes the error gradient w.r.t. the PReLU inputs.
+     *
+     * @param top output Blob vector (length 1), providing the error gradient with
+     *      respect to the outputs
+     *   -# @f$ (N \times C \times ...) @f$
+     *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+     *      with respect to computed outputs @f$ y @f$
+     * @param propagate_down see Layer::Backward.
+     * @param bottom input Blob vector (length 1)
+     *   -# @f$ (N \times C \times ...) @f$
+     *      the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their
+     *      diff with gradients @f$
+     *        \frac{\partial E}{\partial x_i} = \left\{
+     *        \begin{array}{lr}
+     *            a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
    *            \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i > 0
-		 *        \end{array} \right.
-		 *      @f$.
-		 *      If param_propagate_down_[0] is true, it fills the diff with gradients
-		 *      @f$
-		 *        \frac{\partial E}{\partial a_i} = \left\{
-		 *        \begin{array}{lr}
-		 *            \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
+     *        \end{array} \right.
+     *      @f$.
+     *      If param_propagate_down_[0] is true, it fills the diff with gradients
+     *      @f$
+     *        \frac{\partial E}{\partial a_i} = \left\{
+     *        \begin{array}{lr}
+     *            \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
    *            0 & \mathrm{if} \; x_i > 0
-		 *        \end{array} \right.
-		 *      @f$.
-		 */
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		bool channel_shared_;
-		Blob<Dtype> multiplier_; // dot multiplier for backward computation of params
-		Blob<Dtype> backward_buff_;  // temporary buffer for backward computation
-		Blob<Dtype> bottom_memory_;  // memory for in-place computation
+     *        \end{array} \right.
+     *      @f$.
+     */
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    bool channel_shared_;
+    Blob<Dtype> multiplier_; // dot multiplier for backward computation of params
+    Blob<Dtype> backward_buff_;  // temporary buffer for backward computation
+    Blob<Dtype> bottom_memory_;  // memory for in-place computation
 };
 
 }  // namespace caffe
diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp
index 41e2c21a..16d1f7fc 100644
--- a/include/caffe/python_layer.hpp
+++ b/include/caffe/python_layer.hpp
@@ -12,59 +12,58 @@ namespace caffe {
 
 template <typename Dtype>
 class PythonLayer: public Layer<Dtype> {
-	public:
-		PythonLayer(PyObject* self, const LayerParameter& param)
-		:
-				Layer<Dtype>(param), self_(bp::handle<>(bp::borrowed(self))) {
-		}
+  public:
+    PythonLayer(PyObject* self, const LayerParameter& param)
+        : Layer<Dtype>(param), self_(bp::handle<>(bp::borrowed(self))) {
+    }
 
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-			try {
-				self_.attr("setup")(bottom, top);
-			} catch (bp::error_already_set) {
-				PyErr_Print();
-				throw;
-			}
-		}
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+      try {
+        self_.attr("setup")(bottom, top);
+      } catch (bp::error_already_set) {
+        PyErr_Print();
+        throw;
+      }
+    }
 
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-			try {
-				self_.attr("reshape")(bottom, top);
-			} catch (bp::error_already_set) {
-				PyErr_Print();
-				throw;
-			}
-		}
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+      try {
+        self_.attr("reshape")(bottom, top);
+      } catch (bp::error_already_set) {
+        PyErr_Print();
+        throw;
+      }
+    }
 
-		virtual inline const char* type() const {
-			return "Python";
-		}
+    virtual inline const char* type() const {
+      return "Python";
+    }
 
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top) {
-			try {
-				self_.attr("forward")(bottom, top);
-			} catch (bp::error_already_set) {
-				PyErr_Print();
-				throw;
-			}
-		}
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down,
-				const vector<Blob<Dtype>*>& bottom) {
-			try {
-				self_.attr("backward")(top, propagate_down, bottom);
-			} catch (bp::error_already_set) {
-				PyErr_Print();
-				throw;
-			}
-		}
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top) {
+      try {
+        self_.attr("forward")(bottom, top);
+      } catch (bp::error_already_set) {
+        PyErr_Print();
+        throw;
+      }
+    }
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down,
+        const vector<Blob<Dtype>*>& bottom) {
+      try {
+        self_.attr("backward")(top, propagate_down, bottom);
+      } catch (bp::error_already_set) {
+        PyErr_Print();
+        throw;
+      }
+    }
 
-	private:
-		bp::object self_;
+  private:
+    bp::object self_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 60dbc5b0..2bddb77f 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -16,62 +16,62 @@ namespace caffe {
  */
 template <typename Dtype>
 class Solver {
-	public:
-		explicit Solver(const SolverParameter& param);
-		explicit Solver(const string& param_file);
-		void Init(const SolverParameter& param);
-		void InitTrainNet();
-		void InitTestNets();
-		// The main entry of the solver function. In default, iter will be zero. Pass
-		// in a non-zero iter number to resume training for a pre-trained net.
-		virtual void Solve(const char* resume_file = NULL);
-		inline void Solve(const string resume_file) {
-			Solve(resume_file.c_str());
-		}
-		void Step(int iters);
-		// The Restore function implements how one should restore the solver to a
-		// previously snapshotted state. You should implement the RestoreSolverState()
-		// function that restores the state from a SolverState protocol buffer.
-		void Restore(const char* resume_file);
-		virtual ~Solver() {
-		}
-		inline shared_ptr<Net<Dtype> > net() {
-			return net_;
-		}
-		inline const vector<shared_ptr<Net<Dtype> > >& test_nets() {
-			return test_nets_;
-		}
-		int iter() {
-			return iter_;
-		}
-
-	protected:
-		// Make and apply the update value for the current iteration.
-		virtual void ApplyUpdate() = 0;
-		// The Solver::Snapshot function implements the basic snapshotting utility
-		// that stores the learned net. You should implement the SnapshotSolverState()
-		// function that produces a SolverState protocol buffer that needs to be
-		// written to disk together with the learned net.
-		void Snapshot();
-		// The test routine
-		void TestAll();
-		void Test(const int test_net_id = 0);
-		virtual void SnapshotSolverState(SolverState* state) = 0;
-		virtual void RestoreSolverState(const SolverState& state) = 0;
-
-		void DisplayOutputBlobs(const int net_id);
-
-		SolverParameter param_;
-		int iter_;
-		int current_step_;
-		shared_ptr<Net<Dtype> > net_;
-		vector<shared_ptr<Net<Dtype> > > test_nets_;
-
-		void ocl_setup();
-		protected:
-		cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
-
-		DISABLE_COPY_AND_ASSIGN (Solver);
+  public:
+    explicit Solver(const SolverParameter& param);
+    explicit Solver(const string& param_file);
+    void Init(const SolverParameter& param);
+    void InitTrainNet();
+    void InitTestNets();
+    // The main entry of the solver function. In default, iter will be zero. Pass
+    // in a non-zero iter number to resume training for a pre-trained net.
+    virtual void Solve(const char* resume_file = NULL);
+    inline void Solve(const string resume_file) {
+      Solve(resume_file.c_str());
+    }
+    void Step(int iters);
+    // The Restore function implements how one should restore the solver to a
+    // previously snapshotted state. You should implement the RestoreSolverState()
+    // function that restores the state from a SolverState protocol buffer.
+    void Restore(const char* resume_file);
+    virtual ~Solver() {
+    }
+    inline shared_ptr<Net<Dtype> > net() {
+      return net_;
+    }
+    inline const vector<shared_ptr<Net<Dtype> > >& test_nets() {
+      return test_nets_;
+    }
+    int iter() {
+      return iter_;
+    }
+
+  protected:
+    // Make and apply the update value for the current iteration.
+    virtual void ApplyUpdate() = 0;
+    // The Solver::Snapshot function implements the basic snapshotting utility
+    // that stores the learned net. You should implement the SnapshotSolverState()
+    // function that produces a SolverState protocol buffer that needs to be
+    // written to disk together with the learned net.
+    void Snapshot();
+    // The test routine
+    void TestAll();
+    void Test(const int test_net_id = 0);
+    virtual void SnapshotSolverState(SolverState* state) = 0;
+    virtual void RestoreSolverState(const SolverState& state) = 0;
+
+    void DisplayOutputBlobs(const int net_id);
+
+    SolverParameter param_;
+    int iter_;
+    int current_step_;
+    shared_ptr<Net<Dtype> > net_;
+    vector<shared_ptr<Net<Dtype> > > test_nets_;
+
+    void ocl_setup();
+  protected:
+    cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
+
+    DISABLE_COPY_AND_ASSIGN (Solver);
 };
 
 /**
@@ -80,109 +80,103 @@ class Solver {
  */
 template <typename Dtype>
 class SGDSolver: public Solver<Dtype> {
-	public:
-		explicit SGDSolver(const SolverParameter& param)
-		:
-				Solver<Dtype>(param) {
-			PreSolve();
-		}
-		explicit SGDSolver(const string& param_file)
-		:
-				Solver<Dtype>(param_file) {
-			PreSolve();
-		}
-
-		const vector<shared_ptr<Blob<Dtype> > >& history() {
-			return history_;
-		}
-
-	protected:
-		void PreSolve();
-		Dtype GetLearningRate();
-		virtual void ApplyUpdate();
-		virtual void Normalize(int param_id);
-		virtual void Regularize(int param_id);
-		virtual void ComputeUpdateValue(int param_id, Dtype rate);
-		virtual void ClipGradients();
-		virtual void SnapshotSolverState(SolverState * state);
-		virtual void RestoreSolverState(const SolverState& state);
-		// history maintains the historical momentum data.
-		// update maintains update related data and is not needed in snapshots.
-		// temp maintains other information that might be needed in computation
-		//   of gradients/updates and is not needed in snapshots
-		vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
-
-		void ocl_setup();
-		protected:
-		cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
-
-		DISABLE_COPY_AND_ASSIGN (SGDSolver);
+  public:
+    explicit SGDSolver(const SolverParameter& param)
+        : Solver<Dtype>(param) {
+      PreSolve();
+    }
+    explicit SGDSolver(const string& param_file)
+        : Solver<Dtype>(param_file) {
+      PreSolve();
+    }
+
+    const vector<shared_ptr<Blob<Dtype> > >& history() {
+      return history_;
+    }
+
+  protected:
+    void PreSolve();
+    Dtype GetLearningRate();
+    virtual void ApplyUpdate();
+    virtual void Normalize(int param_id);
+    virtual void Regularize(int param_id);
+    virtual void ComputeUpdateValue(int param_id, Dtype rate);
+    virtual void ClipGradients();
+    virtual void SnapshotSolverState(SolverState * state);
+    virtual void RestoreSolverState(const SolverState& state);
+    // history maintains the historical momentum data.
+    // update maintains update related data and is not needed in snapshots.
+    // temp maintains other information that might be needed in computation
+    //   of gradients/updates and is not needed in snapshots
+    vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
+
+    void ocl_setup();
+  protected:
+    cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
+
+    DISABLE_COPY_AND_ASSIGN (SGDSolver);
 };
 
 template <typename Dtype>
 class NesterovSolver: public SGDSolver<Dtype> {
-	public:
-		explicit NesterovSolver(const SolverParameter& param)
-		:
-				SGDSolver<Dtype>(param) {
-		}
-		explicit NesterovSolver(const string& param_file)
-		:
-				SGDSolver<Dtype>(param_file) {
-		}
-
-	protected:
-		virtual void ComputeUpdateValue(int param_id, Dtype rate);
-
-		void ocl_setup();
-		protected:
-		cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
-
-		DISABLE_COPY_AND_ASSIGN (NesterovSolver);
+  public:
+    explicit NesterovSolver(const SolverParameter& param)
+        : SGDSolver<Dtype>(param) {
+    }
+    explicit NesterovSolver(const string& param_file)
+        : SGDSolver<Dtype>(param_file) {
+    }
+
+  protected:
+    virtual void ComputeUpdateValue(int param_id, Dtype rate);
+
+    void ocl_setup();
+  protected:
+    cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
+
+    DISABLE_COPY_AND_ASSIGN (NesterovSolver);
 };
 
 template <typename Dtype>
 class AdaGradSolver: public SGDSolver<Dtype> {
-	public:
-		explicit AdaGradSolver(const SolverParameter& param)
-		:
-				SGDSolver<Dtype>(param) {
-			constructor_sanity_check();
-		}
-		explicit AdaGradSolver(const string& param_file)
-		:
-				SGDSolver<Dtype>(param_file) {
-			constructor_sanity_check();
-		}
-
-	protected:
-		virtual void ComputeUpdateValue(int param_id, Dtype rate);
-		void constructor_sanity_check() {
-			CHECK_EQ(0, this->param_.momentum())
-					<< "Momentum cannot be used with AdaGrad.";
-		}
-
-		void ocl_setup();
-		protected:
-		cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
-		DISABLE_COPY_AND_ASSIGN (AdaGradSolver);
+  public:
+    explicit AdaGradSolver(const SolverParameter& param)
+        : SGDSolver<Dtype>(param) {
+      constructor_sanity_check();
+    }
+    explicit AdaGradSolver(const string& param_file)
+        : SGDSolver<Dtype>(param_file) {
+      constructor_sanity_check();
+    }
+
+  protected:
+    virtual void ComputeUpdateValue(int param_id, Dtype rate);
+    void constructor_sanity_check() {
+      CHECK_EQ(0, this->param_.momentum())
+          << "Momentum cannot be used with AdaGrad.";
+    }
+
+    void ocl_setup();
+  protected:
+    cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel;
+    DISABLE_COPY_AND_ASSIGN (AdaGradSolver);
 };
 
 template <typename Dtype>
 Solver<Dtype>* GetSolver(const SolverParameter& param) {
-	SolverParameter_SolverType type = param.solver_type();
-
-	switch (type) {
-		case SolverParameter_SolverType_SGD:
-			return new SGDSolver<Dtype>(param);
-		case SolverParameter_SolverType_NESTEROV:
-			return new NesterovSolver<Dtype>(param);
-		case SolverParameter_SolverType_ADAGRAD:
-			return new AdaGradSolver<Dtype>(param);
-		default:
-			LOG(FATAL) << "Unknown SolverType: " << type;
-	}
-	return (Solver<Dtype>*) NULL;
+  SolverParameter_SolverType type = param.solver_type();
+
+  switch (type) {
+  case SolverParameter_SolverType_SGD:
+    return new SGDSolver<Dtype>(param);
+  case SolverParameter_SolverType_NESTEROV:
+    return new NesterovSolver<Dtype>(param);
+  case SolverParameter_SolverType_ADAGRAD:
+    return new AdaGradSolver<Dtype>(param);
+  default:
+    LOG(FATAL) << "Unknown SolverType: " << type;
+  }
+  return (Solver<Dtype>*) NULL;
 }
 
 }  // namespace caffe
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 1a16c04a..1647b6f3 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -49,12 +49,12 @@ namespace caffe {
 // does not seem to create a memory bottleneck here.
 
 inline void CaffeMallocHost(void** ptr, size_t size) {
-	*ptr = malloc(size);
-	CHECK(*ptr) << "host allocation of size " << size << " failed";
+  *ptr = malloc(size);
+  CHECK(*ptr) << "host allocation of size " << size << " failed";
 }
 
 inline void CaffeFreeHost(void* ptr) {
-	free(ptr);
+  free(ptr);
 }
 
 /**
@@ -64,55 +64,53 @@ inline void CaffeFreeHost(void* ptr) {
  * TODO(dox): more thorough description.
  */
 class SyncedMemory {
-	public:
-		SyncedMemory()
-		:
-				cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-						own_cpu_data_(false), data_layer_(false) {
-			ocl_setup();
-		}
-		explicit SyncedMemory(size_t size)
-		:
-				cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
-						own_cpu_data_(false), data_layer_(false) {
-			ocl_setup();
-		}
+  public:
+    SyncedMemory()
+        : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_(
+            false), data_layer_(false) {
+      ocl_setup();
+    }
+    explicit SyncedMemory(size_t size)
+        : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_(
+            false), data_layer_(false) {
+      ocl_setup();
+    }
 
-		~SyncedMemory();
-		const void* cpu_data();
-		void set_cpu_data(void* data);
-		const void* gpu_data();
-		const void* gpu_cache_data();
-		void* mutable_cpu_data();
-		void* mutable_gpu_data();
-		enum SyncedHead {
-			UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED
-		};
-		SyncedHead head() {
-			return head_;
-		}
-		size_t size() {
-			return size_;
-		}
-		void set_data_layer() {
-			data_layer_ = true;
-		}
-	private:
-		void ocl_setup();
-		protected:
-		cl_kernel oclmem_kernel;
+    ~SyncedMemory();
+    const void* cpu_data();
+    void set_cpu_data(void* data);
+    const void* gpu_data();
+    const void* gpu_cache_data();
+    void* mutable_cpu_data();
+    void* mutable_gpu_data();
+    enum SyncedHead {
+      UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED
+    };
+    SyncedHead head() {
+      return head_;
+    }
+    size_t size() {
+      return size_;
+    }
+    void set_data_layer() {
+      data_layer_ = true;
+    }
+  private:
+    void ocl_setup();
+  protected:
+    cl_kernel oclmem_kernel;
 
-	private:
-		void to_cpu();
-		void to_gpu();
-		void* cpu_ptr_;
-		void* gpu_ptr_;
-		void* gpu_cache_ptr_;
-		size_t size_;
-		SyncedHead head_;
-		bool own_cpu_data_;
-		bool data_layer_;
-		DISABLE_COPY_AND_ASSIGN (SyncedMemory);
+  private:
+    void to_cpu();
+    void to_gpu();
+    void* cpu_ptr_;
+    void* gpu_ptr_;
+    void* gpu_cache_ptr_;
+    size_t size_;
+    SyncedHead head_;
+    bool own_cpu_data_;
+    bool data_layer_;
+    DISABLE_COPY_AND_ASSIGN (SyncedMemory);
 };
 // class SyncedMemory
 
diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp
index 179e31ca..401e2136 100644
--- a/include/caffe/test/test_caffe_main.hpp
+++ b/include/caffe/test/test_caffe_main.hpp
@@ -27,27 +27,27 @@ int main(int argc, char** argv);
 
 namespace caffe {
 
-template<typename TypeParam>
+template <typename TypeParam>
 class MultiDeviceTest: public ::testing::Test {
-	public:
-		typedef typename TypeParam::Dtype Dtype;
-		protected:
-		MultiDeviceTest() {
-			Caffe::set_mode(TypeParam::device);
-		}
-		virtual ~MultiDeviceTest() {
-		}
+  public:
+    typedef typename TypeParam::Dtype Dtype;
+  protected:
+    MultiDeviceTest() {
+      Caffe::set_mode(TypeParam::device);
+    }
+    virtual ~MultiDeviceTest() {
+    }
 };
 
 typedef ::testing::Types<float, double> TestDtypes;
 
-template<typename TypeParam>
+template <typename TypeParam>
 struct CPUDevice {
-		typedef TypeParam Dtype;
-		static const Caffe::Brew device = Caffe::CPU;
+    typedef TypeParam Dtype;
+    static const Caffe::Brew device = Caffe::CPU;
 };
 
-template<typename Dtype>
+template <typename Dtype>
 class CPUDeviceTest: public MultiDeviceTest<CPUDevice<Dtype> > {
 };
 
@@ -58,19 +58,18 @@ CPUDevice<double> > TestDtypesAndDevices;
 
 #else
 
-template<typename TypeParam>
+template <typename TypeParam>
 struct GPUDevice {
-		typedef TypeParam Dtype;
-		static const Caffe::Brew device = Caffe::GPU;
+    typedef TypeParam Dtype;
+    static const Caffe::Brew device = Caffe::GPU;
 };
 
-template<typename Dtype>
+template <typename Dtype>
 class GPUDeviceTest: public MultiDeviceTest<GPUDevice<Dtype> > {
 };
 
-typedef ::testing::Types<CPUDevice<float>, CPUDevice<double>,
-	GPUDevice<float>, GPUDevice<double> >
-TestDtypesAndDevices;
+typedef ::testing::Types<CPUDevice<float>, CPUDevice<double>, GPUDevice<float>,
+    GPUDevice<double> > TestDtypesAndDevices;
 
 #endif
 
diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp
index 07fe69cf..081ce203 100644
--- a/include/caffe/test/test_gradient_check_util.hpp
+++ b/include/caffe/test/test_gradient_check_util.hpp
@@ -15,244 +15,244 @@ namespace caffe {
 
 // The gradient checker adds a L2 normalization loss function on top of the
 // top blobs, and checks the gradient.
-template<typename Dtype>
+template <typename Dtype>
 class GradientChecker {
-	public:
-		// kink and kink_range specify an ignored nonsmooth region of the form
-		// kink - kink_range <= |feature value| <= kink + kink_range,
-		// which accounts for all nonsmoothness in use by caffe
-		GradientChecker(const Dtype stepsize, const Dtype threshold,
-			const unsigned int seed = 1701, const Dtype kink = 0.,
-			const Dtype kink_range = -1)
-			: stepsize_(stepsize), threshold_(threshold), seed_(seed),
-				kink_(kink), kink_range_(kink_range) {
-		}
-		// Checks the gradient of a layer, with provided bottom layers and top
-		// layers.
-		// Note that after the gradient check, we do not guarantee that the data
-		// stored in the layer parameters and the blobs are unchanged.
-		void CheckGradient(Layer<Dtype>* layer, const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top, int check_bottom = -1) {
-			layer->SetUp(bottom, top);
-			CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1);
-		}
-		void CheckGradientExhaustive(Layer<Dtype>* layer,
-			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
-			int check_bottom = -1);
+  public:
+    // kink and kink_range specify an ignored nonsmooth region of the form
+    // kink - kink_range <= |feature value| <= kink + kink_range,
+    // which accounts for all nonsmoothness in use by caffe
+    GradientChecker(const Dtype stepsize, const Dtype threshold,
+        const unsigned int seed = 1701, const Dtype kink = 0.,
+        const Dtype kink_range = -1)
+        : stepsize_(stepsize), threshold_(threshold), seed_(seed), kink_(kink), kink_range_(
+            kink_range) {
+    }
+    // Checks the gradient of a layer, with provided bottom layers and top
+    // layers.
+    // Note that after the gradient check, we do not guarantee that the data
+    // stored in the layer parameters and the blobs are unchanged.
+    void CheckGradient(Layer<Dtype>* layer, const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top, int check_bottom = -1) {
+      layer->SetUp(bottom, top);
+      CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1);
+    }
+    void CheckGradientExhaustive(Layer<Dtype>* layer,
+        const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
+        int check_bottom = -1);
 
-		// CheckGradientEltwise can be used to test layers that perform element-wise
-		// computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when
-		// i != j.
-		void CheckGradientEltwise(Layer<Dtype>* layer,
-			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    // CheckGradientEltwise can be used to test layers that perform element-wise
+    // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when
+    // i != j.
+    void CheckGradientEltwise(Layer<Dtype>* layer,
+        const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
 
-		void CheckGradientSingle(Layer<Dtype>* layer,
-			const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
-			int check_bottom, int top_id, int top_data_id, bool element_wise = false);
+    void CheckGradientSingle(Layer<Dtype>* layer,
+        const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
+        int check_bottom, int top_id, int top_data_id,
+        bool element_wise = false);
 
-		// Checks the gradient of a network. This network should not have any data
-		// layers or loss layers, since the function does not explicitly deal with
-		// such cases yet. All input blobs and parameter blobs are going to be
-		// checked, layer-by-layer to avoid numerical problems to accumulate.
-		void CheckGradientNet(const Net<Dtype>& net,
-			const vector<Blob<Dtype>*>& input);
+    // Checks the gradient of a network. This network should not have any data
+    // layers or loss layers, since the function does not explicitly deal with
+    // such cases yet. All input blobs and parameter blobs are going to be
+    // checked, layer-by-layer to avoid numerical problems to accumulate.
+    void CheckGradientNet(const Net<Dtype>& net,
+        const vector<Blob<Dtype>*>& input);
 
-	protected:
-		Dtype GetObjAndGradient(const Layer<Dtype>& layer,
-			const vector<Blob<Dtype>*>& top, int top_id = -1, int top_data_id = -1);
-		Dtype stepsize_;
-		Dtype threshold_;
-		unsigned int seed_;
-		Dtype kink_;
-		Dtype kink_range_;
+  protected:
+    Dtype GetObjAndGradient(const Layer<Dtype>& layer,
+        const vector<Blob<Dtype>*>& top, int top_id = -1, int top_data_id = -1);
+    Dtype stepsize_;
+    Dtype threshold_;
+    unsigned int seed_;
+    Dtype kink_;
+    Dtype kink_range_;
 };
 
-template<typename Dtype>
+template <typename Dtype>
 void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
-	int check_bottom, int top_id, int top_data_id, bool element_wise) {
-	if (element_wise) {
-		CHECK_EQ(0, layer->blobs().size());
-		CHECK_LE(0, top_id);
-		CHECK_LE(0, top_data_id);
-		const int top_count = top[top_id]->count();
-		for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) {
-			CHECK_EQ(top_count, bottom[blob_id]->count());
-		}
-	}
-	// First, figure out what blobs we need to check against, and zero init
-	// parameter blobs.
-	vector<Blob<Dtype>*> blobs_to_check;
-	vector<bool> propagate_down(bottom.size(), check_bottom < 0);
-	for (int i = 0; i < layer->blobs().size(); ++i) {
-		Blob<Dtype>* blob = layer->blobs()[i].get();
-		caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
-		blobs_to_check.push_back(blob);
-	}
-	if (check_bottom < 0) {
-		for (int i = 0; i < bottom.size(); ++i) {
-			blobs_to_check.push_back(bottom[i]);
-		}
-	} else {
-		CHECK_LT(check_bottom, bottom.size());
-		blobs_to_check.push_back(bottom[check_bottom]);
-		propagate_down[check_bottom] = true;
-	}
-	// Compute the gradient analytically using Backward
-	Caffe::set_random_seed(seed_);
-	// Ignore the loss from the layer (it's just the weighted sum of the losses
-	// from the top blobs, whose gradients we may want to test individually).
-	layer->Forward(bottom, top);
-	// Get additional loss from the objective
-	GetObjAndGradient(*layer, top, top_id, top_data_id);
-	layer->Backward(top, propagate_down, bottom);
-	// Store computed gradients for all checked blobs
-	vector < shared_ptr<Blob<Dtype> > >
-		computed_gradient_blobs(blobs_to_check.size());
-	for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
-		Blob<Dtype>* current_blob = blobs_to_check[blob_id];
-		computed_gradient_blobs[blob_id].reset(new Blob<Dtype>());
-		computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob);
-		const int count = blobs_to_check[blob_id]->count();
-		const Dtype* diff = blobs_to_check[blob_id]->cpu_diff();
-		Dtype* computed_gradients =
-			computed_gradient_blobs[blob_id]->mutable_cpu_data();
-		caffe_copy(count, diff, computed_gradients);
-	}
-	// Compute derivative of top w.r.t. each bottom and parameter input using
-	// finite differencing.
-	// LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
-	for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
-		Blob<Dtype>* current_blob = blobs_to_check[blob_id];
-		const Dtype* computed_gradients =
-			computed_gradient_blobs[blob_id]->cpu_data();
-		// LOG(ERROR) << "Blob " << blob_id << ": checking "
-		//     << current_blob->count() << " parameters.";
-		for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
-			// For an element-wise layer, we only need to do finite differencing to
-			// compute the derivative of top[top_id][top_data_id] w.r.t.
-			// bottom[blob_id][i] only for i == top_data_id.  For any other
-			// i != top_data_id, we know the derivative is 0 by definition, and simply
-			// check that that's true.
-			Dtype estimated_gradient = 0;
-			Dtype positive_objective = 0;
-			Dtype negative_objective = 0;
-			if (!element_wise || (feat_id == top_data_id)) {
-				// Do finite differencing.
-				// Compute loss with stepsize_ added to input.
-				current_blob->mutable_cpu_data()[feat_id] += stepsize_;
-				Caffe::set_random_seed(seed_);
-				layer->Forward(bottom, top);
-				positive_objective =
-					GetObjAndGradient(*layer, top, top_id, top_data_id);
-				// Compute loss with stepsize_ subtracted from input.
-				current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
-				Caffe::set_random_seed(seed_);
-				layer->Forward(bottom, top);
-				negative_objective =
-					GetObjAndGradient(*layer, top, top_id, top_data_id);
-				// Recover original input value.
-				current_blob->mutable_cpu_data()[feat_id] += stepsize_;
-				estimated_gradient = (positive_objective - negative_objective) /
-					stepsize_ / 2.;
-			}
-			Dtype computed_gradient = computed_gradients[feat_id];
-			Dtype feature = current_blob->cpu_data()[feat_id];
-			// LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
-			//     << current_blob->cpu_diff()[feat_id];
-			if (kink_ - kink_range_ > fabs(feature)
-				|| fabs(feature) > kink_ + kink_range_) {
-				// We check relative accuracy, but for too small values, we threshold
-				// the scale factor by 1.
-				Dtype scale = std::max(
-					std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.);
-				EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale)
-					<< "debug: (top_id, top_data_id, blob_id, feat_id)="
-					<< top_id << "," << top_data_id << "," << blob_id << "," << feat_id
-					<< "; feat = " << feature
-					<< "; objective+ = " << positive_objective
-					<< "; objective- = " << negative_objective;
-			}
-			// LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
-			// LOG(ERROR) << "computed gradient: " << computed_gradient
-			//    << " estimated_gradient: " << estimated_gradient;
-		}
-	}
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
+    int check_bottom, int top_id, int top_data_id, bool element_wise) {
+  if (element_wise) {
+    CHECK_EQ(0, layer->blobs().size());
+    CHECK_LE(0, top_id);
+    CHECK_LE(0, top_data_id);
+    const int top_count = top[top_id]->count();
+    for (int blob_id = 0; blob_id < bottom.size(); ++blob_id) {
+      CHECK_EQ(top_count, bottom[blob_id]->count());
+    }
+  }
+  // First, figure out what blobs we need to check against, and zero init
+  // parameter blobs.
+  vector<Blob<Dtype>*> blobs_to_check;
+  vector<bool> propagate_down(bottom.size(), check_bottom < 0);
+  for (int i = 0; i < layer->blobs().size(); ++i) {
+    Blob<Dtype>* blob = layer->blobs()[i].get();
+    caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
+    blobs_to_check.push_back(blob);
+  }
+  if (check_bottom < 0) {
+    for (int i = 0; i < bottom.size(); ++i) {
+      blobs_to_check.push_back(bottom[i]);
+    }
+  } else {
+    CHECK_LT(check_bottom, bottom.size());
+    blobs_to_check.push_back(bottom[check_bottom]);
+    propagate_down[check_bottom] = true;
+  }
+  // Compute the gradient analytically using Backward
+  Caffe::set_random_seed(seed_);
+  // Ignore the loss from the layer (it's just the weighted sum of the losses
+  // from the top blobs, whose gradients we may want to test individually).
+  layer->Forward(bottom, top);
+  // Get additional loss from the objective
+  GetObjAndGradient(*layer, top, top_id, top_data_id);
+  layer->Backward(top, propagate_down, bottom);
+  // Store computed gradients for all checked blobs
+  vector < shared_ptr<Blob<Dtype> >
+      > computed_gradient_blobs(blobs_to_check.size());
+  for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
+    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
+    computed_gradient_blobs[blob_id].reset(new Blob<Dtype>());
+    computed_gradient_blobs[blob_id]->ReshapeLike(*current_blob);
+    const int count = blobs_to_check[blob_id]->count();
+    const Dtype* diff = blobs_to_check[blob_id]->cpu_diff();
+    Dtype* computed_gradients =
+        computed_gradient_blobs[blob_id]->mutable_cpu_data();
+    caffe_copy(count, diff, computed_gradients);
+  }
+  // Compute derivative of top w.r.t. each bottom and parameter input using
+  // finite differencing.
+  // LOG(ERROR) << "Checking " << blobs_to_check.size() << " blobs.";
+  for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) {
+    Blob<Dtype>* current_blob = blobs_to_check[blob_id];
+    const Dtype* computed_gradients =
+        computed_gradient_blobs[blob_id]->cpu_data();
+    // LOG(ERROR) << "Blob " << blob_id << ": checking "
+    //     << current_blob->count() << " parameters.";
+    for (int feat_id = 0; feat_id < current_blob->count(); ++feat_id) {
+      // For an element-wise layer, we only need to do finite differencing to
+      // compute the derivative of top[top_id][top_data_id] w.r.t.
+      // bottom[blob_id][i] only for i == top_data_id.  For any other
+      // i != top_data_id, we know the derivative is 0 by definition, and simply
+      // check that that's true.
+      Dtype estimated_gradient = 0;
+      Dtype positive_objective = 0;
+      Dtype negative_objective = 0;
+      if (!element_wise || (feat_id == top_data_id)) {
+        // Do finite differencing.
+        // Compute loss with stepsize_ added to input.
+        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
+        Caffe::set_random_seed(seed_);
+        layer->Forward(bottom, top);
+        positive_objective = GetObjAndGradient(*layer, top, top_id,
+            top_data_id);
+        // Compute loss with stepsize_ subtracted from input.
+        current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2;
+        Caffe::set_random_seed(seed_);
+        layer->Forward(bottom, top);
+        negative_objective = GetObjAndGradient(*layer, top, top_id,
+            top_data_id);
+        // Recover original input value.
+        current_blob->mutable_cpu_data()[feat_id] += stepsize_;
+        estimated_gradient = (positive_objective - negative_objective)
+            / stepsize_ / 2.;
+      }
+      Dtype computed_gradient = computed_gradients[feat_id];
+      Dtype feature = current_blob->cpu_data()[feat_id];
+      // LOG(ERROR) << "debug: " << current_blob->cpu_data()[feat_id] << " "
+      //     << current_blob->cpu_diff()[feat_id];
+      if (kink_ - kink_range_ > fabs(feature)
+          || fabs(feature) > kink_ + kink_range_) {
+        // We check relative accuracy, but for too small values, we threshold
+        // the scale factor by 1.
+        Dtype scale = std::max(
+            std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.);
+        EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale)
+            << "debug: (top_id, top_data_id, blob_id, feat_id)=" << top_id
+            << "," << top_data_id << "," << blob_id << "," << feat_id
+            << "; feat = " << feature << "; objective+ = " << positive_objective
+            << "; objective- = " << negative_objective;
+      }
+      // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id];
+      // LOG(ERROR) << "computed gradient: " << computed_gradient
+      //    << " estimated_gradient: " << estimated_gradient;
+    }
+  }
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void GradientChecker<Dtype>::CheckGradientExhaustive(Layer<Dtype>* layer,
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
-	int check_bottom) {
-	layer->SetUp(bottom, top);
-	CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob.";
-	// LOG(ERROR) << "Exhaustive Mode.";
-	for (int i = 0; i < top.size(); ++i) {
-		// LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count();
-		for (int j = 0; j < top[i]->count(); ++j) {
-			// LOG(ERROR) << "Exhaustive: blob " << i << " data " << j;
-			CheckGradientSingle(layer, bottom, top, check_bottom, i, j);
-		}
-	}
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
+    int check_bottom) {
+  layer->SetUp(bottom, top);
+  CHECK_GT(top.size(), 0) << "Exhaustive mode requires at least one top blob.";
+  // LOG(ERROR) << "Exhaustive Mode.";
+  for (int i = 0; i < top.size(); ++i) {
+    // LOG(ERROR) << "Exhaustive: blob " << i << " size " << top[i]->count();
+    for (int j = 0; j < top[i]->count(); ++j) {
+      // LOG(ERROR) << "Exhaustive: blob " << i << " data " << j;
+      CheckGradientSingle(layer, bottom, top, check_bottom, i, j);
+    }
+  }
 }
 
-template<typename Dtype>
+template <typename Dtype>
 void GradientChecker<Dtype>::CheckGradientEltwise(Layer<Dtype>* layer,
-	const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	layer->SetUp(bottom, top);
-	CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob.";
-	const int check_bottom = -1;
-	const bool element_wise = true;
-	for (int i = 0; i < top.size(); ++i) {
-		for (int j = 0; j < top[i]->count(); ++j) {
-			CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise);
-		}
-	}
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  layer->SetUp(bottom, top);
+  CHECK_GT(top.size(), 0) << "Eltwise mode requires at least one top blob.";
+  const int check_bottom = -1;
+  const bool element_wise = true;
+  for (int i = 0; i < top.size(); ++i) {
+    for (int j = 0; j < top[i]->count(); ++j) {
+      CheckGradientSingle(layer, bottom, top, check_bottom, i, j, element_wise);
+    }
+  }
 }
 
-template<typename Dtype>
-void GradientChecker<Dtype>::CheckGradientNet(
-	const Net<Dtype>& net, const vector<Blob<Dtype>*>& input) {
-	const vector<shared_ptr<Layer<Dtype> > >& layers = net.layers();
-	vector < vector<Blob<Dtype>*> > &bottom_vecs = net.bottom_vecs();
-	vector < vector<Blob<Dtype>*> > &top_vecs = net.top_vecs();
-	for (int i = 0; i < layers.size(); ++i) {
-		net.Forward(input);
-		LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name();
-		CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]);
-	}
+template <typename Dtype>
+void GradientChecker<Dtype>::CheckGradientNet(const Net<Dtype>& net,
+    const vector<Blob<Dtype>*>& input) {
+  const vector<shared_ptr<Layer<Dtype> > >& layers = net.layers();
+  vector < vector<Blob<Dtype>*> > &bottom_vecs = net.bottom_vecs();
+  vector < vector<Blob<Dtype>*> > &top_vecs = net.top_vecs();
+  for (int i = 0; i < layers.size(); ++i) {
+    net.Forward(input);
+    LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name();
+    CheckGradientExhaustive(*(layers[i].get()), bottom_vecs[i], top_vecs[i]);
+  }
 }
 
-template<typename Dtype>
+template <typename Dtype>
 Dtype GradientChecker<Dtype>::GetObjAndGradient(const Layer<Dtype>& layer,
-	const vector<Blob<Dtype>*>& top, int top_id, int top_data_id) {
-	Dtype loss = 0;
-	if (top_id < 0) {
-		// the loss will be half of the sum of squares of all outputs
-		for (int i = 0; i < top.size(); ++i) {
-			Blob<Dtype>* top_blob = top[i];
-			const Dtype* top_blob_data = top_blob->cpu_data();
-			Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
-			int count = top_blob->count();
-			for (int j = 0; j < count; ++j) {
-				loss += top_blob_data[j] * top_blob_data[j];
-			}
-			// set the diff: simply the data.
-			caffe_copy(top_blob->count(), top_blob_data, top_blob_diff);
-		}
-		loss /= 2.;
-	} else {
-		// the loss will be the top_data_id-th element in the top_id-th blob.
-		for (int i = 0; i < top.size(); ++i) {
-			Blob<Dtype>* top_blob = top[i];
-			Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
-			caffe_set(top_blob->count(), Dtype(0), top_blob_diff);
-		}
-		const Dtype loss_weight = 2;
-		loss = top[top_id]->cpu_data()[top_data_id] * loss_weight;
-		top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight;
-	}
-	return loss;
+    const vector<Blob<Dtype>*>& top, int top_id, int top_data_id) {
+  Dtype loss = 0;
+  if (top_id < 0) {
+    // the loss will be half of the sum of squares of all outputs
+    for (int i = 0; i < top.size(); ++i) {
+      Blob<Dtype>* top_blob = top[i];
+      const Dtype* top_blob_data = top_blob->cpu_data();
+      Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
+      int count = top_blob->count();
+      for (int j = 0; j < count; ++j) {
+        loss += top_blob_data[j] * top_blob_data[j];
+      }
+      // set the diff: simply the data.
+      caffe_copy(top_blob->count(), top_blob_data, top_blob_diff);
+    }
+    loss /= 2.;
+  } else {
+    // the loss will be the top_data_id-th element in the top_id-th blob.
+    for (int i = 0; i < top.size(); ++i) {
+      Blob<Dtype>* top_blob = top[i];
+      Dtype* top_blob_diff = top_blob->mutable_cpu_diff();
+      caffe_set(top_blob->count(), Dtype(0), top_blob_diff);
+    }
+    const Dtype loss_weight = 2;
+    loss = top[top_id]->cpu_data()[top_data_id] * loss_weight;
+    top[top_id]->mutable_cpu_diff()[top_data_id] = loss_weight;
+  }
+  return loss;
 }
 
 }  // namespace caffe
diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp
index f5818f6f..f48be453 100644
--- a/include/caffe/util/benchmark.hpp
+++ b/include/caffe/util/benchmark.hpp
@@ -8,50 +8,50 @@
 namespace caffe {
 
 class Timer {
-	public:
-		Timer();
-		virtual ~Timer();
-		virtual void Start();
-		virtual void Stop();
-		virtual float MilliSeconds();
-		virtual float MicroSeconds();
-		virtual float Seconds();
-
-		inline bool initted() {
-			return initted_;
-		}
-		inline bool running() {
-			return running_;
-		}
-		inline bool has_run_at_least_once() {
-			return has_run_at_least_once_;
-		}
-
-	protected:
-		void Init();
-
-		bool initted_;
-		bool running_;
-		bool has_run_at_least_once_;
-		#ifndef CPU_ONLY
-		//cudaEvent_t start_gpu_;
-		//cudaEvent_t stop_gpu_;
+  public:
+    Timer();
+    virtual ~Timer();
+    virtual void Start();
+    virtual void Stop();
+    virtual float MilliSeconds();
+    virtual float MicroSeconds();
+    virtual float Seconds();
+
+    inline bool initted() {
+      return initted_;
+    }
+    inline bool running() {
+      return running_;
+    }
+    inline bool has_run_at_least_once() {
+      return has_run_at_least_once_;
+    }
+
+  protected:
+    void Init();
+
+    bool initted_;
+    bool running_;
+    bool has_run_at_least_once_;
+#ifndef CPU_ONLY
+    //cudaEvent_t start_gpu_;
+    //cudaEvent_t stop_gpu_;
 #endif
-		boost::posix_time::ptime start_cpu_;
-		boost::posix_time::ptime stop_cpu_;
-		float elapsed_milliseconds_;
-		float elapsed_microseconds_;
+    boost::posix_time::ptime start_cpu_;
+    boost::posix_time::ptime stop_cpu_;
+    float elapsed_milliseconds_;
+    float elapsed_microseconds_;
 };
 
 class CPUTimer: public Timer {
-	public:
-		explicit CPUTimer();
-		virtual ~CPUTimer() {
-		}
-		virtual void Start();
-		virtual void Stop();
-		virtual float MilliSeconds();
-		virtual float MicroSeconds();
+  public:
+    explicit CPUTimer();
+    virtual ~CPUTimer() {
+    }
+    virtual void Start();
+    virtual void Stop();
+    virtual float MilliSeconds();
+    virtual float MicroSeconds();
 };
 
 }  // namespace caffe
diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp
index 1ff29356..1994c48a 100644
--- a/include/caffe/util/cudnn.hpp
+++ b/include/caffe/util/cudnn.hpp
@@ -15,116 +15,116 @@
   } while (0)
 
 inline const char* cudnnGetErrorString(cudnnStatus_t status) {
-	switch (status) {
-		case CUDNN_STATUS_SUCCESS:
-		return "CUDNN_STATUS_SUCCESS";
-		case CUDNN_STATUS_NOT_INITIALIZED:
-		return "CUDNN_STATUS_NOT_INITIALIZED";
-		case CUDNN_STATUS_ALLOC_FAILED:
-		return "CUDNN_STATUS_ALLOC_FAILED";
-		case CUDNN_STATUS_BAD_PARAM:
-		return "CUDNN_STATUS_BAD_PARAM";
-		case CUDNN_STATUS_INTERNAL_ERROR:
-		return "CUDNN_STATUS_INTERNAL_ERROR";
-		case CUDNN_STATUS_INVALID_VALUE:
-		return "CUDNN_STATUS_INVALID_VALUE";
-		case CUDNN_STATUS_ARCH_MISMATCH:
-		return "CUDNN_STATUS_ARCH_MISMATCH";
-		case CUDNN_STATUS_MAPPING_ERROR:
-		return "CUDNN_STATUS_MAPPING_ERROR";
-		case CUDNN_STATUS_EXECUTION_FAILED:
-		return "CUDNN_STATUS_EXECUTION_FAILED";
-		case CUDNN_STATUS_NOT_SUPPORTED:
-		return "CUDNN_STATUS_NOT_SUPPORTED";
-		case CUDNN_STATUS_LICENSE_ERROR:
-		return "CUDNN_STATUS_LICENSE_ERROR";
-	}
-	return "Unknown cudnn status";
+  switch (status) {
+    case CUDNN_STATUS_SUCCESS:
+    return "CUDNN_STATUS_SUCCESS";
+    case CUDNN_STATUS_NOT_INITIALIZED:
+    return "CUDNN_STATUS_NOT_INITIALIZED";
+    case CUDNN_STATUS_ALLOC_FAILED:
+    return "CUDNN_STATUS_ALLOC_FAILED";
+    case CUDNN_STATUS_BAD_PARAM:
+    return "CUDNN_STATUS_BAD_PARAM";
+    case CUDNN_STATUS_INTERNAL_ERROR:
+    return "CUDNN_STATUS_INTERNAL_ERROR";
+    case CUDNN_STATUS_INVALID_VALUE:
+    return "CUDNN_STATUS_INVALID_VALUE";
+    case CUDNN_STATUS_ARCH_MISMATCH:
+    return "CUDNN_STATUS_ARCH_MISMATCH";
+    case CUDNN_STATUS_MAPPING_ERROR:
+    return "CUDNN_STATUS_MAPPING_ERROR";
+    case CUDNN_STATUS_EXECUTION_FAILED:
+    return "CUDNN_STATUS_EXECUTION_FAILED";
+    case CUDNN_STATUS_NOT_SUPPORTED:
+    return "CUDNN_STATUS_NOT_SUPPORTED";
+    case CUDNN_STATUS_LICENSE_ERROR:
+    return "CUDNN_STATUS_LICENSE_ERROR";
+  }
+  return "Unknown cudnn status";
 }
 
 namespace caffe {
 
-	namespace cudnn {
-
-		template <typename Dtype> class dataType;
-		template<> class dataType<float> {
-			public:
-			static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
-			static float oneval, zeroval;
-			static const void *one, *zero;
-		};
-		template<> class dataType<double> {
-			public:
-			static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
-			static double oneval, zeroval;
-			static const void *one, *zero;
-		};
-
-		template <typename Dtype>
-		inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) {
-			CUDNN_CHECK(cudnnCreateTensorDescriptor(desc));
-		}
-
-		template <typename Dtype>
-		inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
-				int n, int c, int h, int w,
-				int stride_n, int stride_c, int stride_h, int stride_w) {
-			CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType<Dtype>::type,
-							n, c, h, w, stride_n, stride_c, stride_h, stride_w));
-		}
-
-		template <typename Dtype>
-		inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
-				int n, int c, int h, int w) {
-			const int stride_w = 1;
-			const int stride_h = w * stride_w;
-			const int stride_c = h * stride_h;
-			const int stride_n = c * stride_c;
-			setTensor4dDesc<Dtype>(desc, n, c, h, w,
-					stride_n, stride_c, stride_h, stride_w);
-		}
-
-		template <typename Dtype>
-		inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
-				int n, int c, int h, int w) {
-			CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
-			CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
-							n, c, h, w));
-		}
-
-		template <typename Dtype>
-		inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) {
-			CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv));
-		}
-
-		template <typename Dtype>
-		inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv,
-				cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter,
-				int pad_h, int pad_w, int stride_h, int stride_w) {
-			CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
-							pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
-		}
-
-		template <typename Dtype>
-		inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
-				PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode,
-				int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) {
-			switch (poolmethod) {
-				case PoolingParameter_PoolMethod_MAX:
-				*mode = CUDNN_POOLING_MAX;
-				break;
-				case PoolingParameter_PoolMethod_AVE:
-				*mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-				break;
-				default:
-				LOG(FATAL) << "Unknown pooling method.";
-			}
-			CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
-			CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w,
-							pad_h, pad_w, stride_h, stride_w));
-		}
-
-	}  // namespace cudnn
+  namespace cudnn {
+
+    template <typename Dtype> class dataType;
+    template<> class dataType<float> {
+      public:
+      static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
+      static float oneval, zeroval;
+      static const void *one, *zero;
+    };
+    template<> class dataType<double> {
+      public:
+      static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
+      static double oneval, zeroval;
+      static const void *one, *zero;
+    };
+
+    template <typename Dtype>
+    inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) {
+      CUDNN_CHECK(cudnnCreateTensorDescriptor(desc));
+    }
+
+    template <typename Dtype>
+    inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
+        int n, int c, int h, int w,
+        int stride_n, int stride_c, int stride_h, int stride_w) {
+      CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType<Dtype>::type,
+              n, c, h, w, stride_n, stride_c, stride_h, stride_w));
+    }
+
+    template <typename Dtype>
+    inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc,
+        int n, int c, int h, int w) {
+      const int stride_w = 1;
+      const int stride_h = w * stride_w;
+      const int stride_c = h * stride_h;
+      const int stride_n = c * stride_c;
+      setTensor4dDesc<Dtype>(desc, n, c, h, w,
+          stride_n, stride_c, stride_h, stride_w);
+    }
+
+    template <typename Dtype>
+    inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
+        int n, int c, int h, int w) {
+      CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
+      CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
+              n, c, h, w));
+    }
+
+    template <typename Dtype>
+    inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) {
+      CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv));
+    }
+
+    template <typename Dtype>
+    inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv,
+        cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter,
+        int pad_h, int pad_w, int stride_h, int stride_w) {
+      CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
+              pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
+    }
+
+    template <typename Dtype>
+    inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
+        PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode,
+        int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) {
+      switch (poolmethod) {
+        case PoolingParameter_PoolMethod_MAX:
+        *mode = CUDNN_POOLING_MAX;
+        break;
+        case PoolingParameter_PoolMethod_AVE:
+        *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+        break;
+        default:
+        LOG(FATAL) << "Unknown pooling method.";
+      }
+      CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
+      CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w,
+              pad_h, pad_w, stride_h, stride_w));
+    }
+
+  }  // namespace cudnn
 
 }  // namespace caffe
 
diff --git a/include/caffe/util/db.hpp b/include/caffe/util/db.hpp
index a65e3acf..a872fb07 100644
--- a/include/caffe/util/db.hpp
+++ b/include/caffe/util/db.hpp
@@ -10,48 +10,48 @@ namespace caffe {
 namespace db {
 
 enum Mode {
-	READ, WRITE, NEW
+  READ, WRITE, NEW
 };
 
 class Cursor {
-	public:
-		Cursor() {
-		}
-		virtual ~Cursor() {
-		}
-		virtual void SeekToFirst() = 0;
-		virtual void Next() = 0;
-		virtual string key() = 0;
-		virtual string value() = 0;
-		virtual bool valid() = 0;
-
-		DISABLE_COPY_AND_ASSIGN (Cursor);
+  public:
+    Cursor() {
+    }
+    virtual ~Cursor() {
+    }
+    virtual void SeekToFirst() = 0;
+    virtual void Next() = 0;
+    virtual string key() = 0;
+    virtual string value() = 0;
+    virtual bool valid() = 0;
+
+    DISABLE_COPY_AND_ASSIGN (Cursor);
 };
 
 class Transaction {
-	public:
-		Transaction() {
-		}
-		virtual ~Transaction() {
-		}
-		virtual void Put(const string& key, const string& value) = 0;
-		virtual void Commit() = 0;
-
-		DISABLE_COPY_AND_ASSIGN (Transaction);
+  public:
+    Transaction() {
+    }
+    virtual ~Transaction() {
+    }
+    virtual void Put(const string& key, const string& value) = 0;
+    virtual void Commit() = 0;
+
+    DISABLE_COPY_AND_ASSIGN (Transaction);
 };
 
 class DB {
-	public:
-		DB() {
-		}
-		virtual ~DB() {
-		}
-		virtual void Open(const string& source, Mode mode) = 0;
-		virtual void Close() = 0;
-		virtual Cursor* NewCursor() = 0;
-		virtual Transaction* NewTransaction() = 0;
-
-		DISABLE_COPY_AND_ASSIGN (DB);
+  public:
+    DB() {
+    }
+    virtual ~DB() {
+    }
+    virtual void Open(const string& source, Mode mode) = 0;
+    virtual void Close() = 0;
+    virtual Cursor* NewCursor() = 0;
+    virtual Transaction* NewTransaction() = 0;
+
+    DISABLE_COPY_AND_ASSIGN (DB);
 };
 
 DB* GetDB(DataParameter::DB backend);
diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp
index c63fdbb0..c0f6ab62 100644
--- a/include/caffe/util/db_leveldb.hpp
+++ b/include/caffe/util/db_leveldb.hpp
@@ -12,83 +12,80 @@ namespace caffe {
 namespace db {
 
 class LevelDBCursor: public Cursor {
-	public:
-		explicit LevelDBCursor(leveldb::Iterator* iter)
-		:
-				iter_(iter) {
-			SeekToFirst();
-		}
-		~LevelDBCursor() {
-			delete iter_;
-		}
-		virtual void SeekToFirst() {
-			iter_->SeekToFirst();
-		}
-		virtual void Next() {
-			iter_->Next();
-		}
-		virtual string key() {
-			return iter_->key().ToString();
-		}
-		virtual string value() {
-			return iter_->value().ToString();
-		}
-		virtual bool valid() {
-			return iter_->Valid();
-		}
+  public:
+    explicit LevelDBCursor(leveldb::Iterator* iter)
+        : iter_(iter) {
+      SeekToFirst();
+    }
+    ~LevelDBCursor() {
+      delete iter_;
+    }
+    virtual void SeekToFirst() {
+      iter_->SeekToFirst();
+    }
+    virtual void Next() {
+      iter_->Next();
+    }
+    virtual string key() {
+      return iter_->key().ToString();
+    }
+    virtual string value() {
+      return iter_->value().ToString();
+    }
+    virtual bool valid() {
+      return iter_->Valid();
+    }
 
-	private:
-		leveldb::Iterator* iter_;
+  private:
+    leveldb::Iterator* iter_;
 };
 
 class LevelDBTransaction: public Transaction {
-	public:
-		explicit LevelDBTransaction(leveldb::DB* db)
-		:
-				db_(db) {
-			CHECK_NOTNULL(db_);
-		}
-		virtual void Put(const string& key, const string& value) {
-			batch_.Put(key, value);
-		}
-		virtual void Commit() {
-			leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_);
-			CHECK(status.ok()) << "Failed to write batch to leveldb "
-					<< std::endl << status.ToString();
-		}
+  public:
+    explicit LevelDBTransaction(leveldb::DB* db)
+        : db_(db) {
+      CHECK_NOTNULL(db_);
+    }
+    virtual void Put(const string& key, const string& value) {
+      batch_.Put(key, value);
+    }
+    virtual void Commit() {
+      leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_);
+      CHECK(status.ok()) << "Failed to write batch to leveldb " << std::endl
+          << status.ToString();
+    }
 
-	private:
-		leveldb::DB* db_;
-		leveldb::WriteBatch batch_;
+  private:
+    leveldb::DB* db_;
+    leveldb::WriteBatch batch_;
 
-		DISABLE_COPY_AND_ASSIGN (LevelDBTransaction);
+    DISABLE_COPY_AND_ASSIGN (LevelDBTransaction);
 };
 
 class LevelDB: public DB {
-	public:
-		LevelDB()
-		:
-				db_(NULL) {
-		}
-		virtual ~LevelDB() {
-			Close();
-		}
-		virtual void Open(const string& source, Mode mode);
-		virtual void Close() {
-			if (db_ != NULL) {
-				delete db_;
-				db_ = NULL;
-			}
-		}
-		virtual LevelDBCursor* NewCursor() {
-			return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
-		}
-		virtual LevelDBTransaction* NewTransaction() {
-			return new LevelDBTransaction(db_);
-		}
+  public:
+    LevelDB()
+        : db_(NULL) {
+    }
+    virtual ~LevelDB() {
+      Close();
+    }
+    virtual void Open(const string& source, Mode mode);
+    virtual void Close() {
+      if (db_ != NULL) {
+        delete db_;
+        db_ = NULL;
+      }
+    }
+    virtual LevelDBCursor* NewCursor() {
+      return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions()));
+    }
+    virtual LevelDBTransaction* NewTransaction() {
+      return new LevelDBTransaction(db_);
+    }
 
-	private:
-		leveldb::DB* db_;
+  private:
+    leveldb::DB* db_;
 };
 
 }  // namespace db
diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp
index 68cbb93a..232b439a 100644
--- a/include/caffe/util/db_lmdb.hpp
+++ b/include/caffe/util/db_lmdb.hpp
@@ -11,96 +11,93 @@ namespace caffe {
 namespace db {
 
 inline void MDB_CHECK(int mdb_status) {
-	CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
+  CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
 }
 
 class LMDBCursor: public Cursor {
-	public:
-		explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor)
-		:
-				mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) {
-			SeekToFirst();
-		}
-		virtual ~LMDBCursor() {
-			mdb_cursor_close(mdb_cursor_);
-			mdb_txn_abort(mdb_txn_);
-		}
-		virtual void SeekToFirst() {
-			Seek (MDB_FIRST);
-		}
-		virtual void Next() {
-			Seek (MDB_NEXT);
-		}
-		virtual string key() {
-			return string(static_cast<const char*>(mdb_key_.mv_data),
-					mdb_key_.mv_size);
-		}
-		virtual string value() {
-			return string(static_cast<const char*>(mdb_value_.mv_data),
-					mdb_value_.mv_size);
-		}
-		virtual bool valid() {
-			return valid_;
-		}
+  public:
+    explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor)
+        : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) {
+      SeekToFirst();
+    }
+    virtual ~LMDBCursor() {
+      mdb_cursor_close(mdb_cursor_);
+      mdb_txn_abort(mdb_txn_);
+    }
+    virtual void SeekToFirst() {
+      Seek (MDB_FIRST);
+    }
+    virtual void Next() {
+      Seek (MDB_NEXT);
+    }
+    virtual string key() {
+      return string(static_cast<const char*>(mdb_key_.mv_data),
+          mdb_key_.mv_size);
+    }
+    virtual string value() {
+      return string(static_cast<const char*>(mdb_value_.mv_data),
+          mdb_value_.mv_size);
+    }
+    virtual bool valid() {
+      return valid_;
+    }
 
-	private:
-		void Seek(MDB_cursor_op op) {
-			int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
-			if (mdb_status == MDB_NOTFOUND) {
-				valid_ = false;
-			} else {
-				MDB_CHECK(mdb_status);
-				valid_ = true;
-			}
-		}
+  private:
+    void Seek(MDB_cursor_op op) {
+      int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
+      if (mdb_status == MDB_NOTFOUND) {
+        valid_ = false;
+      } else {
+        MDB_CHECK(mdb_status);
+        valid_ = true;
+      }
+    }
 
-		MDB_txn* mdb_txn_;
-		MDB_cursor* mdb_cursor_;
-		MDB_val mdb_key_, mdb_value_;
-		bool valid_;
+    MDB_txn* mdb_txn_;
+    MDB_cursor* mdb_cursor_;
+    MDB_val mdb_key_, mdb_value_;
+    bool valid_;
 };
 
 class LMDBTransaction: public Transaction {
-	public:
-		explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn)
-		:
-				mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) {
-		}
-		virtual void Put(const string& key, const string& value);
-		virtual void Commit() {
-			MDB_CHECK(mdb_txn_commit(mdb_txn_));
-		}
+  public:
+    explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn)
+        : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) {
+    }
+    virtual void Put(const string& key, const string& value);
+    virtual void Commit() {
+      MDB_CHECK(mdb_txn_commit(mdb_txn_));
+    }
 
-	private:
-		MDB_dbi* mdb_dbi_;
-		MDB_txn* mdb_txn_;
+  private:
+    MDB_dbi* mdb_dbi_;
+    MDB_txn* mdb_txn_;
 
-		DISABLE_COPY_AND_ASSIGN (LMDBTransaction);
+    DISABLE_COPY_AND_ASSIGN (LMDBTransaction);
 };
 
 class LMDB: public DB {
-	public:
-		LMDB()
-		:
-				mdb_env_(NULL) {
-		}
-		virtual ~LMDB() {
-			Close();
-		}
-		virtual void Open(const string& source, Mode mode);
-		virtual void Close() {
-			if (mdb_env_ != NULL) {
-				mdb_dbi_close(mdb_env_, mdb_dbi_);
-				mdb_env_close(mdb_env_);
-				mdb_env_ = NULL;
-			}
-		}
-		virtual LMDBCursor* NewCursor();
-		virtual LMDBTransaction* NewTransaction();
+  public:
+    LMDB()
+        : mdb_env_(NULL) {
+    }
+    virtual ~LMDB() {
+      Close();
+    }
+    virtual void Open(const string& source, Mode mode);
+    virtual void Close() {
+      if (mdb_env_ != NULL) {
+        mdb_dbi_close(mdb_env_, mdb_dbi_);
+        mdb_env_close(mdb_env_);
+        mdb_env_ = NULL;
+      }
+    }
+    virtual LMDBCursor* NewCursor();
+    virtual LMDBTransaction* NewTransaction();
 
-	private:
-		MDB_env* mdb_env_;
-		MDB_dbi mdb_dbi_;
+  private:
+    MDB_env* mdb_env_;
+    MDB_dbi mdb_dbi_;
 };
 
 }  // namespace db
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index ee7ea10b..f962049d 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -30,83 +30,73 @@
 namespace caffe {
 
 template <typename Dtype>
-void im2col_cpu(const Dtype* data_im, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, Dtype* data_col);
+void im2col_cpu(const Dtype* data_im, const int channels, const int height,
+    const int width, const int kernel_h, const int kernel_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
-void col2im_cpu(const Dtype* data_col, const int channels,
-		const int height, const int width, const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, Dtype* data_im);
+void col2im_cpu(const Dtype* data_col, const int channels, const int height,
+    const int width, const int patch_h, const int patch_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, Dtype* data_im);
 
 template <typename Dtype>
-void col2im_gpu(const Dtype* data_col, const int col_offset,
-		const int height, const int width, const int channels,
-		const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		Dtype* data_im, const int img_offset);
+void col2im_gpu(const Dtype* data_col, const int col_offset, const int height,
+    const int width, const int channels, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    Dtype* data_im, const int img_offset);
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		Dtype* data_col, const int col_offset);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    Dtype* data_col, const int col_offset);
 
 template <typename Dtype>
-void im2col_gpu(const Dtype* data_im, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, Dtype* data_col);
+void im2col_gpu(const Dtype* data_im, const int channels, const int height,
+    const int width, const int kernel_h, const int kernel_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
-void col2im_gpu(const Dtype* data_col, const int channels,
-		const int height, const int width, const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, Dtype* data_im);
+void col2im_gpu(const Dtype* data_col, const int channels, const int height,
+    const int width, const int patch_h, const int patch_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, Dtype* data_im);
 
 template <typename Dtype>
 void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_col, const int col_offset);
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, Dtype* data_col, const int col_offset);
 
 template <typename Dtype>
 void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_col, const int col_offset);
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, Dtype* data_col, const int col_offset);
 
 template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_col, const int col_offset, int optnum);
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, Dtype* data_col, const int col_offset,
+    int optnum);
 
 template <typename Dtype>
 void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
-		const int channels,
-		const int height, const int width, const int psize, const int pad,
-		const int stride, Dtype* data_im, const int img_offset);
+    const int channels, const int height, const int width, const int psize,
+    const int pad, const int stride, Dtype* data_im, const int img_offset);
 
 template <typename Dtype>
 void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_im, const int img_offset, int optnum);
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, Dtype* data_im, const int img_offset,
+    int optnum);
 
 template <typename Dtype>
-void col2im_gpu_ocl(cl_mem data_col, const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_im, cl_kernel Kernel);
+void col2im_gpu_ocl(cl_mem data_col, const int channels, const int height,
+    const int width, const int ksize, const int pad, const int stride,
+    Dtype* data_im, cl_kernel Kernel);
 
 template <typename Dtype>
-void im2col_gpu_ocl(cl_mem data_im, const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_col, cl_kernel Kernel);
+void im2col_gpu_ocl(cl_mem data_im, const int channels, const int height,
+    const int width, const int ksize, const int pad, const int stride,
+    Dtype* data_col, cl_kernel Kernel);
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_IM2COL_HPP_
diff --git a/include/caffe/util/insert_splits.hpp b/include/caffe/util/insert_splits.hpp
index c9a40c54..446abb81 100644
--- a/include/caffe/util/insert_splits.hpp
+++ b/include/caffe/util/insert_splits.hpp
@@ -12,14 +12,14 @@ namespace caffe {
 void InsertSplits(const NetParameter& param, NetParameter* param_split);
 
 void ConfigureSplitLayer(const string& layer_name, const string& blob_name,
-		const int blob_idx, const int split_count, const float loss_weight,
-		LayerParameter* split_layer_param);
+    const int blob_idx, const int split_count, const float loss_weight,
+    LayerParameter* split_layer_param);
 
 string SplitLayerName(const string& layer_name, const string& blob_name,
-		const int blob_idx);
+    const int blob_idx);
 
 string SplitBlobName(const string& layer_name, const string& blob_name,
-		const int blob_idx, const int split_idx);
+    const int blob_idx, const int split_idx);
 
 }  // namespace caffe
 
diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp
index 7bd1d2db..c04cce6a 100644
--- a/include/caffe/util/io.hpp
+++ b/include/caffe/util/io.hpp
@@ -19,118 +19,116 @@ namespace caffe {
 using ::google::protobuf::Message;
 
 inline void MakeTempFilename(string* temp_filename) {
-	temp_filename->clear();
-	*temp_filename = "/tmp/caffe_test.XXXXXX";
-	char* temp_filename_cstr = new char[temp_filename->size() + 1];
-	// NOLINT_NEXT_LINE(runtime/printf)
-	strcpy(temp_filename_cstr, temp_filename->c_str());
-	int fd = mkstemp(temp_filename_cstr);
-	CHECK_GE(fd, 0) << "Failed to open a temporary file at: " << *temp_filename;
-	close(fd);
-	*temp_filename = temp_filename_cstr;
-	delete[] temp_filename_cstr;
+  temp_filename->clear();
+  *temp_filename = "/tmp/caffe_test.XXXXXX";
+  char* temp_filename_cstr = new char[temp_filename->size() + 1];
+  // NOLINT_NEXT_LINE(runtime/printf)
+  strcpy(temp_filename_cstr, temp_filename->c_str());
+  int fd = mkstemp(temp_filename_cstr);
+  CHECK_GE(fd, 0) << "Failed to open a temporary file at: " << *temp_filename;
+  close(fd);
+  *temp_filename = temp_filename_cstr;
+  delete[] temp_filename_cstr;
 }
 
 inline void MakeTempDir(string* temp_dirname) {
-	temp_dirname->clear();
-	*temp_dirname = "/tmp/caffe_test.XXXXXX";
-	char* temp_dirname_cstr = new char[temp_dirname->size() + 1];
-	// NOLINT_NEXT_LINE(runtime/printf)
-	strcpy(temp_dirname_cstr, temp_dirname->c_str());
-	char* mkdtemp_result = mkdtemp(temp_dirname_cstr);
-	CHECK(mkdtemp_result != NULL)
-			<< "Failed to create a temporary directory at: " << *temp_dirname;
-	*temp_dirname = temp_dirname_cstr;
-	delete[] temp_dirname_cstr;
+  temp_dirname->clear();
+  *temp_dirname = "/tmp/caffe_test.XXXXXX";
+  char* temp_dirname_cstr = new char[temp_dirname->size() + 1];
+  // NOLINT_NEXT_LINE(runtime/printf)
+  strcpy(temp_dirname_cstr, temp_dirname->c_str());
+  char* mkdtemp_result = mkdtemp(temp_dirname_cstr);
+  CHECK(mkdtemp_result != NULL) << "Failed to create a temporary directory at: "
+      << *temp_dirname;
+  *temp_dirname = temp_dirname_cstr;
+  delete[] temp_dirname_cstr;
 }
 
 bool ReadProtoFromTextFile(const char* filename, Message* proto);
 
 inline bool ReadProtoFromTextFile(const string& filename, Message* proto) {
-	return ReadProtoFromTextFile(filename.c_str(), proto);
+  return ReadProtoFromTextFile(filename.c_str(), proto);
 }
 
 inline void ReadProtoFromTextFileOrDie(const char* filename, Message* proto) {
-	CHECK(ReadProtoFromTextFile(filename, proto));
+  CHECK(ReadProtoFromTextFile(filename, proto));
 }
 
 inline void ReadProtoFromTextFileOrDie(const string& filename, Message* proto) {
-	ReadProtoFromTextFileOrDie(filename.c_str(), proto);
+  ReadProtoFromTextFileOrDie(filename.c_str(), proto);
 }
 
 void WriteProtoToTextFile(const Message& proto, const char* filename);
 inline void WriteProtoToTextFile(const Message& proto, const string& filename) {
-	WriteProtoToTextFile(proto, filename.c_str());
+  WriteProtoToTextFile(proto, filename.c_str());
 }
 
 bool ReadProtoFromBinaryFile(const char* filename, Message* proto);
 
 inline bool ReadProtoFromBinaryFile(const string& filename, Message* proto) {
-	return ReadProtoFromBinaryFile(filename.c_str(), proto);
+  return ReadProtoFromBinaryFile(filename.c_str(), proto);
 }
 
 inline void ReadProtoFromBinaryFileOrDie(const char* filename, Message* proto) {
-	CHECK(ReadProtoFromBinaryFile(filename, proto));
+  CHECK(ReadProtoFromBinaryFile(filename, proto));
 }
 
 inline void ReadProtoFromBinaryFileOrDie(const string& filename,
-		Message* proto) {
-	ReadProtoFromBinaryFileOrDie(filename.c_str(), proto);
+    Message* proto) {
+  ReadProtoFromBinaryFileOrDie(filename.c_str(), proto);
 }
 
 void WriteProtoToBinaryFile(const Message& proto, const char* filename);
-inline void WriteProtoToBinaryFile(
-		const Message& proto, const string& filename) {
-	WriteProtoToBinaryFile(proto, filename.c_str());
+inline void WriteProtoToBinaryFile(const Message& proto,
+    const string& filename) {
+  WriteProtoToBinaryFile(proto, filename.c_str());
 }
 
 bool ReadFileToDatum(const string& filename, const int label, Datum* datum);
 
 inline bool ReadFileToDatum(const string& filename, Datum* datum) {
-	return ReadFileToDatum(filename, -1, datum);
+  return ReadFileToDatum(filename, -1, datum);
 }
 
-bool ReadImageToDatum(const string& filename, const int label,
-		const int height, const int width, const bool is_color,
-		const std::string & encoding, Datum* datum);
+bool ReadImageToDatum(const string& filename, const int label, const int height,
+    const int width, const bool is_color, const std::string & encoding,
+    Datum* datum);
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-		const int height, const int width, const bool is_color, Datum* datum) {
-	return ReadImageToDatum(filename, label, height, width, is_color,
-			"", datum);
+    const int height, const int width, const bool is_color, Datum* datum) {
+  return ReadImageToDatum(filename, label, height, width, is_color, "", datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-		const int height, const int width, Datum* datum) {
-	return ReadImageToDatum(filename, label, height, width, true, datum);
+    const int height, const int width, Datum* datum) {
+  return ReadImageToDatum(filename, label, height, width, true, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-		const bool is_color, Datum* datum) {
-	return ReadImageToDatum(filename, label, 0, 0, is_color, datum);
+    const bool is_color, Datum* datum) {
+  return ReadImageToDatum(filename, label, 0, 0, is_color, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-		Datum* datum) {
-	return ReadImageToDatum(filename, label, 0, 0, true, datum);
+    Datum* datum) {
+  return ReadImageToDatum(filename, label, 0, 0, true, datum);
 }
 
 inline bool ReadImageToDatum(const string& filename, const int label,
-		const std::string & encoding, Datum* datum) {
-	return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum);
+    const std::string & encoding, Datum* datum) {
+  return ReadImageToDatum(filename, label, 0, 0, true, encoding, datum);
 }
 
 bool DecodeDatumNative(Datum* datum);
 bool DecodeDatum(Datum* datum, bool is_color);
 
-cv::Mat ReadImageToCVMat(const string& filename,
-		const int height, const int width, const bool is_color);
+cv::Mat ReadImageToCVMat(const string& filename, const int height,
+    const int width, const bool is_color);
 
-cv::Mat ReadImageToCVMat(const string& filename,
-		const int height, const int width);
+cv::Mat ReadImageToCVMat(const string& filename, const int height,
+    const int width);
 
-cv::Mat ReadImageToCVMat(const string& filename,
-		const bool is_color);
+cv::Mat ReadImageToCVMat(const string& filename, const bool is_color);
 
 cv::Mat ReadImageToCVMat(const string& filename);
 
@@ -140,18 +138,16 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color);
 void CVMatToDatum(const cv::Mat& cv_img, Datum* datum);
 
 template <typename Dtype>
-void hdf5_load_nd_dataset_helper(
-		hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-		Blob<Dtype>* blob);
+void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_,
+    int min_dim, int max_dim, Blob<Dtype>* blob);
 
 template <typename Dtype>
-void hdf5_load_nd_dataset(
-		hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-		Blob<Dtype>* blob);
+void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, int min_dim,
+    int max_dim, Blob<Dtype>* blob);
 
 template <typename Dtype>
-void hdf5_save_nd_dataset(
-		const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob);
+void hdf5_save_nd_dataset(const hid_t file_id, const string& dataset_name,
+    const Blob<Dtype>& blob);
 
 }  // namespace caffe
 
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 8a36069a..d7c67673 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -41,71 +41,64 @@ namespace caffe {
 // Decaf gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
 template <typename Dtype>
-void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-		Dtype* C);
+void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+    const int M, const int N, const int K, const Dtype alpha, const Dtype* A,
+    const Dtype* B, const Dtype beta, Dtype* C);
 
 // Decaf gpu gemm provides an interface that is almost the same as the cpu
 // gemm function - following the c convention and calling the fortran-order
 // gpu code under the hood.
 template <typename Dtype>
-void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-		Dtype* C);
+void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+    const int M, const int N, const int K, const Dtype alpha, const Dtype* A,
+    const Dtype* B, const Dtype beta, Dtype* C);
 
 template <typename Dtype>
 cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
-		const int offB, const Dtype beta,
-		Dtype* C, const int offC);
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
+    const int offB, const Dtype beta, Dtype* C, const int offC);
 /*This is Yuan Gao's sgemm_ex*/
 template <typename Dtype>
 void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
-		Dtype* C, const int offset1, const int offset2, const int offset3);
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta,
+    Dtype* C, const int offset1, const int offset2, const int offset3);
 
 template <typename Dtype>
 cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
-		const int offB, const Dtype beta,
-		Dtype* C, const int offC);
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const Dtype alpha, const Dtype* A, const int offA, const Dtype* B,
+    const int offB, const Dtype beta, Dtype* C, const int offC);
 
 template <typename Dtype>
 void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
-		const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
-		Dtype* y);
+    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+    Dtype* y);
 
 template <typename Dtype>
-void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const Dtype alpha, const Dtype* A, size_t offA, int lda,
-		const Dtype * x, size_t offx, const Dtype beta, int incx,
-		Dtype* y, size_t offy, int incy);
+void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
+    const Dtype alpha, const Dtype* A, size_t offA, int lda, const Dtype * x,
+    size_t offx, const Dtype beta, int incx, Dtype* y, size_t offy, int incy);
 
 template <typename Dtype>
 void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
-		const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
-		Dtype* y);
+    const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
+    Dtype* y);
 
 template <typename Dtype>
-void caffe_axpy(const int N, const Dtype alpha, const Dtype* X,
-		Dtype* Y);
+void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y);
 
 template <typename Dtype>
-void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X,
-		Dtype* Y);
+void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y);
 
 template <typename Dtype>
 void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X,
-		const Dtype beta, Dtype* Y);
+    const Dtype beta, Dtype* Y);
 
 template <typename Dtype>
 void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X,
-		const Dtype beta, Dtype* Y);
+    const Dtype beta, Dtype* Y);
 
 template <typename Dtype>
 void caffe_copy(const int N, const Dtype *X, Dtype *Y);
@@ -117,14 +110,14 @@ template <typename Dtype>
 void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X);
 
 inline void caffe_memset(const size_t N, const int alpha, void* X) {
-	memset(X, alpha, N);  // NOLINT(caffe/alt_fn)
+  memset(X, alpha, N);  // NOLINT(caffe/alt_fn)
 }
 
 inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
 #ifndef CPU_ONLY
-	ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N);
+  ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N);
 #else
-	NO_GPU;
+  NO_GPU;
 #endif
 }
 
@@ -144,7 +137,7 @@ void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
 template <typename Dtype>
 void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha,
-		Dtype *X);
+    Dtype *X);
 
 template <typename Dtype>
 void caffe_scal(const int N, const Dtype alpha, Dtype *X);
@@ -176,7 +169,7 @@ void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
 template <typename Dtype>
 void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a,
-		const Dtype* b, Dtype* y);
+    const Dtype* b, Dtype* y);
 
 template <typename Dtype>
 void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
@@ -207,11 +200,11 @@ void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r);
 
 template <typename Dtype>
 void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
-		Dtype* r);
+    Dtype* r);
 
 template <typename Dtype>
 void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma,
-		Dtype* r);
+    Dtype* r);
 
 template <typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, int* r);
@@ -236,7 +229,7 @@ int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y);
 
 template <typename Dtype>
 uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x,
-		const Dtype* y);
+    const Dtype* y);
 
 // Returns the sum of the absolute values of the elements of vector x
 template <typename Dtype>
@@ -249,7 +242,7 @@ void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
 // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c
 template <typename Dtype>
 inline char caffe_sign(Dtype val) {
-	return (Dtype(0) < val) - (val < Dtype(0));
+  return (Dtype(0) < val) - (val < Dtype(0));
 }
 
 // The following two macros are modifications of DEFINE_VSL_UNARY_FUNC
@@ -345,7 +338,7 @@ void caffe_log(const int n, const Dtype* a, Dtype* y);
 
 template <typename Dtype>
 Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx,
-		const Dtype* y, const int incy);
+    const Dtype* y, const int incy);
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_MATH_FUNCTIONS_H_
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
index 06262fbf..2ca24374 100644
--- a/include/caffe/util/mkl_alternate.hpp
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -81,16 +81,14 @@ DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
 // in standard blas. We will simply use a two-step (inefficient, of course) way
 // to mimic that.
 inline void cblas_saxpby(const int N, const float alpha, const float* X,
-		const int incX, const float beta, float* Y,
-		const int incY) {
-	cblas_sscal(N, beta, Y, incY);
-	cblas_saxpy(N, alpha, X, incX, Y, incY);
+    const int incX, const float beta, float* Y, const int incY) {
+  cblas_sscal(N, beta, Y, incY);
+  cblas_saxpy(N, alpha, X, incX, Y, incY);
 }
 inline void cblas_daxpby(const int N, const double alpha, const double* X,
-		const int incX, const double beta, double* Y,
-		const int incY) {
-	cblas_dscal(N, beta, Y, incY);
-	cblas_daxpy(N, alpha, X, incX, Y, incY);
+    const int incX, const double beta, double* Y, const int incY) {
+  cblas_dscal(N, beta, Y, incY);
+  cblas_daxpy(N, alpha, X, incX, Y, incY);
 }
 
 #endif  // USE_MKL
diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
index 9febaa04..776fec11 100644
--- a/include/caffe/util/ocl_util.hpp
+++ b/include/caffe/util/ocl_util.hpp
@@ -33,7 +33,7 @@ template <typename Dtype>
 void ocl_memset(Dtype* buffer, const Dtype value, const int count);
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
-		const int count);
+    const int count);
 
 void eventCallback(cl_event event, cl_int event_status, void * user_data);
 }  // namespace caffe
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 869bc83b..25a86090 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -31,189 +31,188 @@ namespace caffe {
 
 typedef unsigned int uint32_t;
 
-template <typename dtype> inline std::string get_dtype_suffix()
-{
-	dtype x;
-	const char type = typeid(x).name()[0];
-	std::string suffix;
-	switch (type) {
-		case 'i':
-			suffix = "_int";
-			break;
-		case 'd':
-			suffix = "_double";
-			break;
-		case 'f':
-			default:
-			suffix = "_float";
-	}
-	return suffix;
+template <typename dtype> inline std::string get_dtype_suffix() {
+  dtype x;
+  const char type = typeid(x).name()[0];
+  std::string suffix;
+  switch (type) {
+  case 'i':
+    suffix = "_int";
+    break;
+  case 'd':
+    suffix = "_double";
+    break;
+  case 'f':
+  default:
+    suffix = "_float";
+  }
+  return suffix;
 }
 
 template <typename Dtype>
 void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_,
-		const int M_, const int packing_num);
+    const int M_, const int packing_num);
 
 template <typename Dtype>
 void opttrans(const Dtype* data_im, const int im_offset, const int channels,
-		const int height, const int width, Dtype* data_opt, const int opt_offset,
-		const int optnum);
+    const int height, const int width, Dtype* data_opt, const int opt_offset,
+    const int optnum);
 
 template <typename Dtype>
 void get_max_gpu(cl_kernel Kernel, const int num, const int dim,
-		const Dtype* bottom_data, Dtype* scale_data);
+    const Dtype* bottom_data, Dtype* scale_data);
 
 template <typename Dtype>
 void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out);
 
 template <typename Dtype>
 void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim,
-		const Dtype* scale, Dtype* data);
+    const Dtype* scale, Dtype* data);
 
 template <typename Dtype>
 Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim,
-		const Dtype* prob_data, const Dtype* label, cl_mem d_loss);
+    const Dtype* prob_data, const Dtype* label, cl_mem d_loss);
 
 template <typename Dtype>
 void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data);
 
 template <typename Dtype>
 void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data,
-		const Dtype* label);
+    const Dtype* label);
 
 template <typename Dtype>
 void max_pool_fp_gpu(cl_kernel Kernel, const int count,
-		const Dtype* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		Dtype* top_data);
+    const Dtype* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    Dtype* top_data);
 
 template <typename Dtype>
 void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum,
-		const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
-		Dtype* top_mask);
+    const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
+    Dtype* top_mask);
 
 template <typename Dtype>
 void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
-		const int* const mask, const Dtype* const top_mask, const int num,
-		const int channels, const int height, const int width,
-		const int pooled_height, const int pooled_width, const int kernel_h,
-		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-		const int pad_w, Dtype* const bottom_diff);
+    const int* const mask, const Dtype* const top_mask, const int num,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, Dtype* const bottom_diff);
 
 template <typename Dtype>
 void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
-		const int num, const int channels, const int height, const int width,
-		const int pooled_height, const int pooled_width, const int kernel_h,
-		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-		const int pad_w, Dtype* const bottom_diff);
+    const int num, const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, Dtype* const bottom_diff);
 
 template <typename Dtype>
 void StoPoolBackward(const int nthreads, const Dtype* const rand_idx,
-		const Dtype* const top_diff, const int num, const int channels,
-		const int height, const int width, const int pooled_height,
-		const int pooled_width, const int kernel_h, const int kernel_w,
-		const int stride_h, const int stride_w, Dtype* const bottom_diff);
+    const Dtype* const top_diff, const int num, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, Dtype* const bottom_diff);
 template <typename Dtype>
 void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data);
 
 template <typename Dtype>
 void SigmoidBackward(const int count, const Dtype* top_diff,
-		const Dtype* top_data, Dtype* bottom_diff);
+    const Dtype* top_data, Dtype* bottom_diff);
 
 template <typename Dtype>
 void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data);
 
 template <typename Dtype>
 void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
-		Dtype* bottom_diff);
+    Dtype* bottom_diff);
 
 template <typename Dtype>
 void ThresholdForward(const int count, const Dtype threshold,
-		const Dtype* bottom_data, Dtype* top_data);
+    const Dtype* bottom_data, Dtype* top_data);
 
 template <typename Dtype>
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
-		const Dtype* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		const int pad_, Dtype* top_data);
+    const Dtype* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    const int pad_, Dtype* top_data);
 
 template <typename Dtype>
 void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
-		const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		const int pad_h_, const int pad_w_, Dtype* top_data);
+    const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, Dtype* top_data);
 
 template <typename Dtype>
 void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		Dtype* idx_data, Dtype* top_data);
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    Dtype* idx_data, Dtype* top_data);
 
 template <typename Dtype>
 void StoPoolForwardTest(const int count, const Dtype* bottom_data,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		Dtype* top_data);
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    Dtype* top_data);
 
 template <typename Dtype>
 void max_pool_bp_gpu(cl_kernel Kernel, const int count,
-		const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_size_,
-		const int stride_, Dtype* bottom_diff);
+    const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_size_,
+    const int stride_, Dtype* bottom_diff);
 
 template <typename Dtype>
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
-		const int clnum, const int channels_, const int intheight_,
-		const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_size_,
-		const int stride_, const int pad_, Dtype* bottom_diff);
+    const int clnum, const int channels_, const int intheight_,
+    const int width_, const int pooled_height_, const int pooled_width_,
+    const int kernel_size_, const int stride_, const int pad_,
+    Dtype* bottom_diff);
 
 template <typename Dtype>
 void PReLUForward(const int count, const int channels, const int dim,
-		const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
-		const int div_factor);
+    const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
+    const int div_factor);
 
 template <typename Dtype>
 void PReLUBackward(const int count, const int channels, const int dim,
-		const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
-		const Dtype* slope_data, const int div_factor);
+    const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
+    const Dtype* slope_data, const int div_factor);
 
 template <typename Dtype>
 void PReLUParamBackward(const int count, const Dtype* top_diff,
-		const int offset_out, const Dtype* bottom_data, const int offset_in,
-		Dtype* bottom_diff);
+    const int offset_out, const Dtype* bottom_data, const int offset_in,
+    Dtype* bottom_diff);
 
 template <typename Dtype>
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
-		Dtype negative_slope);
+    Dtype negative_slope);
 
 template <typename Dtype>
 void ReLUBackward(const int count, const Dtype* top_diff,
-		const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
+    const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
 
 template <typename Dtype>
 void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y);
 
 template <typename Dtype>
 void DropoutForward(const int count, const Dtype* bottom_data,
-		const int* MaskMem, const Dtype scale_, Dtype *top_data);
+    const int* MaskMem, const Dtype scale_, Dtype *top_data);
 
 template <typename Dtype>
 void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
-		const float threshold_, const Dtype scale_, Dtype* bottom_diff);
+    const float threshold_, const Dtype scale_, Dtype* bottom_diff);
 
 template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
-		Dtype threshold);
+    Dtype threshold);
 
 template <typename Dtype>
 void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup);
@@ -222,23 +221,22 @@ template <typename Dtype>
 void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V);
 
 template <typename Dtype>
-void caffe_gpu_abs_ocl(const int N,  const Dtype* X, Dtype * Y );
+void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y);
 
 template <typename Dtype>
 void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y);
 
 template <typename Dtype>
 void kernel_channel_max(const int num, const int channels,
-		const int spatial_dim, const Dtype* data, Dtype* out);
+    const int spatial_dim, const Dtype* data, Dtype* out);
 
 template <typename Dtype>
-void kernel_channel_subtract(const int count,
-		const int num, const int channels,
-		const int spatial_dim, const Dtype* channel_max, Dtype* data);
+void kernel_channel_subtract(const int count, const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_max, Dtype* data);
 
 template <typename Dtype>
 void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
-		Dtype* out);
+    Dtype* out);
 
 template <typename Dtype>
 void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out);
@@ -263,29 +261,28 @@ void kernel_exp(const int count, const Dtype* data, Dtype* out);
 
 template <typename Dtype>
 void kernel_channel_sum(const int num, const int channels,
-		const int spatial_dim, const Dtype* data, Dtype* channel_sum);
+    const int spatial_dim, const Dtype* data, Dtype* channel_sum);
 
 template <typename Dtype>
 void kernel_channel_div(const int count, const int num, const int channels,
-		const int spatial_dim, const Dtype* channel_sum, Dtype* data);
+    const int spatial_dim, const Dtype* channel_sum, Dtype* data);
 
 template <typename Dtype>
 void kernel_channel_dot(const int num, const int channels,
-		const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
-		Dtype* channel_dot);
+    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+    Dtype* channel_dot);
 
 template <typename Dtype>
-void SoftmaxLossForwardGPU(const int nthreads,
-		const Dtype* prob_data, const Dtype* label, Dtype* loss,
-		const int num, const int dim, const int spatial_dim,
-		const bool has_ignore_label_, const int ignore_label_,
-		Dtype* counts);
+void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data,
+    const Dtype* label, Dtype* loss, const int num, const int dim,
+    const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, Dtype* counts);
 
 template <typename Dtype>
 void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
-		const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
-		const int spatial_dim, const bool has_ignore_label_,
-		const int ignore_label_, Dtype* counts);
+    const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+    const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, Dtype* counts);
 
 template <typename Dtype>
 void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
@@ -294,22 +291,21 @@ template <typename Dtype>
 void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data);
 
 template <typename Dtype>
-void LRNFillScale(const int nthreads, const Dtype* const in,
-		const int num, const int channels, const int height,
-		const int width, const int size, const Dtype alpha_over_size,
-		const Dtype k, Dtype* const scale);
+void LRNFillScale(const int nthreads, const Dtype* const in, const int num,
+    const int channels, const int height, const int width, const int size,
+    const Dtype alpha_over_size, const Dtype k, Dtype* const scale);
 
 template <typename Dtype>
-void LRNComputeOutput(int nthreads, const Dtype* in,
-		Dtype* scale, Dtype negative_beta, Dtype* out);
+void LRNComputeOutput(int nthreads, const Dtype* in, Dtype* scale,
+    Dtype negative_beta, Dtype* out);
 
 template <typename Dtype>
-void LRNComputeDiff(const int nthreads,
-		const Dtype* const bottom_data, const Dtype* const top_data,
-		const Dtype* const scale, const Dtype* const top_diff,
-		const int num, const int channels, const int height,
-		const int width, const int size, const Dtype negative_beta,
-		const Dtype cache_ratio, Dtype* const bottom_diff);
+void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data,
+    const Dtype* const top_data, const Dtype* const scale,
+    const Dtype* const top_diff, const int num, const int channels,
+    const int height, const int width, const int size,
+    const Dtype negative_beta, const Dtype cache_ratio,
+    Dtype* const bottom_diff);
 template <typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);
 
@@ -321,28 +317,26 @@ void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
 
 template <typename Dtype>
 void BNLLBackward(const int count, const Dtype* top_diff,
-		const Dtype* bottom_data, Dtype *bottom_diff);
+    const Dtype* bottom_data, Dtype *bottom_diff);
 
 template <typename Dtype>
 void Concat(const int nthreads, const Dtype* in_data, const bool forward,
-		const int num_concats, const int concat_size,
-		const int top_concat_axis, const int bottom_concat_axis,
-		const int offset_concat_axis, Dtype *out_data);
+    const int num_concats, const int concat_size, const int top_concat_axis,
+    const int bottom_concat_axis, const int offset_concat_axis,
+    Dtype *out_data);
 
 template <typename Dtype>
-void CLLBackward(const int count, const int channels,
-		const Dtype margin, const bool legacy_version, const Dtype alpha,
-		const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
-		Dtype *bottom_diff);
+void CLLBackward(const int count, const int channels, const Dtype margin,
+    const bool legacy_version, const Dtype alpha, const Dtype* y,
+    const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff);
 
 template <typename Dtype>
 void MaxForward(const int nthreads, const Dtype* bottom_data_a,
-		const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
-		int* mask);
+    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, int* mask);
 
 template <typename Dtype>
-void MaxBackward(const int nthreads, const Dtype* top_diff,
-		const int blob_idx, const int* mask, Dtype* bottom_diff);
+void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx,
+    const int* mask, Dtype* bottom_diff);
 }
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
 // namespace caffe
diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp
index 7688e16a..febd932d 100644
--- a/include/caffe/util/rng.hpp
+++ b/include/caffe/util/rng.hpp
@@ -14,30 +14,29 @@ namespace caffe {
 typedef boost::mt19937 rng_t;
 
 inline rng_t* caffe_rng() {
-	return static_cast<caffe::rng_t*>(Caffe::rng_stream().generator());
+  return static_cast<caffe::rng_t*>(Caffe::rng_stream().generator());
 }
 
 // Fisher–Yates algorithm
 template <class RandomAccessIterator, class RandomGenerator>
 inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end,
-		RandomGenerator* gen) {
-	typedef typename std::iterator_traits<RandomAccessIterator>::difference_type
-	difference_type;
-	typedef typename boost::uniform_int<difference_type> dist_type;
-
-	difference_type length = std::distance(begin, end);
-	if (length <= 0)
-		return;
-
-	for (difference_type i = length - 1; i > 0; --i) {
-		dist_type dist(0, i);
-		std::iter_swap(begin + i, begin + dist(*gen));
-	}
+    RandomGenerator* gen) {
+  typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
+  typedef typename boost::uniform_int<difference_type> dist_type;
+
+  difference_type length = std::distance(begin, end);
+  if (length <= 0)
+    return;
+
+  for (difference_type i = length - 1; i > 0; --i) {
+    dist_type dist(0, i);
+    std::iter_swap(begin + i, begin + dist(*gen));
+  }
 }
 
 template <class RandomAccessIterator>
 inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end) {
-	shuffle(begin, end, caffe_rng());
+  shuffle(begin, end, caffe_rng());
 }
 }  // namespace caffe
 
diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp
index 2dc3cceb..496ba1e0 100644
--- a/include/caffe/util/upgrade_proto.hpp
+++ b/include/caffe/util/upgrade_proto.hpp
@@ -23,11 +23,11 @@ bool UpgradeV0Net(const NetParameter& v0_net_param, NetParameter* net_param);
 // taking its top blob as input.
 // Error if any of these above layers are not-conv layers.
 void UpgradeV0PaddingLayers(const NetParameter& param,
-		NetParameter* param_upgraded_pad);
+    NetParameter* param_upgraded_pad);
 
 // Upgrade a single V0LayerConnection to the V1LayerParameter format.
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-		V1LayerParameter* layer_param);
+    V1LayerParameter* layer_param);
 
 V1LayerParameter_LayerType UpgradeV0LayerType(const string& type);
 
@@ -46,7 +46,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param);
 bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param);
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-		LayerParameter* layer_param);
+    LayerParameter* layer_param);
 
 const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type);
 
@@ -55,9 +55,9 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param);
 
 // Read parameters from a file into a NetParameter proto message.
 void ReadNetParamsFromTextFileOrDie(const string& param_file,
-		NetParameter* param);
+    NetParameter* param);
 void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-		NetParameter* param);
+    NetParameter* param);
 
 }  // namespace caffe
 
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 0c954fa2..bc6cd5de 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -22,149 +22,146 @@ namespace caffe {
  */
 template <typename Dtype>
 class BaseConvolutionLayer: public Layer<Dtype> {
-	public:
-		explicit BaseConvolutionLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual ~BaseConvolutionLayer();
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline int MinBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-		virtual inline bool EqualNumBottomTopBlobs() const {
-			return true;
-		}
-
-	protected:
-		// Helper functions that abstract away the column buffer and gemm arguments.
-		// The last argument in forward_cpu_gemm is so that we can skip the im2col if
-		// we just called weight_cpu_gemm with the same input.
-		void forward_cpu_gemm(const Dtype* input, const Dtype* weights,
-				Dtype* output, bool skip_im2col = false);
-		void forward_cpu_bias(Dtype* output, const Dtype* bias);
-		void backward_cpu_gemm(const Dtype* input, const Dtype* weights,
-				Dtype* output);
-		void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
-				weights);
-		void backward_cpu_bias(Dtype* bias, const Dtype* input);
-		//opencl related setup
-		void ocl_setup();
+  public:
+    explicit BaseConvolutionLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual ~BaseConvolutionLayer();
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline int MinBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+    virtual inline bool EqualNumBottomTopBlobs() const {
+      return true;
+    }
+
+  protected:
+    // Helper functions that abstract away the column buffer and gemm arguments.
+    // The last argument in forward_cpu_gemm is so that we can skip the im2col if
+    // we just called weight_cpu_gemm with the same input.
+    void forward_cpu_gemm(const Dtype* input, const Dtype* weights,
+        Dtype* output, bool skip_im2col = false);
+    void forward_cpu_bias(Dtype* output, const Dtype* bias);
+    void backward_cpu_gemm(const Dtype* input, const Dtype* weights,
+        Dtype* output);
+    void weight_cpu_gemm(const Dtype* input, const Dtype* output,
+        Dtype* weights);
+    void backward_cpu_bias(Dtype* bias, const Dtype* input);
+    //opencl related setup
+    void ocl_setup();
 
 #ifndef CPU_ONLY
-		void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
-				Dtype* output, bool skip_im2col = false);
-		void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights,
-				Dtype* output, bool skip_im2col = false);
-		void forward_gpu_bias(Dtype* output, const Dtype* bias);
-		void forward_gpu_bias_opt(Dtype* output, const Dtype* bias);
-		void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
-				Dtype* col_output);
-		void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights,
-				Dtype* col_output);
-		void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
-				weights);
-		void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, Dtype*
-				weights);
-		void backward_gpu_bias(Dtype* bias, const Dtype* input);
-		#endif
-
-		// reverse_dimensions should return true iff we are implementing deconv, so
-		// that conv helpers know which dimensions are which.
-		virtual bool reverse_dimensions() = 0;
-		// Compute height_out_ and width_out_ from other parameters.
-		virtual void compute_output_shape() = 0;
-
-		int kernel_h_, kernel_w_;
-		int stride_h_, stride_w_;
-		int num_;
-		int channels_;
-		int pad_h_, pad_w_;
-		int height_, width_;
-		int group_;
-		int num_output_;
-		int height_out_, width_out_;
-		bool bias_term_;
-		bool is_1x1_;
-
-	private:
-		// wrap im2col/col2im so we don't have to remember the (long) argument lists
-		inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
-			im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
-					kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
-		}
-		inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
-			col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
-					kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
-		}
+    void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
+        Dtype* output, bool skip_im2col = false);
+    void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights,
+        Dtype* output, bool skip_im2col = false);
+    void forward_gpu_bias(Dtype* output, const Dtype* bias);
+    void forward_gpu_bias_opt(Dtype* output, const Dtype* bias);
+    void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
+        Dtype* col_output);
+    void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights,
+        Dtype* col_output);
+    void weight_gpu_gemm(const Dtype* col_input, const Dtype* output,
+        Dtype* weights);
+    void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output,
+        Dtype* weights);
+    void backward_gpu_bias(Dtype* bias, const Dtype* input);
+#endif
+
+    // reverse_dimensions should return true iff we are implementing deconv, so
+    // that conv helpers know which dimensions are which.
+    virtual bool reverse_dimensions() = 0;
+    // Compute height_out_ and width_out_ from other parameters.
+    virtual void compute_output_shape() = 0;
+
+    int kernel_h_, kernel_w_;
+    int stride_h_, stride_w_;
+    int num_;
+    int channels_;
+    int pad_h_, pad_w_;
+    int height_, width_;
+    int group_;
+    int num_output_;
+    int height_out_, width_out_;
+    bool bias_term_;
+    bool is_1x1_;
+
+  private:
+    // wrap im2col/col2im so we don't have to remember the (long) argument lists
+    inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
+      im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
+          kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
+    }
+    inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
+      col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
+          kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
+    }
 #ifndef CPU_ONLY
-		inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
-			im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_,
-					conv_in_width_,
-					kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff,
-					0);
-		}
-		inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
-			col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_,
-					conv_in_width_,
-					kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data,
-					bottom_offset_);
-		}
-	protected:
-		inline void conv_im2col_gpu_opt(const Dtype* data) {
-			im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_,
-					conv_in_width_,
-					kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0, opt_num2);
-		}
-		inline void conv_col2im_gpu_opt(Dtype* data) {
-			col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_,
-					conv_in_width_,
-					kernel_h_, pad_h_, stride_w_, data, bottom_offset_, opt_num2);
-		}
-	private:
-		inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
-			transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_,
-					M_ * opt_num2, opt_num2);
-		}
-		inline void conv_transpose_gpu(const Dtype* data) {
-			opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
-					opt_num2);
-		}
-	protected:
-		inline void gpu_memset(Dtype* data, Dtype value, int count) {
-			ocl_memset(data, value, count);
-		}
+    inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
+      im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_,
+          conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
+          stride_w_, col_buff, 0);
+    }
+    inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
+      col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_,
+          conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
+          stride_w_, data, bottom_offset_);
+    }
+  protected:
+    inline void conv_im2col_gpu_opt(const Dtype* data) {
+      im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_,
+          conv_in_width_, kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0,
+          opt_num2);
+    }
+    inline void conv_col2im_gpu_opt(Dtype* data) {
+      col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_,
+          conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_,
+          opt_num2);
+    }
+  private:
+    inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) {
+      transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_,
+          M_ * opt_num2, opt_num2);
+    }
+    inline void conv_transpose_gpu(const Dtype* data) {
+      opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
+          opt_num2);
+    }
+  protected:
+    inline void gpu_memset(Dtype* data, Dtype value, int count) {
+      ocl_memset(data, value, count);
+    }
 #endif
 
-	private:
-		int conv_out_channels_;
-		int conv_in_channels_;
-		int conv_out_spatial_dim_;
-		int conv_in_height_;
-		int conv_in_width_;
-		int kernel_dim_;
+  private:
+    int conv_out_channels_;
+    int conv_in_channels_;
+    int conv_out_spatial_dim_;
+    int conv_in_height_;
+    int conv_in_width_;
+    int kernel_dim_;
 
-		Blob<Dtype> col_buffer_;
-		Blob<Dtype> bias_multiplier_;
+    Blob<Dtype> col_buffer_;
+    Blob<Dtype> bias_multiplier_;
 
 //opencl related data structures
-	protected:
-		int opt_num2;
-		int M_, N_, K_;
-		int weight_offset_;
-		int col_offset_;
-		int output_offset_;
-		int top_offset_, top_offset_opt, bottom_offset_;
-		public:
-		static cl_mem subTopMem, transMem;
-		static size_t subtop_mem_size, trans_mem_size;
+  protected:
+    int opt_num2;
+    int M_, N_, K_;
+    int weight_offset_;
+    int col_offset_;
+    int output_offset_;
+    int top_offset_, top_offset_opt, bottom_offset_;
+  public:
+    static cl_mem subTopMem, transMem;
+    static size_t subtop_mem_size, trans_mem_size;
 };
 
 /**
@@ -185,66 +182,65 @@ class BaseConvolutionLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
-	public:
-		/**
-		 * @param param provides ConvolutionParameter convolution_param,
-		 *    with ConvolutionLayer options:
-		 *  - num_output. The number of filters.
-		 *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
-		 *  kernel_size for square filters or kernel_h and kernel_w for rectangular
-		 *  filters.
-		 *  - stride / stride_h / stride_w (\b optional, default 1). The filter
-		 *  stride, given by stride_size for equal dimensions or stride_h and stride_w
-		 *  for different strides. By default the convolution is dense with stride 1.
-		 *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
-		 *  convolution, given by pad for equal dimensions or pad_h and pad_w for
-		 *  different padding. Input padding is computed implicitly instead of
-		 *  actually padding.
-		 *  - group (\b optional, default 1). The number of filter groups. Group
-		 *  convolution is a method for reducing parameterization by selectively
-		 *  connecting input and output channels. The input and output channel dimensions must be divisible
-		 *  by the number of groups. For group @f$ \geq 1 @f$, the
-		 *  convolutional filters' input and output channels are separated s.t. each
-		 *  group takes 1 / group of the input channels and makes 1 / group of the
-		 *  output channels. Concretely 4 input channels, 8 output channels, and
-		 *  2 groups separate input channels 1-2 and output channels 1-4 into the
-		 *  first group and input channels 3-4 and output channels 5-8 into the second
-		 *  group.
-		 *  - bias_term (\b optional, default true). Whether to have a bias.
-		 *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
-		 *    kernels + stream parallelism) engines.
-		 */
-		explicit ConvolutionLayer(const LayerParameter& param)
-		:
-				BaseConvolutionLayer<Dtype>(param) {
-		}
-
-		virtual inline const char* type() const {
-			return "Convolution";
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual inline bool reverse_dimensions() {
-			return false;
-		}
-		virtual void compute_output_shape();
-
-		virtual void Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  public:
+    /**
+     * @param param provides ConvolutionParameter convolution_param,
+     *    with ConvolutionLayer options:
+     *  - num_output. The number of filters.
+     *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
+     *  kernel_size for square filters or kernel_h and kernel_w for rectangular
+     *  filters.
+     *  - stride / stride_h / stride_w (\b optional, default 1). The filter
+     *  stride, given by stride_size for equal dimensions or stride_h and stride_w
+     *  for different strides. By default the convolution is dense with stride 1.
+     *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
+     *  convolution, given by pad for equal dimensions or pad_h and pad_w for
+     *  different padding. Input padding is computed implicitly instead of
+     *  actually padding.
+     *  - group (\b optional, default 1). The number of filter groups. Group
+     *  convolution is a method for reducing parameterization by selectively
+     *  connecting input and output channels. The input and output channel dimensions must be divisible
+     *  by the number of groups. For group @f$ \geq 1 @f$, the
+     *  convolutional filters' input and output channels are separated s.t. each
+     *  group takes 1 / group of the input channels and makes 1 / group of the
+     *  output channels. Concretely 4 input channels, 8 output channels, and
+     *  2 groups separate input channels 1-2 and output channels 1-4 into the
+     *  first group and input channels 3-4 and output channels 5-8 into the second
+     *  group.
+     *  - bias_term (\b optional, default true). Whether to have a bias.
+     *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
+     *    kernels + stream parallelism) engines.
+     */
+    explicit ConvolutionLayer(const LayerParameter& param)
+        : BaseConvolutionLayer<Dtype>(param) {
+    }
+
+    virtual inline const char* type() const {
+      return "Convolution";
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual inline bool reverse_dimensions() {
+      return false;
+    }
+    virtual void compute_output_shape();
+
+    virtual void Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 };
 
 /**
@@ -263,29 +259,28 @@ class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
  */
 template <typename Dtype>
 class DeconvolutionLayer: public BaseConvolutionLayer<Dtype> {
-	public:
-		explicit DeconvolutionLayer(const LayerParameter& param)
-		:
-				BaseConvolutionLayer<Dtype>(param) {
-		}
-
-		virtual inline const char* type() const {
-			return "Deconvolution";
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual inline bool reverse_dimensions() {
-			return true;
-		}
-		virtual void compute_output_shape();
+  public:
+    explicit DeconvolutionLayer(const LayerParameter& param)
+        : BaseConvolutionLayer<Dtype>(param) {
+    }
+
+    virtual inline const char* type() const {
+      return "Deconvolution";
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual inline bool reverse_dimensions() {
+      return true;
+    }
+    virtual void compute_output_shape();
 };
 
 #ifdef USE_CUDNN
@@ -305,31 +300,31 @@ class DeconvolutionLayer: public BaseConvolutionLayer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
-	public:
-	explicit CuDNNConvolutionLayer(const LayerParameter& param)
-	: ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
-	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual ~CuDNNConvolutionLayer();
-
-	protected:
-	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-	bool handles_setup_;
-	cudnnHandle_t* handle_;
-	cudaStream_t* stream_;
-	vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
-	cudnnTensorDescriptor_t bias_desc_;
-	cudnnFilterDescriptor_t filter_desc_;
-	vector<cudnnConvolutionDescriptor_t> conv_descs_;
-	int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
-	size_t workspaceSizeInBytes;
-	void *workspace;
+  public:
+  explicit CuDNNConvolutionLayer(const LayerParameter& param)
+  : ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNConvolutionLayer();
+
+  protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t* handle_;
+  cudaStream_t* stream_;
+  vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
+  cudnnTensorDescriptor_t bias_desc_;
+  cudnnFilterDescriptor_t filter_desc_;
+  vector<cudnnConvolutionDescriptor_t> conv_descs_;
+  int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
+  size_t workspaceSizeInBytes;
+  void *workspace;
 };
 #endif
 
@@ -342,41 +337,40 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
  */
 template <typename Dtype>
 class Im2colLayer: public Layer<Dtype> {
-	public:
-		explicit Im2colLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Im2col";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int kernel_h_, kernel_w_;
-		int stride_h_, stride_w_;
-		int channels_;
-		int height_, width_;
-		int pad_h_, pad_w_;
+  public:
+    explicit Im2colLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Im2col";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    int kernel_h_, kernel_w_;
+    int stride_h_, stride_w_;
+    int channels_;
+    int height_, width_;
+    int pad_h_, pad_w_;
 };
 
 // Forward declare PoolingLayer and SplitLayer for use in LRNLayer.
@@ -390,80 +384,79 @@ template <typename Dtype> class SplitLayer;
  */
 template <typename Dtype>
 class LRNLayer: public Layer<Dtype> {
-	public:
-		explicit LRNLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "LRN";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int ExactNumTopBlobs() const {
-			return 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int size_;
-		int pre_pad_;
-		Dtype alpha_;
-		Dtype beta_;
-		Dtype k_;
-		int num_;
-		int channels_;
-		int height_;
-		int width_;
-
-		// Fields used for normalization ACROSS_CHANNELS
-		// scale_ stores the intermediate summing results
-		Blob<Dtype> scale_;
-
-		// Fields used for normalization WITHIN_CHANNEL
-		shared_ptr<SplitLayer<Dtype> > split_layer_;
-		vector<Blob<Dtype>*> split_top_vec_;
-		shared_ptr<PowerLayer<Dtype> > square_layer_;
-		Blob<Dtype> square_input_;
-		Blob<Dtype> square_output_;
-		vector<Blob<Dtype>*> square_bottom_vec_;
-		vector<Blob<Dtype>*> square_top_vec_;
-		shared_ptr<PoolingLayer<Dtype> > pool_layer_;
-		Blob<Dtype> pool_output_;
-		vector<Blob<Dtype>*> pool_top_vec_;
-		shared_ptr<PowerLayer<Dtype> > power_layer_;
-		Blob<Dtype> power_output_;
-		vector<Blob<Dtype>*> power_top_vec_;
-		shared_ptr<EltwiseLayer<Dtype> > product_layer_;
-		Blob<Dtype> product_input_;
-		vector<Blob<Dtype>*> product_bottom_vec_;
+  public:
+    explicit LRNLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "LRN";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int ExactNumTopBlobs() const {
+      return 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    int size_;
+    int pre_pad_;
+    Dtype alpha_;
+    Dtype beta_;
+    Dtype k_;
+    int num_;
+    int channels_;
+    int height_;
+    int width_;
+
+    // Fields used for normalization ACROSS_CHANNELS
+    // scale_ stores the intermediate summing results
+    Blob<Dtype> scale_;
+
+    // Fields used for normalization WITHIN_CHANNEL
+    shared_ptr<SplitLayer<Dtype> > split_layer_;
+    vector<Blob<Dtype>*> split_top_vec_;
+    shared_ptr<PowerLayer<Dtype> > square_layer_;
+    Blob<Dtype> square_input_;
+    Blob<Dtype> square_output_;
+    vector<Blob<Dtype>*> square_bottom_vec_;
+    vector<Blob<Dtype>*> square_top_vec_;
+    shared_ptr<PoolingLayer<Dtype> > pool_layer_;
+    Blob<Dtype> pool_output_;
+    vector<Blob<Dtype>*> pool_top_vec_;
+    shared_ptr<PowerLayer<Dtype> > power_layer_;
+    Blob<Dtype> power_output_;
+    vector<Blob<Dtype>*> power_top_vec_;
+    shared_ptr<EltwiseLayer<Dtype> > product_layer_;
+    Blob<Dtype> product_input_;
+    vector<Blob<Dtype>*> product_bottom_vec_;
 
 };
 
@@ -474,51 +467,51 @@ class LRNLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class PoolingLayer: public Layer<Dtype> {
-	public:
-		explicit PoolingLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "Pooling";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-		// MAX POOL layers can output an extra top blob for the mask;
-		// others can only output the pooled inputs.
-		virtual inline int MaxTopBlobs() const {
-			return (this->layer_param_.pooling_param().pool() ==
-					PoolingParameter_PoolMethod_MAX) ? 2 : 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-		int kernel_h_, kernel_w_;
-		int stride_h_, stride_w_;
-		int pad_h_, pad_w_;
-		int channels_;
-		int height_, width_;
-		int pooled_height_, pooled_width_;
-		bool global_pooling_;
-		Blob<Dtype> rand_idx_;
-		Blob<int> max_idx_;
+  public:
+    explicit PoolingLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "Pooling";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+    // MAX POOL layers can output an extra top blob for the mask;
+    // others can only output the pooled inputs.
+    virtual inline int MaxTopBlobs() const {
+      return
+          (this->layer_param_.pooling_param().pool()
+              == PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+    int kernel_h_, kernel_w_;
+    int stride_h_, stride_w_;
+    int pad_h_, pad_w_;
+    int channels_;
+    int height_, width_;
+    int pooled_height_, pooled_width_;
+    bool global_pooling_;
+    Blob<Dtype> rand_idx_;
+    Blob<int> max_idx_;
 
 };
 
@@ -529,29 +522,29 @@ class PoolingLayer: public Layer<Dtype> {
  */
 template <typename Dtype>
 class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
-	public:
-	explicit CuDNNPoolingLayer(const LayerParameter& param)
-	: PoolingLayer<Dtype>(param), handles_setup_(false) {}
-	virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual ~CuDNNPoolingLayer();
-	// Currently, cuDNN does not support the extra top blob.
-	virtual inline int MinTopBlobs() const {return -1;}
-	virtual inline int ExactNumTopBlobs() const {return 1;}
-
-	protected:
-	virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-			const vector<Blob<Dtype>*>& top);
-	virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
-			const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-
-	bool handles_setup_;
-	cudnnHandle_t handle_;
-	cudnnTensorDescriptor_t bottom_desc_, top_desc_;
-	cudnnPoolingDescriptor_t pooling_desc_;
-	cudnnPoolingMode_t mode_;
+  public:
+  explicit CuDNNPoolingLayer(const LayerParameter& param)
+  : PoolingLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNPoolingLayer();
+  // Currently, cuDNN does not support the extra top blob.
+  virtual inline int MinTopBlobs() const {return -1;}
+  virtual inline int ExactNumTopBlobs() const {return 1;}
+
+  protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t handle_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+  cudnnPoolingDescriptor_t pooling_desc_;
+  cudnnPoolingMode_t mode_;
 };
 #endif
 
@@ -563,70 +556,70 @@ class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
  */
 template <typename Dtype>
 class SPPLayer: public Layer<Dtype> {
-	public:
-		explicit SPPLayer(const LayerParameter& param)
-		:
-				Layer<Dtype>(param) {
-		}
-		virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-
-		virtual inline const char* type() const {
-			return "SPP";
-		}
-		virtual inline int ExactNumBottomBlobs() const {
-			return 1;
-		}
-		virtual inline int MinTopBlobs() const {
-			return 1;
-		}
-		// MAX POOL layers can output an extra top blob for the mask;
-		// others can only output the pooled inputs.
-		virtual inline int MaxTopBlobs() const {
-			return (this->layer_param_.pooling_param().pool() ==
-					PoolingParameter_PoolMethod_MAX) ? 2 : 1;
-		}
-
-	protected:
-		virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-				const vector<Blob<Dtype>*>& top);
-		virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-				const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-		// calculates the kernel and stride dimensions for the pooling layer,
-		// returns a correctly configured LayerParameter for a PoolingLayer
-		virtual LayerParameter GetPoolingParam(const int pyramid_level,
-				const int bottom_h, const int bottom_w, const SPPParameter spp_param);
-
-		int pyramid_height_;
-		int bottom_h_, bottom_w_;
-		int channels_;
-		int kernel_h_, kernel_w_;
-		int pad_h_, pad_w_;
-
-		/// the internal Split layer that feeds the pooling layers
-		shared_ptr<SplitLayer<Dtype> > split_layer_;
-		/// top vector holder used in call to the underlying SplitLayer::Forward
-		vector<Blob<Dtype>*> split_top_vec_;
-		/// bottom vector holder used in call to the underlying PoolingLayer::Forward
-		vector<vector<Blob<Dtype>*>*> pooling_bottom_vecs_;
-		/// the internal Pooling layers of different kernel sizes
-		vector<shared_ptr<PoolingLayer<Dtype> > > pooling_layers_;
-		/// top vector holders used in call to the underlying PoolingLayer::Forward
-		vector<vector<Blob<Dtype>*>*> pooling_top_vecs_;
-		/// pooling_outputs stores the outputs of the PoolingLayers
-		vector<Blob<Dtype>*> pooling_outputs_;
-		/// the internal Flatten layers that the Pooling layers feed into
-		vector<FlattenLayer<Dtype>*> flatten_layers_;
-		/// top vector holders used in call to the underlying FlattenLayer::Forward
-		vector<vector<Blob<Dtype>*>*> flatten_top_vecs_;
-		/// flatten_outputs stores the outputs of the FlattenLayers
-		vector<Blob<Dtype>*> flatten_outputs_;
-		/// bottom vector holder used in call to the underlying ConcatLayer::Forward
-		vector<Blob<Dtype>*> concat_bottom_vec_;
-		/// the internal Concat layers that the Flatten layers feed into
-		shared_ptr<ConcatLayer<Dtype> > concat_layer_;
+  public:
+    explicit SPPLayer(const LayerParameter& param)
+        : Layer<Dtype>(param) {
+    }
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const {
+      return "SPP";
+    }
+    virtual inline int ExactNumBottomBlobs() const {
+      return 1;
+    }
+    virtual inline int MinTopBlobs() const {
+      return 1;
+    }
+    // MAX POOL layers can output an extra top blob for the mask;
+    // others can only output the pooled inputs.
+    virtual inline int MaxTopBlobs() const {
+      return
+          (this->layer_param_.pooling_param().pool()
+              == PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+    }
+
+  protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+        const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+        const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+    // calculates the kernel and stride dimensions for the pooling layer,
+    // returns a correctly configured LayerParameter for a PoolingLayer
+    virtual LayerParameter GetPoolingParam(const int pyramid_level,
+        const int bottom_h, const int bottom_w, const SPPParameter spp_param);
+
+    int pyramid_height_;
+    int bottom_h_, bottom_w_;
+    int channels_;
+    int kernel_h_, kernel_w_;
+    int pad_h_, pad_w_;
+
+    /// the internal Split layer that feeds the pooling layers
+    shared_ptr<SplitLayer<Dtype> > split_layer_;
+    /// top vector holder used in call to the underlying SplitLayer::Forward
+    vector<Blob<Dtype>*> split_top_vec_;
+    /// bottom vector holder used in call to the underlying PoolingLayer::Forward
+    vector<vector<Blob<Dtype>*>*> pooling_bottom_vecs_;
+    /// the internal Pooling layers of different kernel sizes
+    vector<shared_ptr<PoolingLayer<Dtype> > > pooling_layers_;
+    /// top vector holders used in call to the underlying PoolingLayer::Forward
+    vector<vector<Blob<Dtype>*>*> pooling_top_vecs_;
+    /// pooling_outputs stores the outputs of the PoolingLayers
+    vector<Blob<Dtype>*> pooling_outputs_;
+    /// the internal Flatten layers that the Pooling layers feed into
+    vector<FlattenLayer<Dtype>*> flatten_layers_;
+    /// top vector holders used in call to the underlying FlattenLayer::Forward
+    vector<vector<Blob<Dtype>*>*> flatten_top_vecs_;
+    /// flatten_outputs stores the outputs of the FlattenLayers
+    vector<Blob<Dtype>*> flatten_outputs_;
+    /// bottom vector holder used in call to the underlying ConcatLayer::Forward
+    vector<Blob<Dtype>*> concat_bottom_vec_;
+    /// the internal Concat layers that the Flatten layers feed into
+    shared_ptr<ConcatLayer<Dtype> > concat_layer_;
 };
 
 }  // namespace caffe
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 5e327c67..089899fc 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -10,505 +10,497 @@ namespace caffe {
 
 template <typename Dtype>
 void Blob<Dtype>::Reshape(const int num, const int channels, const int height,
-		const int width) {
-	vector<int> shape(4);
-	shape[0] = num;
-	shape[1] = channels;
-	shape[2] = height;
-	shape[3] = width;
-	Reshape(shape);
+    const int width) {
+  vector<int> shape(4);
+  shape[0] = num;
+  shape[1] = channels;
+  shape[2] = height;
+  shape[3] = width;
+  Reshape(shape);
 }
 
 template <typename Dtype>
 void Blob<Dtype>::Reshape(const vector<int>& shape) {
-	CHECK_LE(shape.size(), kMaxBlobAxes);
-	count_ = 1;
-	shape_.resize(shape.size());
-	for (int i = 0; i < shape.size(); ++i) {
-		CHECK_GE(shape[i], 0);
-		CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
-		count_ *= shape[i];
-		shape_[i] = shape[i];
-	}
-	if (count_ > capacity_) {
-		capacity_ = count_;
-		data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
-		diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
-	}
+  CHECK_LE(shape.size(), kMaxBlobAxes);
+  count_ = 1;
+  shape_.resize(shape.size());
+  for (int i = 0; i < shape.size(); ++i) {
+    CHECK_GE(shape[i], 0);
+    CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
+    count_ *= shape[i];
+    shape_[i] = shape[i];
+  }
+  if (count_ > capacity_) {
+    capacity_ = count_;
+    data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
+    diff_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
+  }
 }
 
 template <typename Dtype>
 void Blob<Dtype>::Reshape(const BlobShape& shape) {
-	CHECK_LE(shape.dim_size(), kMaxBlobAxes);
-	vector<int> shape_vec(shape.dim_size());
-	for (int i = 0; i < shape.dim_size(); ++i) {
-		shape_vec[i] = shape.dim(i);
-	}
-	Reshape(shape_vec);
+  CHECK_LE(shape.dim_size(), kMaxBlobAxes);
+  vector<int> shape_vec(shape.dim_size());
+  for (int i = 0; i < shape.dim_size(); ++i) {
+    shape_vec[i] = shape.dim(i);
+  }
+  Reshape(shape_vec);
 }
 
 template <typename Dtype>
 void Blob<Dtype>::ReshapeLike(const Blob<Dtype>& other) {
-	Reshape(other.shape());
+  Reshape(other.shape());
 }
 
 template <typename Dtype>
 Blob<Dtype>::Blob(const int num, const int channels, const int height,
-		const int width)
-// capacity_ must be initialized before calling Reshape
-:
-		capacity_(0) {
-	Reshape(num, channels, height, width);
+    const int width)
+    : capacity_(0) {
+  Reshape(num, channels, height, width);
 }
 
 template <typename Dtype>
 Blob<Dtype>::Blob(const vector<int>& shape)
-// capacity_ must be initialized before calling Reshape
-:
-		capacity_(0) {
-	Reshape(shape);
+    : capacity_(0) {
+  Reshape(shape);
 }
 
 template <typename Dtype>
 const Dtype* Blob<Dtype>::cpu_data() const {
-	CHECK (data_);
-	return (const Dtype*) data_->cpu_data();
+  CHECK (data_);
+  return (const Dtype*) data_->cpu_data();
 }
 
 template <typename Dtype>
 void Blob<Dtype>::set_cpu_data(Dtype* data) {
-	CHECK(data);
-	data_->set_cpu_data(data);
+  CHECK(data);
+  data_->set_cpu_data(data);
 }
 
 template <typename Dtype>
 const Dtype* Blob<Dtype>::gpu_data() const {
-	CHECK (data_);
-	return (const Dtype*) data_->gpu_data();
+  CHECK (data_);
+  return (const Dtype*) data_->gpu_data();
 }
 
 template <typename Dtype>
 const Dtype* Blob<Dtype>::gpu_cache_data() const {
-	CHECK (data_);
-	return (const Dtype*) data_->gpu_cache_data();
+  CHECK (data_);
+  return (const Dtype*) data_->gpu_cache_data();
 }
 
 template <typename Dtype>
 const Dtype* Blob<Dtype>::cpu_diff() const {
-	CHECK (diff_);
-	return (const Dtype*) diff_->cpu_data();
+  CHECK (diff_);
+  return (const Dtype*) diff_->cpu_data();
 }
 
 template <typename Dtype>
 const Dtype* Blob<Dtype>::gpu_diff() const {
-	CHECK (diff_);
-	return (const Dtype*) diff_->gpu_data();
+  CHECK (diff_);
+  return (const Dtype*) diff_->gpu_data();
 }
 
 template <typename Dtype>
 Dtype* Blob<Dtype>::mutable_cpu_data() {
-	CHECK (data_);
-	return static_cast<Dtype*>(data_->mutable_cpu_data());
+  CHECK (data_);
+  return static_cast<Dtype*>(data_->mutable_cpu_data());
 }
 
 template <typename Dtype>
 Dtype* Blob<Dtype>::mutable_gpu_data() {
-	CHECK (data_);
-	return static_cast<Dtype*>(data_->mutable_gpu_data());
+  CHECK (data_);
+  return static_cast<Dtype*>(data_->mutable_gpu_data());
 }
 
 template <typename Dtype>
 Dtype* Blob<Dtype>::mutable_cpu_diff() {
-	CHECK (diff_);
-	return static_cast<Dtype*>(diff_->mutable_cpu_data());
+  CHECK (diff_);
+  return static_cast<Dtype*>(diff_->mutable_cpu_data());
 }
 
 template <typename Dtype>
 Dtype* Blob<Dtype>::mutable_gpu_diff() {
-	CHECK (diff_);
-	return static_cast<Dtype*>(diff_->mutable_gpu_data());
+  CHECK (diff_);
+  return static_cast<Dtype*>(diff_->mutable_gpu_data());
 }
 
 template <typename Dtype>
 void Blob<Dtype>::ShareData(const Blob& other) {
-	CHECK_EQ(count_, other.count());
-	data_ = other.data();
+  CHECK_EQ(count_, other.count());
+  data_ = other.data();
 }
 
 template <typename Dtype>
 void Blob<Dtype>::ShareDiff(const Blob& other) {
-	CHECK_EQ(count_, other.count());
-	diff_ = other.diff();
+  CHECK_EQ(count_, other.count());
+  diff_ = other.diff();
 }
 
 // The "update" method is used for parameter blobs in a Net, which are stored
 // as Blob<float> or Blob<double> -- hence we do not define it for
 // Blob<int> or Blob<unsigned int>.
 template <> void Blob<unsigned int>::Update() {
-	NOT_IMPLEMENTED;
+  NOT_IMPLEMENTED;
 }
 template <> void Blob<int>::Update() {
-	NOT_IMPLEMENTED;
+  NOT_IMPLEMENTED;
 }
 
 template <typename Dtype>
 void Blob<Dtype>::Update() {
-	// We will perform update based on where the data is located.
-	switch (data_->head()) {
-		case SyncedMemory::HEAD_AT_CPU:
-			// perform computation on CPU
-			caffe_axpy < Dtype > (count_, Dtype(-1),
-					static_cast<const Dtype*>(diff_->cpu_data()),
-					static_cast<Dtype*>(data_->mutable_cpu_data()));
-			break;
-		case SyncedMemory::HEAD_AT_GPU:
-			case SyncedMemory::SYNCED:
-			#ifndef CPU_ONLY
-			// perform computation on GPU
-			caffe_gpu_axpy < Dtype > (count_, Dtype(-1),
-					static_cast<const Dtype*>(diff_->gpu_data()),
-					static_cast<Dtype*>(data_->mutable_gpu_data()));
+  // We will perform update based on where the data is located.
+  switch (data_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    // perform computation on CPU
+    caffe_axpy < Dtype
+        > (count_, Dtype(-1), static_cast<const Dtype*>(diff_->cpu_data()), static_cast<Dtype*>(data_->mutable_cpu_data()));
+    break;
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+    // perform computation on GPU
+    caffe_gpu_axpy < Dtype
+        > (count_, Dtype(-1), static_cast<const Dtype*>(diff_->gpu_data()), static_cast<Dtype*>(data_->mutable_gpu_data()));
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-			break;
-		default:
-			LOG(FATAL) << "Syncedmem not initialized.";
-	}
+    break;
+  default:
+    LOG(FATAL) << "Syncedmem not initialized.";
+  }
 }
 
 template <> unsigned int Blob<unsigned int>::asum_data() const {
-	NOT_IMPLEMENTED;
-	return 0;
+  NOT_IMPLEMENTED;
+  return 0;
 }
 
 template <> int Blob<int>::asum_data() const {
-	NOT_IMPLEMENTED;
-	return 0;
+  NOT_IMPLEMENTED;
+  return 0;
 }
 
 template <typename Dtype>
 Dtype Blob<Dtype>::asum_data() const {
-	if (!data_) {
-		return 0;
-	}
-	switch (data_->head()) {
-		case SyncedMemory::HEAD_AT_CPU:
-			return caffe_cpu_asum(count_, cpu_data());
-		case SyncedMemory::HEAD_AT_GPU:
-			case SyncedMemory::SYNCED:
-			#ifndef CPU_ONLY
-		{
-			Dtype asum;
-			caffe_gpu_asum(count_, gpu_data(), &asum);
-			return asum;
-		}
+  if (!data_) {
+    return 0;
+  }
+  switch (data_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    return caffe_cpu_asum(count_, cpu_data());
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+  {
+    Dtype asum;
+    caffe_gpu_asum(count_, gpu_data(), &asum);
+    return asum;
+  }
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-		case SyncedMemory::UNINITIALIZED:
-			return 0;
-		default:
-			LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
-	}
-	return 0;
+  case SyncedMemory::UNINITIALIZED:
+    return 0;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+  }
+  return 0;
 }
 
 template <> unsigned int Blob<unsigned int>::asum_diff() const {
-	NOT_IMPLEMENTED;
-	return 0;
+  NOT_IMPLEMENTED;
+  return 0;
 }
 
 template <> int Blob<int>::asum_diff() const {
-	NOT_IMPLEMENTED;
-	return 0;
+  NOT_IMPLEMENTED;
+  return 0;
 }
 
 template <typename Dtype>
 Dtype Blob<Dtype>::asum_diff() const {
-	if (!diff_) {
-		return 0;
-	}
-	switch (diff_->head()) {
-		case SyncedMemory::HEAD_AT_CPU:
-			return caffe_cpu_asum(count_, cpu_diff());
-		case SyncedMemory::HEAD_AT_GPU:
-			case SyncedMemory::SYNCED:
-			#ifndef CPU_ONLY
-		{
-			Dtype asum;
-			caffe_gpu_asum(count_, gpu_diff(), &asum);
-			return asum;
-		}
+  if (!diff_) {
+    return 0;
+  }
+  switch (diff_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    return caffe_cpu_asum(count_, cpu_diff());
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+  {
+    Dtype asum;
+    caffe_gpu_asum(count_, gpu_diff(), &asum);
+    return asum;
+  }
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-		case SyncedMemory::UNINITIALIZED:
-			return 0;
-		default:
-			LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
-	}
-	return 0;
+  case SyncedMemory::UNINITIALIZED:
+    return 0;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
+  }
+  return 0;
 }
 
 template <> unsigned int Blob<unsigned int>::sumsq_data() const {
-	NOT_IMPLEMENTED;
-	return 0;
+  NOT_IMPLEMENTED;
+  return 0;
 }
 
 template <> int Blob<int>::sumsq_data() const {
-	NOT_IMPLEMENTED;
-	return 0;
+  NOT_IMPLEMENTED;
+  return 0;
 }
 
 template <typename Dtype>
 Dtype Blob<Dtype>::sumsq_data() const {
-	Dtype sumsq;
-	const Dtype* data;
-	if (!data_) {
-		return 0;
-	}
-	switch (data_->head()) {
-		case SyncedMemory::HEAD_AT_CPU:
-			data = cpu_data();
-			sumsq = caffe_cpu_dot(count_, data, data);
-			break;
-		case SyncedMemory::HEAD_AT_GPU:
-			case SyncedMemory::SYNCED:
-			#ifndef CPU_ONLY
-			data = gpu_data();
-			caffe_gpu_dot(count_, data, data, &sumsq);
+  Dtype sumsq;
+  const Dtype* data;
+  if (!data_) {
+    return 0;
+  }
+  switch (data_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    data = cpu_data();
+    sumsq = caffe_cpu_dot(count_, data, data);
+    break;
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+    data = gpu_data();
+    caffe_gpu_dot(count_, data, data, &sumsq);
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-			break;
-		case SyncedMemory::UNINITIALIZED:
-			return 0;
-		default:
-			LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
-	}
-	return sumsq;
+    break;
+  case SyncedMemory::UNINITIALIZED:
+    return 0;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+  }
+  return sumsq;
 }
 
 template <> unsigned int Blob<unsigned int>::sumsq_diff() const {
-	NOT_IMPLEMENTED;
-	return 0;
+  NOT_IMPLEMENTED;
+  return 0;
 }
 
 template <> int Blob<int>::sumsq_diff() const {
-	NOT_IMPLEMENTED;
-	return 0;
+  NOT_IMPLEMENTED;
+  return 0;
 }
 
 template <typename Dtype>
 Dtype Blob<Dtype>::sumsq_diff() const {
-	Dtype sumsq;
-	const Dtype* diff;
-	if (!diff_) {
-		return 0;
-	}
-	switch (diff_->head()) {
-		case SyncedMemory::HEAD_AT_CPU:
-			diff = cpu_diff();
-			sumsq = caffe_cpu_dot(count_, diff, diff);
-			break;
-		case SyncedMemory::HEAD_AT_GPU:
-			case SyncedMemory::SYNCED:
-			#ifndef CPU_ONLY
-			diff = gpu_diff();
-			caffe_gpu_dot(count_, diff, diff, &sumsq);
-			break;
+  Dtype sumsq;
+  const Dtype* diff;
+  if (!diff_) {
+    return 0;
+  }
+  switch (diff_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    diff = cpu_diff();
+    sumsq = caffe_cpu_dot(count_, diff, diff);
+    break;
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+    diff = gpu_diff();
+    caffe_gpu_dot(count_, diff, diff, &sumsq);
+    break;
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-		case SyncedMemory::UNINITIALIZED:
-			return 0;
-		default:
-			LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
-	}
-	return sumsq;
+  case SyncedMemory::UNINITIALIZED:
+    return 0;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+  }
+  return sumsq;
 }
 
 template <> void Blob<unsigned int>::scale_data(unsigned int scale_factor) {
-	NOT_IMPLEMENTED;
+  NOT_IMPLEMENTED;
 }
 
 template <> void Blob<int>::scale_data(int scale_factor) {
-	NOT_IMPLEMENTED;
+  NOT_IMPLEMENTED;
 }
 
 template <typename Dtype>
 void Blob<Dtype>::scale_data(Dtype scale_factor) {
-	Dtype* data;
-	if (!data_) {
-		return;
-	}
-	switch (data_->head()) {
-		case SyncedMemory::HEAD_AT_CPU:
-			data = mutable_cpu_data();
-			caffe_scal(count_, scale_factor, data);
-			return;
-		case SyncedMemory::HEAD_AT_GPU:
-			case SyncedMemory::SYNCED:
-			#ifndef CPU_ONLY
-			data = mutable_gpu_data();
-			caffe_gpu_scal(count_, scale_factor, data);
-			return;
+  Dtype* data;
+  if (!data_) {
+    return;
+  }
+  switch (data_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    data = mutable_cpu_data();
+    caffe_scal(count_, scale_factor, data);
+    return;
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+    data = mutable_gpu_data();
+    caffe_gpu_scal(count_, scale_factor, data);
+    return;
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-		case SyncedMemory::UNINITIALIZED:
-			return;
-		default:
-			LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
-	}
+  case SyncedMemory::UNINITIALIZED:
+    return;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+  }
 }
 
 template <> void Blob<unsigned int>::scale_diff(unsigned int scale_factor) {
-	NOT_IMPLEMENTED;
+  NOT_IMPLEMENTED;
 }
 
 template <> void Blob<int>::scale_diff(int scale_factor) {
-	NOT_IMPLEMENTED;
+  NOT_IMPLEMENTED;
 }
 
 template <typename Dtype>
 void Blob<Dtype>::scale_diff(Dtype scale_factor) {
-	Dtype* diff;
-	if (!diff_) {
-		return;
-	}
-	switch (diff_->head()) {
-		case SyncedMemory::HEAD_AT_CPU:
-			diff = mutable_cpu_diff();
-			caffe_scal(count_, scale_factor, diff);
-			return;
-		case SyncedMemory::HEAD_AT_GPU:
-			case SyncedMemory::SYNCED:
-			#ifndef CPU_ONLY
-			diff = mutable_gpu_diff();
-			caffe_gpu_scal(count_, scale_factor, diff);
-			return;
+  Dtype* diff;
+  if (!diff_) {
+    return;
+  }
+  switch (diff_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    diff = mutable_cpu_diff();
+    caffe_scal(count_, scale_factor, diff);
+    return;
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+    diff = mutable_gpu_diff();
+    caffe_gpu_scal(count_, scale_factor, diff);
+    return;
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-		case SyncedMemory::UNINITIALIZED:
-			return;
-		default:
-			LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
-	}
+  case SyncedMemory::UNINITIALIZED:
+    return;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
+  }
 }
 
 template <typename Dtype>
 bool Blob<Dtype>::ShapeEquals(const BlobProto& other) {
-	if (other.has_num() || other.has_channels() ||
-			other.has_height() || other.has_width()) {
-		// Using deprecated 4D Blob dimensions --
-		// shape is (num, channels, height, width).
-		// Note: we do not use the normal Blob::num(), Blob::channels(), etc.
-		// methods as these index from the beginning of the blob shape, where legacy
-		// parameter blobs were indexed from the end of the blob shape (e.g., bias
-		// Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)).
-		return shape_.size() <= 4 &&
-				LegacyShape(-4) == other.num() &&
-				LegacyShape(-3) == other.channels() &&
-				LegacyShape(-2) == other.height() &&
-				LegacyShape(-1) == other.width();
-	}
-	vector<int> other_shape(other.shape().dim_size());
-	for (int i = 0; i < other.shape().dim_size(); ++i) {
-		other_shape[i] = other.shape().dim(i);
-	}
-	return shape_ == other_shape;
+  if (other.has_num() || other.has_channels() || other.has_height()
+      || other.has_width()) {
+    // Using deprecated 4D Blob dimensions --
+    // shape is (num, channels, height, width).
+    // Note: we do not use the normal Blob::num(), Blob::channels(), etc.
+    // methods as these index from the beginning of the blob shape, where legacy
+    // parameter blobs were indexed from the end of the blob shape (e.g., bias
+    // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)).
+    return shape_.size() <= 4 && LegacyShape(-4) == other.num()
+        && LegacyShape(-3) == other.channels()
+        && LegacyShape(-2) == other.height() && LegacyShape(-1) == other.width();
+  }
+  vector<int> other_shape(other.shape().dim_size());
+  for (int i = 0; i < other.shape().dim_size(); ++i) {
+    other_shape[i] = other.shape().dim(i);
+  }
+  return shape_ == other_shape;
 }
 
 template <typename Dtype>
 void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
-	if (source.count() != count_ || source.shape() != shape_) {
-		if (reshape) {
-			ReshapeLike(source);
-		} else {
-			LOG(FATAL) << "Trying to copy blobs of different sizes.";
-		}
-	}
-	switch (Caffe::mode()) {
-		case Caffe::GPU:
-			if (copy_diff) {
-				caffe_copy(count_, source.gpu_diff(),
-						static_cast<Dtype*>(diff_->mutable_gpu_data()));
-			} else {
-				caffe_copy(count_, source.gpu_data(),
-						static_cast<Dtype*>(data_->mutable_gpu_data()));
-			}
-			break;
-		case Caffe::CPU:
-			if (copy_diff) {
-				caffe_copy(count_, source.cpu_diff(),
-						static_cast<Dtype*>(diff_->mutable_cpu_data()));
-			} else {
-				caffe_copy(count_, source.cpu_data(),
-						static_cast<Dtype*>(data_->mutable_cpu_data()));
-			}
-			break;
-		default:
-			LOG(FATAL) << "Unknown caffe mode.";
-	}
+  if (source.count() != count_ || source.shape() != shape_) {
+    if (reshape) {
+      ReshapeLike(source);
+    } else {
+      LOG(FATAL) << "Trying to copy blobs of different sizes.";
+    }
+  }
+  switch (Caffe::mode()) {
+  case Caffe::GPU:
+    if (copy_diff) {
+      caffe_copy(count_, source.gpu_diff(),
+          static_cast<Dtype*>(diff_->mutable_gpu_data()));
+    } else {
+      caffe_copy(count_, source.gpu_data(),
+          static_cast<Dtype*>(data_->mutable_gpu_data()));
+    }
+    break;
+  case Caffe::CPU:
+    if (copy_diff) {
+      caffe_copy(count_, source.cpu_diff(),
+          static_cast<Dtype*>(diff_->mutable_cpu_data()));
+    } else {
+      caffe_copy(count_, source.cpu_data(),
+          static_cast<Dtype*>(data_->mutable_cpu_data()));
+    }
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+  }
 }
 
 template <typename Dtype>
 void Blob<Dtype>::FromProto(const BlobProto& proto, bool reshape) {
-	if (reshape) {
-		vector<int> shape;
-		if (proto.has_num() || proto.has_channels() ||
-				proto.has_height() || proto.has_width()) {
-			// Using deprecated 4D Blob dimensions --
-			// shape is (num, channels, height, width).
-			shape.resize(4);
-			shape[0] = proto.num();
-			shape[1] = proto.channels();
-			shape[2] = proto.height();
-			shape[3] = proto.width();
-		} else {
-			shape.resize(proto.shape().dim_size());
-			for (int i = 0; i < proto.shape().dim_size(); ++i) {
-				shape[i] = proto.shape().dim(i);
-			}
-		}
-		Reshape(shape);
-	} else {
-		CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)";
-	}
-	// copy data
-	Dtype* data_vec = mutable_cpu_data();
-	for (int i = 0; i < count_; ++i) {
-		data_vec[i] = proto.data(i);
-	}
-	if (proto.diff_size() > 0) {
-		Dtype* diff_vec = mutable_cpu_diff();
-		for (int i = 0; i < count_; ++i) {
-			diff_vec[i] = proto.diff(i);
-		}
-	}
+  if (reshape) {
+    vector<int> shape;
+    if (proto.has_num() || proto.has_channels() || proto.has_height()
+        || proto.has_width()) {
+      // Using deprecated 4D Blob dimensions --
+      // shape is (num, channels, height, width).
+      shape.resize(4);
+      shape[0] = proto.num();
+      shape[1] = proto.channels();
+      shape[2] = proto.height();
+      shape[3] = proto.width();
+    } else {
+      shape.resize(proto.shape().dim_size());
+      for (int i = 0; i < proto.shape().dim_size(); ++i) {
+        shape[i] = proto.shape().dim(i);
+      }
+    }
+    Reshape(shape);
+  } else {
+    CHECK(ShapeEquals(proto)) << "shape mismatch (reshape not set)";
+  }
+  // copy data
+  Dtype* data_vec = mutable_cpu_data();
+  for (int i = 0; i < count_; ++i) {
+    data_vec[i] = proto.data(i);
+  }
+  if (proto.diff_size() > 0) {
+    Dtype* diff_vec = mutable_cpu_diff();
+    for (int i = 0; i < count_; ++i) {
+      diff_vec[i] = proto.diff(i);
+    }
+  }
 }
 
 template <typename Dtype>
 void Blob<Dtype>::ToProto(BlobProto* proto, bool write_diff) const {
-	proto->clear_shape();
-	for (int i = 0; i < shape_.size(); ++i) {
-		proto->mutable_shape()->add_dim(shape_[i]);
-	}
-	proto->clear_data();
-	proto->clear_diff();
-	const Dtype* data_vec = cpu_data();
-	for (int i = 0; i < count_; ++i) {
-		proto->add_data(data_vec[i]);
-	}
-	if (write_diff) {
-		const Dtype* diff_vec = cpu_diff();
-		for (int i = 0; i < count_; ++i) {
-			proto->add_diff(diff_vec[i]);
-		}
-	}
+  proto->clear_shape();
+  for (int i = 0; i < shape_.size(); ++i) {
+    proto->mutable_shape()->add_dim(shape_[i]);
+  }
+  proto->clear_data();
+  proto->clear_diff();
+  const Dtype* data_vec = cpu_data();
+  for (int i = 0; i < count_; ++i) {
+    proto->add_data(data_vec[i]);
+  }
+  if (write_diff) {
+    const Dtype* diff_vec = cpu_diff();
+    for (int i = 0; i < count_; ++i) {
+      proto->add_diff(diff_vec[i]);
+    }
+  }
 }
 
 INSTANTIATE_CLASS (Blob);
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 2157c96a..2698ffee 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -11,36 +11,36 @@ shared_ptr<Caffe> Caffe::singleton_;
 
 // random seeding
 int64_t cluster_seedgen(void) {
-	//To fix: for now we use fixed seed to get same result each time
-	/*
-	 int64_t s, seed, pid;
-	 FILE* f = fopen("/dev/urandom", "rb");
-	 if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) {
-	 fclose(f);
-	 return seed;
-	 }
-
-	 LOG(INFO) << "System entropy source not available, "
-	 "using fallback algorithm to generate seed instead.";
-	 if (f)
-	 fclose(f);
-
-	 pid = getpid();
-	 s = time(NULL);
-	 seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729);
-	 //return seed;
-	 LOG(WARNING) << "return fixed seed 37";
-	 */
-	return 37;
+  //To fix: for now we use fixed seed to get same result each time
+  /*
+   int64_t s, seed, pid;
+   FILE* f = fopen("/dev/urandom", "rb");
+   if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) {
+   fclose(f);
+   return seed;
+   }
+
+   LOG(INFO) << "System entropy source not available, "
+   "using fallback algorithm to generate seed instead.";
+   if (f)
+   fclose(f);
+
+   pid = getpid();
+   s = time(NULL);
+   seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729);
+   //return seed;
+   LOG(WARNING) << "return fixed seed 37";
+   */
+  return 37;
 }
 
 void GlobalInit(int* pargc, char*** pargv) {
-	// Google flags.
-	::gflags::ParseCommandLineFlags(pargc, pargv, true);
-	// Google logging.
-	::google::InitGoogleLogging(*(pargv)[0]);
-	// Provide a backtrace on segfault.
-	::google::InstallFailureSignalHandler();
+  // Google flags.
+  ::gflags::ParseCommandLineFlags(pargc, pargv, true);
+  // Google logging.
+  ::google::InitGoogleLogging(*(pargv)[0]);
+  // Provide a backtrace on segfault.
+  ::google::InstallFailureSignalHandler();
 }
 
 #ifdef CPU_ONLY  // CPU-only Caffe.
@@ -53,25 +53,25 @@ Caffe::~Caffe() {
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
-	// RNG seed
-	Get().random_generator_.reset(new RNG(seed));
+  // RNG seed
+  Get().random_generator_.reset(new RNG(seed));
 }
 
 void Caffe::SetDevice(const int device_id) {
-	NO_GPU;
+  NO_GPU;
 }
 
 void Caffe::DeviceQuery() {
-	NO_GPU;
+  NO_GPU;
 }
 
 class Caffe::RNG::Generator {
-	public:
-	Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {}
-	explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {}
-	caffe::rng_t* rng() {return rng_.get();}
-	private:
-	shared_ptr<caffe::rng_t> rng_;
+  public:
+  Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {}
+  explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {}
+  caffe::rng_t* rng() {return rng_.get();}
+  private:
+  shared_ptr<caffe::rng_t> rng_;
 };
 
 Caffe::RNG::RNG() : generator_(new Generator()) {}
@@ -79,79 +79,74 @@ Caffe::RNG::RNG() : generator_(new Generator()) {}
 Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) {}
 
 Caffe::RNG& Caffe::RNG::operator=(const RNG& other) {
-	generator_ = other.generator_;
-	return *this;
+  generator_ = other.generator_;
+  return *this;
 }
 
 void* Caffe::RNG::generator() {
-	return static_cast<void*>(generator_->rng());
+  return static_cast<void*>(generator_->rng());
 }
 
 #else  // Normal GPU + CPU Caffe.
 
-Caffe::Caffe()
-{
-	amdDevice.Init();
-	cl_int err = clblasSetup();
-	if (err != CL_SUCCESS) {
-		LOG(ERROR) << "clBLAS setup failed " << err;
-	}
+Caffe::Caffe() {
+  amdDevice.Init();
+  cl_int err = clblasSetup();
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "clBLAS setup failed " << err;
+  }
 }
 
 Caffe::~Caffe() {
-	clblasTeardown();
+  clblasTeardown();
 }
 
 void Caffe::set_random_seed(const unsigned int seed) {
-	// RNG seed
-	Get().random_generator_.reset(new RNG(seed));
+  // RNG seed
+  Get().random_generator_.reset(new RNG(seed));
 }
 
 void Caffe::SetDevice(const int device_id) {
-	if (amdDevice.GetDevice() == device_id) {
-		return;
-	}
-	amdDevice.Init(device_id);
+  if (amdDevice.GetDevice() == device_id) {
+    return;
+  }
+  amdDevice.Init(device_id);
 }
 
 void Caffe::DeviceQuery() {
-	amdDevice.DeviceQuery();
+  amdDevice.DeviceQuery();
 }
 
 class Caffe::RNG::Generator {
-	public:
-		Generator()
-		:
-				rng_(new caffe::rng_t(cluster_seedgen())) {
-		}
-		explicit Generator(unsigned int seed)
-		:
-				rng_(new caffe::rng_t(seed)) {
-		}
-		caffe::rng_t* rng() {
-			return rng_.get();
-		}
-	private:
-		shared_ptr<caffe::rng_t> rng_;
+  public:
+    Generator()
+        : rng_(new caffe::rng_t(cluster_seedgen())) {
+    }
+    explicit Generator(unsigned int seed)
+        : rng_(new caffe::rng_t(seed)) {
+    }
+    caffe::rng_t* rng() {
+      return rng_.get();
+    }
+  private:
+    shared_ptr<caffe::rng_t> rng_;
 };
 
 Caffe::RNG::RNG()
-:
-		generator_(new Generator()) {
+    : generator_(new Generator()) {
 }
 
 Caffe::RNG::RNG(unsigned int seed)
-:
-		generator_(new Generator(seed)) {
+    : generator_(new Generator(seed)) {
 }
 
 Caffe::RNG& Caffe::RNG::operator=(const RNG& other) {
-	generator_.reset(other.generator_.get());
-	return *this;
+  generator_.reset(other.generator_.get());
+  return *this;
 }
 
 void* Caffe::RNG::generator() {
-	return static_cast<void*>(generator_->rng());
+  return static_cast<void*>(generator_->rng());
 }
 
 #endif  // CPU_ONLY
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index a041e126..1137bac3 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -12,520 +12,516 @@ namespace caffe {
 
 template <typename Dtype>
 DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
-		Phase phase)
-:
-		param_(param), phase_(phase) {
-	// check if we want to use mean_file
-	if (param_.has_mean_file()) {
-		CHECK_EQ(param_.mean_value_size(), 0) <<
-				"Cannot specify mean_file and mean_value at the same time";
-		const string& mean_file = param.mean_file();
-		LOG(INFO) << "Loading mean file from: " << mean_file;
-		BlobProto blob_proto;
-		ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
-		data_mean_.FromProto(blob_proto);
-	}
-	// check if we want to use mean_value
-	if (param_.mean_value_size() > 0) {
-		CHECK(param_.has_mean_file() == false) <<
-				"Cannot specify mean_file and mean_value at the same time";
-		for (int c = 0; c < param_.mean_value_size(); ++c) {
-			mean_values_.push_back(param_.mean_value(c));
-		}
-	}
+    Phase phase)
+    : param_(param), phase_(phase) {
+  // check if we want to use mean_file
+  if (param_.has_mean_file()) {
+    CHECK_EQ(param_.mean_value_size(), 0)
+        << "Cannot specify mean_file and mean_value at the same time";
+    const string& mean_file = param.mean_file();
+    LOG(INFO) << "Loading mean file from: " << mean_file;
+    BlobProto blob_proto;
+    ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
+    data_mean_.FromProto(blob_proto);
+  }
+  // check if we want to use mean_value
+  if (param_.mean_value_size() > 0) {
+    CHECK(param_.has_mean_file() == false)
+        << "Cannot specify mean_file and mean_value at the same time";
+    for (int c = 0; c < param_.mean_value_size(); ++c) {
+      mean_values_.push_back(param_.mean_value(c));
+    }
+  }
 }
 
 template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
-		Dtype* transformed_data) {
-	const string& data = datum.data();
-	const int datum_channels = datum.channels();
-	const int datum_height = datum.height();
-	const int datum_width = datum.width();
-
-	const int crop_size = param_.crop_size();
-	const Dtype scale = param_.scale();
-	const bool do_mirror = param_.mirror() && Rand(2);
-	const bool has_mean_file = param_.has_mean_file();
-	const bool has_uint8 = data.size() > 0;
-	const bool has_mean_values = mean_values_.size() > 0;
-
-	CHECK_GT(datum_channels, 0);
-	CHECK_GE(datum_height, crop_size);
-	CHECK_GE(datum_width, crop_size);
-
-	Dtype* mean = NULL;
-	if (has_mean_file) {
-		CHECK_EQ(datum_channels, data_mean_.channels());
-		CHECK_EQ(datum_height, data_mean_.height());
-		CHECK_EQ(datum_width, data_mean_.width());
-		mean = data_mean_.mutable_cpu_data();
-	}
-	if (has_mean_values) {
-		CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) <<
-				"Specify either 1 mean_value or as many as channels: "
-				<< datum_channels;
-		if (datum_channels > 1 && mean_values_.size() == 1) {
-			// Replicate the mean_value for simplicity
-			for (int c = 1; c < datum_channels; ++c) {
-				mean_values_.push_back(mean_values_[0]);
-			}
-		}
-	}
-
-	int height = datum_height;
-	int width = datum_width;
-
-	int h_off = 0;
-	int w_off = 0;
-	if (crop_size) {
-		height = crop_size;
-		width = crop_size;
-		// We only do random crop when we do training.
-		if (phase_ == TRAIN) {
-			h_off = Rand(datum_height - crop_size + 1);
-			w_off = Rand(datum_width - crop_size + 1);
-		} else {
-			h_off = (datum_height - crop_size) / 2;
-			w_off = (datum_width - crop_size) / 2;
-		}
-	}
-
-	Dtype datum_element;
-	int top_index, data_index;
-	for (int c = 0; c < datum_channels; ++c) {
-		for (int h = 0; h < height; ++h) {
-			for (int w = 0; w < width; ++w) {
-				data_index = (c * datum_height + h_off + h) * datum_width + w_off + w;
-				if (do_mirror) {
-					top_index = (c * height + h) * width + (width - 1 - w);
-				} else {
-					top_index = (c * height + h) * width + w;
-				}
-				if (has_uint8) {
-					datum_element =
-							static_cast<Dtype>(static_cast<uint8_t>(data[data_index]));
-				} else {
-					datum_element = datum.float_data(data_index);
-				}
-				if (has_mean_file) {
-					transformed_data[top_index] =
-							(datum_element - mean[data_index]) * scale;
-				} else {
-					if (has_mean_values) {
-						transformed_data[top_index] =
-								(datum_element - mean_values_[c]) * scale;
-					} else {
-						transformed_data[top_index] = datum_element * scale;
-					}
-				}
-			}
-		}
-	}
+    Dtype* transformed_data) {
+  const string& data = datum.data();
+  const int datum_channels = datum.channels();
+  const int datum_height = datum.height();
+  const int datum_width = datum.width();
+
+  const int crop_size = param_.crop_size();
+  const Dtype scale = param_.scale();
+  const bool do_mirror = param_.mirror() && Rand(2);
+  const bool has_mean_file = param_.has_mean_file();
+  const bool has_uint8 = data.size() > 0;
+  const bool has_mean_values = mean_values_.size() > 0;
+
+  CHECK_GT(datum_channels, 0);
+  CHECK_GE(datum_height, crop_size);
+  CHECK_GE(datum_width, crop_size);
+
+  Dtype* mean = NULL;
+  if (has_mean_file) {
+    CHECK_EQ(datum_channels, data_mean_.channels());
+    CHECK_EQ(datum_height, data_mean_.height());
+    CHECK_EQ(datum_width, data_mean_.width());
+    mean = data_mean_.mutable_cpu_data();
+  }
+  if (has_mean_values) {
+    CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels)
+        << "Specify either 1 mean_value or as many as channels: "
+        << datum_channels;
+    if (datum_channels > 1 && mean_values_.size() == 1) {
+      // Replicate the mean_value for simplicity
+      for (int c = 1; c < datum_channels; ++c) {
+        mean_values_.push_back(mean_values_[0]);
+      }
+    }
+  }
+
+  int height = datum_height;
+  int width = datum_width;
+
+  int h_off = 0;
+  int w_off = 0;
+  if (crop_size) {
+    height = crop_size;
+    width = crop_size;
+    // We only do random crop when we do training.
+    if (phase_ == TRAIN) {
+      h_off = Rand(datum_height - crop_size + 1);
+      w_off = Rand(datum_width - crop_size + 1);
+    } else {
+      h_off = (datum_height - crop_size) / 2;
+      w_off = (datum_width - crop_size) / 2;
+    }
+  }
+
+  Dtype datum_element;
+  int top_index, data_index;
+  for (int c = 0; c < datum_channels; ++c) {
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        data_index = (c * datum_height + h_off + h) * datum_width + w_off + w;
+        if (do_mirror) {
+          top_index = (c * height + h) * width + (width - 1 - w);
+        } else {
+          top_index = (c * height + h) * width + w;
+        }
+        if (has_uint8) {
+          datum_element =
+              static_cast<Dtype>(static_cast<uint8_t>(data[data_index]));
+        } else {
+          datum_element = datum.float_data(data_index);
+        }
+        if (has_mean_file) {
+          transformed_data[top_index] = (datum_element - mean[data_index])
+              * scale;
+        } else {
+          if (has_mean_values) {
+            transformed_data[top_index] = (datum_element - mean_values_[c])
+                * scale;
+          } else {
+            transformed_data[top_index] = datum_element * scale;
+          }
+        }
+      }
+    }
+  }
 }
 
 template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
-		Blob<Dtype>* transformed_blob) {
-
-	// If datum is encoded, decoded and transform the cv::image.
-	if (datum.encoded()) {
-		CHECK(!(param_.force_color() && param_.force_gray()))
-				<< "cannot set both force_color and force_gray";
-		cv::Mat cv_img;
-		if (param_.force_color() || param_.force_gray()) {
-			// If force_color then decode in color otherwise decode in gray.
-			cv_img = DecodeDatumToCVMat(datum, param_.force_color());
-		} else {
-			cv_img = DecodeDatumToCVMatNative(datum);
-		}
-		// Transform the cv::image into blob.
-		return Transform(cv_img, transformed_blob);
-	} else {
-		if (param_.force_color() || param_.force_gray()) {
-			LOG(ERROR) << "force_color and force_gray only for encoded datum";
-		}
-	}
-
-	const int crop_size = param_.crop_size();
-	const int datum_channels = datum.channels();
-	const int datum_height = datum.height();
-	const int datum_width = datum.width();
-
-	// Check dimensions.
-	const int channels = transformed_blob->channels();
-	const int height = transformed_blob->height();
-	const int width = transformed_blob->width();
-	const int num = transformed_blob->num();
-
-	CHECK_EQ(channels, datum_channels);
-	CHECK_LE(height, datum_height);
-	CHECK_LE(width, datum_width);
-	CHECK_GE(num, 1);
-
-	if (crop_size) {
-		CHECK_EQ(crop_size, height);
-		CHECK_EQ(crop_size, width);
-	} else {
-		CHECK_EQ(datum_height, height);
-		CHECK_EQ(datum_width, width);
-	}
-
-	Dtype* transformed_data = transformed_blob->mutable_cpu_data();
-	Transform(datum, transformed_data);
+    Blob<Dtype>* transformed_blob) {
+
+  // If datum is encoded, decoded and transform the cv::image.
+  if (datum.encoded()) {
+    CHECK(!(param_.force_color() && param_.force_gray()))
+        << "cannot set both force_color and force_gray";
+    cv::Mat cv_img;
+    if (param_.force_color() || param_.force_gray()) {
+      // If force_color then decode in color otherwise decode in gray.
+      cv_img = DecodeDatumToCVMat(datum, param_.force_color());
+    } else {
+      cv_img = DecodeDatumToCVMatNative(datum);
+    }
+    // Transform the cv::image into blob.
+    return Transform(cv_img, transformed_blob);
+  } else {
+    if (param_.force_color() || param_.force_gray()) {
+      LOG(ERROR) << "force_color and force_gray only for encoded datum";
+    }
+  }
+
+  const int crop_size = param_.crop_size();
+  const int datum_channels = datum.channels();
+  const int datum_height = datum.height();
+  const int datum_width = datum.width();
+
+  // Check dimensions.
+  const int channels = transformed_blob->channels();
+  const int height = transformed_blob->height();
+  const int width = transformed_blob->width();
+  const int num = transformed_blob->num();
+
+  CHECK_EQ(channels, datum_channels);
+  CHECK_LE(height, datum_height);
+  CHECK_LE(width, datum_width);
+  CHECK_GE(num, 1);
+
+  if (crop_size) {
+    CHECK_EQ(crop_size, height);
+    CHECK_EQ(crop_size, width);
+  } else {
+    CHECK_EQ(datum_height, height);
+    CHECK_EQ(datum_width, width);
+  }
+
+  Dtype* transformed_data = transformed_blob->mutable_cpu_data();
+  Transform(datum, transformed_data);
 }
 
 template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const vector<Datum> & datum_vector,
-		Blob<Dtype>* transformed_blob) {
-	const int datum_num = datum_vector.size();
-	const int num = transformed_blob->num();
-	const int channels = transformed_blob->channels();
-	const int height = transformed_blob->height();
-	const int width = transformed_blob->width();
-
-	CHECK_GT(datum_num, 0) << "There is no datum to add";
-	CHECK_LE(datum_num, num)
-			<<
-			"The size of datum_vector must be no greater than transformed_blob->num()";
-	Blob < Dtype > uni_blob(1, channels, height, width);
-	for (int item_id = 0; item_id < datum_num; ++item_id) {
-		int offset = transformed_blob->offset(item_id);
-		uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset);
-		Transform(datum_vector[item_id], &uni_blob);
-	}
+    Blob<Dtype>* transformed_blob) {
+  const int datum_num = datum_vector.size();
+  const int num = transformed_blob->num();
+  const int channels = transformed_blob->channels();
+  const int height = transformed_blob->height();
+  const int width = transformed_blob->width();
+
+  CHECK_GT(datum_num, 0) << "There is no datum to add";
+  CHECK_LE(datum_num, num)
+      << "The size of datum_vector must be no greater than transformed_blob->num()";
+  Blob < Dtype > uni_blob(1, channels, height, width);
+  for (int item_id = 0; item_id < datum_num; ++item_id) {
+    int offset = transformed_blob->offset(item_id);
+    uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset);
+    Transform(datum_vector[item_id], &uni_blob);
+  }
 }
 
 template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const vector<cv::Mat> & mat_vector,
-		Blob<Dtype>* transformed_blob) {
-	const int mat_num = mat_vector.size();
-	const int num = transformed_blob->num();
-	const int channels = transformed_blob->channels();
-	const int height = transformed_blob->height();
-	const int width = transformed_blob->width();
-
-	CHECK_GT(mat_num, 0) << "There is no MAT to add";
-	CHECK_EQ(mat_num, num) <<
-			"The size of mat_vector must be equals to transformed_blob->num()";
-	Blob < Dtype > uni_blob(1, channels, height, width);
-	for (int item_id = 0; item_id < mat_num; ++item_id) {
-		int offset = transformed_blob->offset(item_id);
-		uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset);
-		Transform(mat_vector[item_id], &uni_blob);
-	}
+    Blob<Dtype>* transformed_blob) {
+  const int mat_num = mat_vector.size();
+  const int num = transformed_blob->num();
+  const int channels = transformed_blob->channels();
+  const int height = transformed_blob->height();
+  const int width = transformed_blob->width();
+
+  CHECK_GT(mat_num, 0) << "There is no MAT to add";
+  CHECK_EQ(mat_num, num)
+      << "The size of mat_vector must be equals to transformed_blob->num()";
+  Blob < Dtype > uni_blob(1, channels, height, width);
+  for (int item_id = 0; item_id < mat_num; ++item_id) {
+    int offset = transformed_blob->offset(item_id);
+    uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset);
+    Transform(mat_vector[item_id], &uni_blob);
+  }
 }
 
 template <typename Dtype>
 void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img,
-		Blob<Dtype>* transformed_blob) {
-	const int crop_size = param_.crop_size();
-	const int img_channels = cv_img.channels();
-	const int img_height = cv_img.rows;
-	const int img_width = cv_img.cols;
-
-	// Check dimensions.
-	const int channels = transformed_blob->channels();
-	const int height = transformed_blob->height();
-	const int width = transformed_blob->width();
-	const int num = transformed_blob->num();
-
-	CHECK_EQ(channels, img_channels);
-	CHECK_LE(height, img_height);
-	CHECK_LE(width, img_width);
-	CHECK_GE(num, 1);
-
-	CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";
-
-	const Dtype scale = param_.scale();
-	const bool do_mirror = param_.mirror() && Rand(2);
-	const bool has_mean_file = param_.has_mean_file();
-	const bool has_mean_values = mean_values_.size() > 0;
-
-	CHECK_GT(img_channels, 0);
-	CHECK_GE(img_height, crop_size);
-	CHECK_GE(img_width, crop_size);
-
-	Dtype* mean = NULL;
-	if (has_mean_file) {
-		CHECK_EQ(img_channels, data_mean_.channels());
-		CHECK_EQ(img_height, data_mean_.height());
-		CHECK_EQ(img_width, data_mean_.width());
-		mean = data_mean_.mutable_cpu_data();
-	}
-	if (has_mean_values) {
-		CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) <<
-				"Specify either 1 mean_value or as many as channels: " << img_channels;
-		if (img_channels > 1 && mean_values_.size() == 1) {
-			// Replicate the mean_value for simplicity
-			for (int c = 1; c < img_channels; ++c) {
-				mean_values_.push_back(mean_values_[0]);
-			}
-		}
-	}
-
-	int h_off = 0;
-	int w_off = 0;
-	cv::Mat cv_cropped_img = cv_img;
-	if (crop_size) {
-		CHECK_EQ(crop_size, height);
-		CHECK_EQ(crop_size, width);
-		// We only do random crop when we do training.
-		if (phase_ == TRAIN) {
-			h_off = Rand(img_height - crop_size + 1);
-			w_off = Rand(img_width - crop_size + 1);
-		} else {
-			h_off = (img_height - crop_size) / 2;
-			w_off = (img_width - crop_size) / 2;
-		}
-		cv::Rect roi(w_off, h_off, crop_size, crop_size);
-		cv_cropped_img = cv_img(roi);
-	} else {
-		CHECK_EQ(img_height, height);
-		CHECK_EQ(img_width, width);
-	}
-
-	CHECK(cv_cropped_img.data);
-
-	Dtype* transformed_data = transformed_blob->mutable_cpu_data();
-	int top_index;
-	for (int h = 0; h < height; ++h) {
-		const uchar* ptr = cv_cropped_img.ptr < uchar > (h);
-		int img_index = 0;
-		for (int w = 0; w < width; ++w) {
-			for (int c = 0; c < img_channels; ++c) {
-				if (do_mirror) {
-					top_index = (c * height + h) * width + (width - 1 - w);
-				} else {
-					top_index = (c * height + h) * width + w;
-				}
-				// int top_index = (c * height + h) * width + w;
-				Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
-				if (has_mean_file) {
-					int mean_index = (c * img_height + h_off + h) * img_width + w_off + w;
-					transformed_data[top_index] =
-							(pixel - mean[mean_index]) * scale;
-				} else {
-					if (has_mean_values) {
-						transformed_data[top_index] =
-								(pixel - mean_values_[c]) * scale;
-					} else {
-						transformed_data[top_index] = pixel * scale;
-					}
-				}
-			}
-		}
-	}
+    Blob<Dtype>* transformed_blob) {
+  const int crop_size = param_.crop_size();
+  const int img_channels = cv_img.channels();
+  const int img_height = cv_img.rows;
+  const int img_width = cv_img.cols;
+
+  // Check dimensions.
+  const int channels = transformed_blob->channels();
+  const int height = transformed_blob->height();
+  const int width = transformed_blob->width();
+  const int num = transformed_blob->num();
+
+  CHECK_EQ(channels, img_channels);
+  CHECK_LE(height, img_height);
+  CHECK_LE(width, img_width);
+  CHECK_GE(num, 1);
+
+  CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";
+
+  const Dtype scale = param_.scale();
+  const bool do_mirror = param_.mirror() && Rand(2);
+  const bool has_mean_file = param_.has_mean_file();
+  const bool has_mean_values = mean_values_.size() > 0;
+
+  CHECK_GT(img_channels, 0);
+  CHECK_GE(img_height, crop_size);
+  CHECK_GE(img_width, crop_size);
+
+  Dtype* mean = NULL;
+  if (has_mean_file) {
+    CHECK_EQ(img_channels, data_mean_.channels());
+    CHECK_EQ(img_height, data_mean_.height());
+    CHECK_EQ(img_width, data_mean_.width());
+    mean = data_mean_.mutable_cpu_data();
+  }
+  if (has_mean_values) {
+    CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels)
+        << "Specify either 1 mean_value or as many as channels: "
+        << img_channels;
+    if (img_channels > 1 && mean_values_.size() == 1) {
+      // Replicate the mean_value for simplicity
+      for (int c = 1; c < img_channels; ++c) {
+        mean_values_.push_back(mean_values_[0]);
+      }
+    }
+  }
+
+  int h_off = 0;
+  int w_off = 0;
+  cv::Mat cv_cropped_img = cv_img;
+  if (crop_size) {
+    CHECK_EQ(crop_size, height);
+    CHECK_EQ(crop_size, width);
+    // We only do random crop when we do training.
+    if (phase_ == TRAIN) {
+      h_off = Rand(img_height - crop_size + 1);
+      w_off = Rand(img_width - crop_size + 1);
+    } else {
+      h_off = (img_height - crop_size) / 2;
+      w_off = (img_width - crop_size) / 2;
+    }
+    cv::Rect roi(w_off, h_off, crop_size, crop_size);
+    cv_cropped_img = cv_img(roi);
+  } else {
+    CHECK_EQ(img_height, height);
+    CHECK_EQ(img_width, width);
+  }
+
+  CHECK(cv_cropped_img.data);
+
+  Dtype* transformed_data = transformed_blob->mutable_cpu_data();
+  int top_index;
+  for (int h = 0; h < height; ++h) {
+    const uchar* ptr = cv_cropped_img.ptr < uchar > (h);
+    int img_index = 0;
+    for (int w = 0; w < width; ++w) {
+      for (int c = 0; c < img_channels; ++c) {
+        if (do_mirror) {
+          top_index = (c * height + h) * width + (width - 1 - w);
+        } else {
+          top_index = (c * height + h) * width + w;
+        }
+        // int top_index = (c * height + h) * width + w;
+        Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
+        if (has_mean_file) {
+          int mean_index = (c * img_height + h_off + h) * img_width + w_off + w;
+          transformed_data[top_index] = (pixel - mean[mean_index]) * scale;
+        } else {
+          if (has_mean_values) {
+            transformed_data[top_index] = (pixel - mean_values_[c]) * scale;
+          } else {
+            transformed_data[top_index] = pixel * scale;
+          }
+        }
+      }
+    }
+  }
 }
 
 template <typename Dtype>
 void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
-		Blob<Dtype>* transformed_blob) {
-	const int crop_size = param_.crop_size();
-	const int input_num = input_blob->num();
-	const int input_channels = input_blob->channels();
-	const int input_height = input_blob->height();
-	const int input_width = input_blob->width();
-
-	if (transformed_blob->count() == 0) {
-		// Initialize transformed_blob with the right shape.
-		if (crop_size) {
-			transformed_blob->Reshape(input_num, input_channels,
-					crop_size, crop_size);
-		} else {
-			transformed_blob->Reshape(input_num, input_channels,
-					input_height, input_width);
-		}
-	}
-
-	const int num = transformed_blob->num();
-	const int channels = transformed_blob->channels();
-	const int height = transformed_blob->height();
-	const int width = transformed_blob->width();
-	const int size = transformed_blob->count();
-
-	CHECK_LE(input_num, num);
-	CHECK_EQ(input_channels, channels);
-	CHECK_GE(input_height, height);
-	CHECK_GE(input_width, width);
-
-	const Dtype scale = param_.scale();
-	const bool do_mirror = param_.mirror() && Rand(2);
-	const bool has_mean_file = param_.has_mean_file();
-	const bool has_mean_values = mean_values_.size() > 0;
-
-	int h_off = 0;
-	int w_off = 0;
-	if (crop_size) {
-		CHECK_EQ(crop_size, height);
-		CHECK_EQ(crop_size, width);
-		// We only do random crop when we do training.
-		if (phase_ == TRAIN) {
-			h_off = Rand(input_height - crop_size + 1);
-			w_off = Rand(input_width - crop_size + 1);
-		} else {
-			h_off = (input_height - crop_size) / 2;
-			w_off = (input_width - crop_size) / 2;
-		}
-	} else {
-		CHECK_EQ(input_height, height);
-		CHECK_EQ(input_width, width);
-	}
-
-	Dtype* input_data = input_blob->mutable_cpu_data();
-	if (has_mean_file) {
-		CHECK_EQ(input_channels, data_mean_.channels());
-		CHECK_EQ(input_height, data_mean_.height());
-		CHECK_EQ(input_width, data_mean_.width());
-		for (int n = 0; n < input_num; ++n) {
-			int offset = input_blob->offset(n);
-			caffe_sub(data_mean_.count(), input_data + offset,
-					data_mean_.cpu_data(), input_data + offset);
-		}
-	}
-
-	if (has_mean_values) {
-		CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) <<
-				"Specify either 1 mean_value or as many as channels: "
-				<< input_channels;
-		if (mean_values_.size() == 1) {
-			caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data);
-		} else {
-			for (int n = 0; n < input_num; ++n) {
-				for (int c = 0; c < input_channels; ++c) {
-					int offset = input_blob->offset(n, c);
-					caffe_add_scalar(input_height * input_width, -(mean_values_[c]),
-							input_data + offset);
-				}
-			}
-		}
-	}
-
-	Dtype* transformed_data = transformed_blob->mutable_cpu_data();
-
-	for (int n = 0; n < input_num; ++n) {
-		int top_index_n = n * channels;
-		int data_index_n = n * channels;
-		for (int c = 0; c < channels; ++c) {
-			int top_index_c = (top_index_n + c) * height;
-			int data_index_c = (data_index_n + c) * input_height + h_off;
-			for (int h = 0; h < height; ++h) {
-				int top_index_h = (top_index_c + h) * width;
-				int data_index_h = (data_index_c + h) * input_width + w_off;
-				if (do_mirror) {
-					int top_index_w = top_index_h + width - 1;
-					for (int w = 0; w < width; ++w) {
-						transformed_data[top_index_w - w] = input_data[data_index_h + w];
-					}
-				} else {
-					for (int w = 0; w < width; ++w) {
-						transformed_data[top_index_h + w] = input_data[data_index_h + w];
-					}
-				}
-			}
-		}
-	}
-	if (scale != Dtype(1)) {
-		DLOG(INFO) << "Scale: " << scale;
-		caffe_scal(size, scale, transformed_data);
-	}
+    Blob<Dtype>* transformed_blob) {
+  const int crop_size = param_.crop_size();
+  const int input_num = input_blob->num();
+  const int input_channels = input_blob->channels();
+  const int input_height = input_blob->height();
+  const int input_width = input_blob->width();
+
+  if (transformed_blob->count() == 0) {
+    // Initialize transformed_blob with the right shape.
+    if (crop_size) {
+      transformed_blob->Reshape(input_num, input_channels, crop_size,
+          crop_size);
+    } else {
+      transformed_blob->Reshape(input_num, input_channels, input_height,
+          input_width);
+    }
+  }
+
+  const int num = transformed_blob->num();
+  const int channels = transformed_blob->channels();
+  const int height = transformed_blob->height();
+  const int width = transformed_blob->width();
+  const int size = transformed_blob->count();
+
+  CHECK_LE(input_num, num);
+  CHECK_EQ(input_channels, channels);
+  CHECK_GE(input_height, height);
+  CHECK_GE(input_width, width);
+
+  const Dtype scale = param_.scale();
+  const bool do_mirror = param_.mirror() && Rand(2);
+  const bool has_mean_file = param_.has_mean_file();
+  const bool has_mean_values = mean_values_.size() > 0;
+
+  int h_off = 0;
+  int w_off = 0;
+  if (crop_size) {
+    CHECK_EQ(crop_size, height);
+    CHECK_EQ(crop_size, width);
+    // We only do random crop when we do training.
+    if (phase_ == TRAIN) {
+      h_off = Rand(input_height - crop_size + 1);
+      w_off = Rand(input_width - crop_size + 1);
+    } else {
+      h_off = (input_height - crop_size) / 2;
+      w_off = (input_width - crop_size) / 2;
+    }
+  } else {
+    CHECK_EQ(input_height, height);
+    CHECK_EQ(input_width, width);
+  }
+
+  Dtype* input_data = input_blob->mutable_cpu_data();
+  if (has_mean_file) {
+    CHECK_EQ(input_channels, data_mean_.channels());
+    CHECK_EQ(input_height, data_mean_.height());
+    CHECK_EQ(input_width, data_mean_.width());
+    for (int n = 0; n < input_num; ++n) {
+      int offset = input_blob->offset(n);
+      caffe_sub(data_mean_.count(), input_data + offset, data_mean_.cpu_data(),
+          input_data + offset);
+    }
+  }
+
+  if (has_mean_values) {
+    CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels)
+        << "Specify either 1 mean_value or as many as channels: "
+        << input_channels;
+    if (mean_values_.size() == 1) {
+      caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data);
+    } else {
+      for (int n = 0; n < input_num; ++n) {
+        for (int c = 0; c < input_channels; ++c) {
+          int offset = input_blob->offset(n, c);
+          caffe_add_scalar(input_height * input_width, -(mean_values_[c]),
+              input_data + offset);
+        }
+      }
+    }
+  }
+
+  Dtype* transformed_data = transformed_blob->mutable_cpu_data();
+
+  for (int n = 0; n < input_num; ++n) {
+    int top_index_n = n * channels;
+    int data_index_n = n * channels;
+    for (int c = 0; c < channels; ++c) {
+      int top_index_c = (top_index_n + c) * height;
+      int data_index_c = (data_index_n + c) * input_height + h_off;
+      for (int h = 0; h < height; ++h) {
+        int top_index_h = (top_index_c + h) * width;
+        int data_index_h = (data_index_c + h) * input_width + w_off;
+        if (do_mirror) {
+          int top_index_w = top_index_h + width - 1;
+          for (int w = 0; w < width; ++w) {
+            transformed_data[top_index_w - w] = input_data[data_index_h + w];
+          }
+        } else {
+          for (int w = 0; w < width; ++w) {
+            transformed_data[top_index_h + w] = input_data[data_index_h + w];
+          }
+        }
+      }
+    }
+  }
+  if (scale != Dtype(1)) {
+    DLOG(INFO) << "Scale: " << scale;
+    caffe_scal(size, scale, transformed_data);
+  }
 }
 
 template <typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(const Datum& datum) {
-	if (datum.encoded()) {
-		CHECK(!(param_.force_color() && param_.force_gray()))
-				<< "cannot set both force_color and force_gray";
-		cv::Mat cv_img;
-		if (param_.force_color() || param_.force_gray()) {
-			// If force_color then decode in color otherwise decode in gray.
-			cv_img = DecodeDatumToCVMat(datum, param_.force_color());
-		} else {
-			cv_img = DecodeDatumToCVMatNative(datum);
-		}
-		// InferBlobShape using the cv::image.
-		return InferBlobShape(cv_img);
-	}
-
-	const int crop_size = param_.crop_size();
-	const int datum_channels = datum.channels();
-	const int datum_height = datum.height();
-	const int datum_width = datum.width();
-	// Check dimensions.
-	CHECK_GT(datum_channels, 0);
-	CHECK_GE(datum_height, crop_size);
-	CHECK_GE(datum_width, crop_size);
-	// Build BlobShape.
-	vector<int> shape(4);
-	shape[0] = 1;
-	shape[1] = datum_channels;
-	shape[2] = (crop_size) ? crop_size : datum_height;
-	shape[3] = (crop_size) ? crop_size : datum_width;
-	return shape;
+  if (datum.encoded()) {
+    CHECK(!(param_.force_color() && param_.force_gray()))
+        << "cannot set both force_color and force_gray";
+    cv::Mat cv_img;
+    if (param_.force_color() || param_.force_gray()) {
+      // If force_color then decode in color otherwise decode in gray.
+      cv_img = DecodeDatumToCVMat(datum, param_.force_color());
+    } else {
+      cv_img = DecodeDatumToCVMatNative(datum);
+    }
+    // InferBlobShape using the cv::image.
+    return InferBlobShape(cv_img);
+  }
+
+  const int crop_size = param_.crop_size();
+  const int datum_channels = datum.channels();
+  const int datum_height = datum.height();
+  const int datum_width = datum.width();
+  // Check dimensions.
+  CHECK_GT(datum_channels, 0);
+  CHECK_GE(datum_height, crop_size);
+  CHECK_GE(datum_width, crop_size);
+  // Build BlobShape.
+  vector<int> shape(4);
+  shape[0] = 1;
+  shape[1] = datum_channels;
+  shape[2] = (crop_size) ? crop_size : datum_height;
+  shape[3] = (crop_size) ? crop_size : datum_width;
+  return shape;
 }
 
 template <typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(
-		const vector<Datum> & datum_vector) {
-	const int num = datum_vector.size();
-	CHECK_GT(num, 0) << "There is no datum to in the vector";
-	// Use first datum in the vector to InferBlobShape.
-	vector<int> shape = InferBlobShape(datum_vector[0]);
-	// Adjust num to the size of the vector.
-	shape[0] = num;
-	return shape;
+    const vector<Datum> & datum_vector) {
+  const int num = datum_vector.size();
+  CHECK_GT(num, 0) << "There is no datum to in the vector";
+  // Use first datum in the vector to InferBlobShape.
+  vector<int> shape = InferBlobShape(datum_vector[0]);
+  // Adjust num to the size of the vector.
+  shape[0] = num;
+  return shape;
 }
 
 template <typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(const cv::Mat& cv_img) {
-	const int crop_size = param_.crop_size();
-	const int img_channels = cv_img.channels();
-	const int img_height = cv_img.rows;
-	const int img_width = cv_img.cols;
-	// Check dimensions.
-	CHECK_GT(img_channels, 0);
-	CHECK_GE(img_height, crop_size);
-	CHECK_GE(img_width, crop_size);
-	// Build BlobShape.
-	vector<int> shape(4);
-	shape[0] = 1;
-	shape[1] = img_channels;
-	shape[2] = (crop_size) ? crop_size : img_height;
-	shape[3] = (crop_size) ? crop_size : img_width;
-	return shape;
+  const int crop_size = param_.crop_size();
+  const int img_channels = cv_img.channels();
+  const int img_height = cv_img.rows;
+  const int img_width = cv_img.cols;
+  // Check dimensions.
+  CHECK_GT(img_channels, 0);
+  CHECK_GE(img_height, crop_size);
+  CHECK_GE(img_width, crop_size);
+  // Build BlobShape.
+  vector<int> shape(4);
+  shape[0] = 1;
+  shape[1] = img_channels;
+  shape[2] = (crop_size) ? crop_size : img_height;
+  shape[3] = (crop_size) ? crop_size : img_width;
+  return shape;
 }
 
 template <typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(
-		const vector<cv::Mat> & mat_vector) {
-	const int num = mat_vector.size();
-	CHECK_GT(num, 0) << "There is no cv_img to in the vector";
-	// Use first cv_img in the vector to InferBlobShape.
-	vector<int> shape = InferBlobShape(mat_vector[0]);
-	// Adjust num to the size of the vector.
-	shape[0] = num;
-	return shape;
+    const vector<cv::Mat> & mat_vector) {
+  const int num = mat_vector.size();
+  CHECK_GT(num, 0) << "There is no cv_img to in the vector";
+  // Use first cv_img in the vector to InferBlobShape.
+  vector<int> shape = InferBlobShape(mat_vector[0]);
+  // Adjust num to the size of the vector.
+  shape[0] = num;
+  return shape;
 }
 
 template <typename Dtype>
 void DataTransformer<Dtype>::InitRand() {
-	const bool needs_rand = param_.mirror() ||
-			(phase_ == TRAIN && param_.crop_size());
-	if (needs_rand) {
-		const unsigned int rng_seed = caffe_rng_rand();
-		rng_.reset(new Caffe::RNG(rng_seed));
-	} else {
-		rng_.reset();
-	}
+  const bool needs_rand = param_.mirror()
+      || (phase_ == TRAIN && param_.crop_size());
+  if (needs_rand) {
+    const unsigned int rng_seed = caffe_rng_rand();
+    rng_.reset(new Caffe::RNG(rng_seed));
+  } else {
+    rng_.reset();
+  }
 }
 
 template <typename Dtype>
 int DataTransformer<Dtype>::Rand(int n) {
-	CHECK (rng_);
-	CHECK_GT(n, 0);
-	caffe::rng_t* rng =
-			static_cast<caffe::rng_t*>(rng_->generator());
-	return ((*rng)() % n);
+  CHECK (rng_);
+  CHECK_GT(n, 0);
+  caffe::rng_t* rng = static_cast<caffe::rng_t*>(rng_->generator());
+  return ((*rng)() % n);
 }
 
 INSTANTIATE_CLASS (DataTransformer);
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index 9e53a66a..bb8f9cb6 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -38,414 +38,386 @@ std::string oclKernelPath = "./src/caffe/ocl/";
 Device amdDevice;
 
 Device::~Device() {
-	ReleaseKernels();
-	free((void*) platformIDs);
-	free (DeviceIDs);
-	clReleaseProgram (Program);
-	clReleaseCommandQueue (CommandQueue);
-	clReleaseCommandQueue (CommandQueue_helper);
-	clReleaseContext (Context);
-	LOG(INFO) << "device destructor";
+  ReleaseKernels();
+  free((void*) platformIDs);
+  free (DeviceIDs);
+  clReleaseProgram (Program);
+  clReleaseCommandQueue (CommandQueue);
+  clReleaseCommandQueue (CommandQueue_helper);
+  clReleaseContext (Context);
+  LOG(INFO) << "device destructor";
 }
 
 cl_int Device::Init(int deviceId) {
 
-	DisplayPlatformInfo();
-
-	clGetPlatformIDs(0, NULL, &numPlatforms);
-	cl_platform_id PlatformIDs[numPlatforms];
-	clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
-
-	size_t nameLen;
-	cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64,
-			platformName, &nameLen);
-	if (res != CL_SUCCESS) {
-		fprintf(stderr, "Err: Failed to Get Platform Info\n");
-		return 0;
-	}
-	platformName[nameLen] = 0;
-
-	GetDeviceInfo();
-	cl_uint uiNumDevices;
-	cl_bool unified_memory = false;
-	clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
-	uiNumDevices = numDevices;
-	if (0 == uiNumDevices) {
-		LOG(FATAL) << "Err: No GPU devices";
-	} else {
-		pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id));
-		OCL_CHECK(
-				clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices,
-						pDevices,
-						&uiNumDevices));
-		if (deviceId == -1) {
-			int i;
-			for (i = 0; i < (int) uiNumDevices; i++) {
-				clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY,
-						sizeof(cl_bool), &unified_memory, NULL);
-				if (!unified_memory) { //skip iGPU
-					//we pick the first dGPU we found
-					pDevices[0] = pDevices[i];
-					device_id = i;
-					LOG(INFO) << "Picked default device type : dGPU " << device_id;
-					break;
-				}
-			}
-			if (i == uiNumDevices) {
-				LOG(FATAL) << "Cannot find any dGPU! ";
-			}
-		} else if (deviceId >= 0 && deviceId < uiNumDevices) {
-			pDevices[0] = pDevices[deviceId];
-			device_id = deviceId;
-			LOG(INFO) << "Picked device type : GPU " << device_id;
-		} else {
-			LOG(FATAL) << "  Invalid GPU deviceId! ";
-		}
-	}
-
-	Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
-	if (NULL == Context) {
-		fprintf(stderr, "Err: Failed to Create Context\n");
-		return 0;
-	}
-	CommandQueue = clCreateCommandQueue(Context, pDevices[0],
-			CL_QUEUE_PROFILING_ENABLE, NULL);
-	CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0],
-			CL_QUEUE_PROFILING_ENABLE, NULL);
-	if (NULL == CommandQueue || NULL == CommandQueue_helper) {
-		fprintf(stderr, "Err: Failed to Create Commandqueue\n");
-		return 0;
-	}
-	BuildProgram (oclKernelPath);
-	row = clblasRowMajor;
-	col = clblasColumnMajor;
-	return 0;
+  DisplayPlatformInfo();
+
+  clGetPlatformIDs(0, NULL, &numPlatforms);
+  cl_platform_id PlatformIDs[numPlatforms];
+  clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
+
+  size_t nameLen;
+  cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64,
+      platformName, &nameLen);
+  if (res != CL_SUCCESS) {
+    fprintf(stderr, "Err: Failed to Get Platform Info\n");
+    return 0;
+  }
+  platformName[nameLen] = 0;
+
+  GetDeviceInfo();
+  cl_uint uiNumDevices;
+  cl_bool unified_memory = false;
+  clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+  uiNumDevices = numDevices;
+  if (0 == uiNumDevices) {
+    LOG(FATAL) << "Err: No GPU devices";
+  } else {
+    pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id));
+    OCL_CHECK(
+        clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices,
+            pDevices, &uiNumDevices));
+    if (deviceId == -1) {
+      int i;
+      for (i = 0; i < (int) uiNumDevices; i++) {
+        clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY,
+            sizeof(cl_bool), &unified_memory, NULL);
+        if (!unified_memory) { //skip iGPU
+          //we pick the first dGPU we found
+          pDevices[0] = pDevices[i];
+          device_id = i;
+          LOG(INFO) << "Picked default device type : dGPU " << device_id;
+          break;
+        }
+      }
+      if (i == uiNumDevices) {
+        LOG(FATAL) << "Cannot find any dGPU! ";
+      }
+    } else if (deviceId >= 0 && deviceId < uiNumDevices) {
+      pDevices[0] = pDevices[deviceId];
+      device_id = deviceId;
+      LOG(INFO) << "Picked device type : GPU " << device_id;
+    } else {
+      LOG(FATAL) << "  Invalid GPU deviceId! ";
+    }
+  }
+
+  Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL);
+  if (NULL == Context) {
+    fprintf(stderr, "Err: Failed to Create Context\n");
+    return 0;
+  }
+  CommandQueue = clCreateCommandQueue(Context, pDevices[0],
+      CL_QUEUE_PROFILING_ENABLE, NULL);
+  CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0],
+      CL_QUEUE_PROFILING_ENABLE, NULL);
+  if (NULL == CommandQueue || NULL == CommandQueue_helper) {
+    fprintf(stderr, "Err: Failed to Create Commandqueue\n");
+    return 0;
+  }
+  BuildProgram (oclKernelPath);
+  row = clblasRowMajor;
+  col = clblasColumnMajor;
+  return 0;
 }
 
-void Device::BuildProgram(std::string kernel_dir)
-		{
-	std::string strSource = "";
-	DIR *ocl_dir;
-	struct dirent *dirp;
-	if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL)
-			{
-		fprintf(stderr, "Err: Open ocl dir failed!\n");
-	}
-	while ((dirp = readdir(ocl_dir)) != NULL)
-	{
-		//Ignore hidden files
-		if (dirp->d_name[0] == '.')
-			continue;
-		std::string file_name = std::string(dirp->d_name);
-		//Skip non *.cl files
-		size_t last_dot_pos = file_name.find_last_of(".");
-		if (file_name.substr(last_dot_pos + 1) != "cl")
-			continue;
-
-		std::string ocl_kernel_full_path = kernel_dir + file_name;
-		std::string tmpSource = "";
-		ConvertToString(ocl_kernel_full_path.c_str(), tmpSource);
-		strSource += tmpSource;
-	}
-	const char *pSource;
-	pSource = strSource.c_str();
-	size_t uiArrSourceSize[] = { 0 };
-	uiArrSourceSize[0] = strlen(pSource);
-	Program = NULL;
-	Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize,
-			NULL);
-	if (NULL == Program) {
-		fprintf(stderr, "Err: Failed to create program\n");
-	}
-	cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(),
-			NULL, NULL);
-	LOG(INFO) << "Build Program";
-	if (CL_SUCCESS != iStatus) {
-		fprintf(stderr, "Err: Failed to build program\n");
-		char szBuildLog[16384];
-		clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG,
-				sizeof(szBuildLog), szBuildLog, NULL);
-		std::cout << szBuildLog;
-		clReleaseProgram (Program);
-	}
+void Device::BuildProgram(std::string kernel_dir) {
+  std::string strSource = "";
+  DIR *ocl_dir;
+  struct dirent *dirp;
+  if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL) {
+    fprintf(stderr, "Err: Open ocl dir failed!\n");
+  }
+  while ((dirp = readdir(ocl_dir)) != NULL) {
+    //Ignore hidden files
+    if (dirp->d_name[0] == '.')
+      continue;
+    std::string file_name = std::string(dirp->d_name);
+    //Skip non *.cl files
+    size_t last_dot_pos = file_name.find_last_of(".");
+    if (file_name.substr(last_dot_pos + 1) != "cl")
+      continue;
+
+    std::string ocl_kernel_full_path = kernel_dir + file_name;
+    std::string tmpSource = "";
+    ConvertToString(ocl_kernel_full_path.c_str(), tmpSource);
+    strSource += tmpSource;
+  }
+  const char *pSource;
+  pSource = strSource.c_str();
+  size_t uiArrSourceSize[] = { 0 };
+  uiArrSourceSize[0] = strlen(pSource);
+  Program = NULL;
+  Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize,
+      NULL);
+  if (NULL == Program) {
+    fprintf(stderr, "Err: Failed to create program\n");
+  }
+  cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(),
+      NULL, NULL);
+  LOG(INFO) << "Build Program";
+  if (CL_SUCCESS != iStatus) {
+    fprintf(stderr, "Err: Failed to build program\n");
+    char szBuildLog[16384];
+    clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG,
+        sizeof(szBuildLog), szBuildLog, NULL);
+    std::cout << szBuildLog;
+    clReleaseProgram (Program);
+  }
 }
 
 //Use to read OpenCL source code
 cl_int Device::ConvertToString(std::string pFileName, std::string &Str) {
-	size_t uiSize = 0;
-	size_t uiFileSize = 0;
-	char *pStr = NULL;
-	char *tmp = (char*) pFileName.data();
-	std::fstream fFile(tmp, (std::fstream::in | std::fstream::binary));
-	if (fFile.is_open()) {
-		fFile.seekg(0, std::fstream::end);
-		uiSize = uiFileSize = (size_t) fFile.tellg();
-		fFile.seekg(0, std::fstream::beg);
-		pStr = new char[uiSize + 1];
-
-		if (NULL == pStr) {
-			fFile.close();
-			return 0;
-		}
-		fFile.read(pStr, uiFileSize);
-		fFile.close();
-		pStr[uiSize] = '\0';
-		Str = pStr;
-		delete[] pStr;
-		return 0;
-	}
-	LOG(ERROR) << "Err: Failed to open cl file!";
-	return -1;
+  size_t uiSize = 0;
+  size_t uiFileSize = 0;
+  char *pStr = NULL;
+  char *tmp = (char*) pFileName.data();
+  std::fstream fFile(tmp, (std::fstream::in | std::fstream::binary));
+  if (fFile.is_open()) {
+    fFile.seekg(0, std::fstream::end);
+    uiSize = uiFileSize = (size_t) fFile.tellg();
+    fFile.seekg(0, std::fstream::beg);
+    pStr = new char[uiSize + 1];
+
+    if (NULL == pStr) {
+      fFile.close();
+      return 0;
+    }
+    fFile.read(pStr, uiFileSize);
+    fFile.close();
+    pStr[uiSize] = '\0';
+    Str = pStr;
+    delete[] pStr;
+    return 0;
+  }
+  LOG(ERROR) << "Err: Failed to open cl file!";
+  return -1;
 }
 
-cl_kernel Device::GetKernel(std::string kernel_name)
-		{
-	std::map<std::string, cl_kernel>::iterator it = Kernels.find(kernel_name);
-	if (it == Kernels.end())
-			{
-		cl_int _err = 0;
-		cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err);
-		OCL_CHECK(_err);
-		Kernels[kernel_name] = kernel;
-	}
-	return Kernels[kernel_name];
+cl_kernel Device::GetKernel(std::string kernel_name) {
+  std::map<std::string, cl_kernel>::iterator it = Kernels.find(kernel_name);
+  if (it == Kernels.end()) {
+    cl_int _err = 0;
+    cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err);
+    OCL_CHECK(_err);
+    Kernels[kernel_name] = kernel;
+  }
+  return Kernels[kernel_name];
 }
 
-void Device::ReleaseKernels()
-{
-	std::map<std::string, cl_kernel>::iterator it;
-	for (it = Kernels.begin(); it != Kernels.end(); it++)
-			{
-		clReleaseKernel(it->second);
-	}
+void Device::ReleaseKernels() {
+  std::map<std::string, cl_kernel>::iterator it;
+  for (it = Kernels.begin(); it != Kernels.end(); it++) {
+    clReleaseKernel(it->second);
+  }
 }
 
 void Device::DisplayPlatformInfo() {
-	cl_int err;
-
-	err = clGetPlatformIDs(0, NULL, &numPlatforms);
-	if (err != CL_SUCCESS || numPlatforms <= 0)
-			{
-		LOG(ERROR) << "Failed to find any OpenCL platform.";
-		return;
-	}
-
-	platformIDs = (cl_platform_id *) malloc(
-			sizeof(cl_platform_id) * numPlatforms);
-	err = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
-	if (err != CL_SUCCESS)
-			{
-		LOG(ERROR) << "Failed to find any OpenCL platform.";
-		return;
-	}
-
-	LOG(INFO) << "Number of platforms found:" << numPlatforms;
-
-	//iterate through the list of platforms displaying platform information
-	for (cl_uint i = 0; i < numPlatforms; i++) {
-		DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME");
-		DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE");
-		DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION");
-		DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR");
-		DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS,
-				"CL_PLATFORM_EXTENSIONS");
-	}
+  cl_int err;
+
+  err = clGetPlatformIDs(0, NULL, &numPlatforms);
+  if (err != CL_SUCCESS || numPlatforms <= 0) {
+    LOG(ERROR) << "Failed to find any OpenCL platform.";
+    return;
+  }
+
+  platformIDs = (cl_platform_id *) malloc(
+      sizeof(cl_platform_id) * numPlatforms);
+  err = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "Failed to find any OpenCL platform.";
+    return;
+  }
+
+  LOG(INFO) << "Number of platforms found:" << numPlatforms;
+
+  //iterate through the list of platforms displaying platform information
+  for (cl_uint i = 0; i < numPlatforms; i++) {
+    DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME");
+    DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE");
+    DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION");
+    DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR");
+    DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS,
+        "CL_PLATFORM_EXTENSIONS");
+  }
 
 }
 
 void Device::DisplayInfo(cl_platform_id id, cl_platform_info name,
-		std::string str) {
-	cl_int err;
-	std::size_t paramValueSize;
-
-	err = clGetPlatformInfo(id, name, 0, NULL, &paramValueSize);
-	if (err != CL_SUCCESS)
-			{
-		LOG(ERROR) << "Failed to find OpenCL platform:" << str;
-		return;
-	}
-
-	char * info = (char *) alloca(sizeof(char) * paramValueSize);
-	err = clGetPlatformInfo(id, name, paramValueSize, info, NULL);
-	if (err != CL_SUCCESS)
-			{
-		LOG(ERROR) << "Failed to find OpenCL platform:" << str;
-		return;
-	}
-
-	LOG(INFO) << "\t" << str << "\t" << info;
+    std::string str) {
+  cl_int err;
+  std::size_t paramValueSize;
+
+  err = clGetPlatformInfo(id, name, 0, NULL, &paramValueSize);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "Failed to find OpenCL platform:" << str;
+    return;
+  }
+
+  char * info = (char *) alloca(sizeof(char) * paramValueSize);
+  err = clGetPlatformInfo(id, name, paramValueSize, info, NULL);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "Failed to find OpenCL platform:" << str;
+    return;
+  }
+
+  LOG(INFO) << "\t" << str << "\t" << info;
 }
 
 void Device::GetDeviceInfo() {
-	cl_int err;
-	//by default, we select the first platform. can be extended for more platforms
-	//query GPU device for now
-	err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL,
-			&numDevices);
-	// we allow program run if no GPU is found. Just return. No error reported.
-	if (numDevices < 1)
-			{
-		LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0];
-		LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0];
-		return;
-	}
-
-	DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices);
-	err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices,
-			DeviceIDs, NULL);
-	if (err != CL_SUCCESS)
-			{
-		LOG(INFO) << "Failed to find any GPU devices.";
-		return;
-	}
-
-	LOG(INFO) << "Number of devices found:" << numDevices;
-	for (cl_uint i = 0; i < numDevices; i++) {
-		LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i];
-		DisplayDeviceInfo < cl_device_type
-				> (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
-		DisplayDeviceInfo < cl_bool
-				> (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
-		DisplayDeviceInfo < cl_uint
-				> (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
-		DisplayDeviceInfo < cl_bool
-				> (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
-		DisplayDeviceInfo < cl_bool
-				> (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
-		DisplayDeviceInfo < cl_bool
-				> (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
-		DisplayDeviceInfo < cl_uint
-				> (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
-		DisplayDeviceInfo < size_t
-				> (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
-		DisplayDeviceInfo < cl_uint
-				> (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
-		DisplayDeviceInfo<size_t *>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES,
-				"Max work item sizes");
-		DisplayDeviceInfo < cl_command_queue_properties
-				> (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
-		DisplayDeviceInfo < cl_device_exec_capabilities
-				> (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
-		DisplayDeviceInfo < cl_ulong
-				> (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
-		DisplayDeviceInfo < cl_ulong
-				> (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
-		DisplayDeviceInfo < cl_ulong
-				> (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
-	}
+  cl_int err;
+  //by default, we select the first platform. can be extended for more platforms
+  //query GPU device for now
+  err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL,
+      &numDevices);
+  // we allow program run if no GPU is found. Just return. No error reported.
+  if (numDevices < 1) {
+    LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0];
+    LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0];
+    return;
+  }
+
+  DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices);
+  err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices,
+      DeviceIDs, NULL);
+  if (err != CL_SUCCESS) {
+    LOG(INFO) << "Failed to find any GPU devices.";
+    return;
+  }
+
+  LOG(INFO) << "Number of devices found:" << numDevices;
+  for (cl_uint i = 0; i < numDevices; i++) {
+    LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i];
+    DisplayDeviceInfo < cl_device_type
+        > (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type");
+    DisplayDeviceInfo < cl_bool
+        > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?");
+    DisplayDeviceInfo < cl_uint
+        > (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz");
+    DisplayDeviceInfo < cl_bool
+        > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem");
+    DisplayDeviceInfo < cl_bool
+        > (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support");
+    DisplayDeviceInfo < cl_bool
+        > (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little");
+    DisplayDeviceInfo < cl_uint
+        > (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units");
+    DisplayDeviceInfo < size_t
+        > (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size");
+    DisplayDeviceInfo < cl_uint
+        > (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions");
+    DisplayDeviceInfo<size_t *>(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES,
+        "Max work item sizes");
+    DisplayDeviceInfo < cl_command_queue_properties
+        > (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES");
+    DisplayDeviceInfo < cl_device_exec_capabilities
+        > (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES");
+    DisplayDeviceInfo < cl_ulong
+        > (DeviceIDs[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size");
+    DisplayDeviceInfo < cl_ulong
+        > (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size");
+    DisplayDeviceInfo < cl_ulong
+        > (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size");
+  }
 
 }
 
-void Device::DeviceQuery()
-{
-	DisplayPlatformInfo();
+void Device::DeviceQuery() {
+  DisplayPlatformInfo();
 
-	clGetPlatformIDs(0, NULL, &numPlatforms);
-	cl_platform_id PlatformIDs[numPlatforms];
-	clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
+  clGetPlatformIDs(0, NULL, &numPlatforms);
+  cl_platform_id PlatformIDs[numPlatforms];
+  clGetPlatformIDs(numPlatforms, PlatformIDs, NULL);
 
-	size_t nameLen;
-	cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64,
-			platformName, &nameLen);
-	if (res != CL_SUCCESS) {
-		fprintf(stderr, "Err: Failed to Get Platform Info\n");
-		return;
-	}
-	platformName[nameLen] = 0;
+  size_t nameLen;
+  cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64,
+      platformName, &nameLen);
+  if (res != CL_SUCCESS) {
+    fprintf(stderr, "Err: Failed to Get Platform Info\n");
+    return;
+  }
+  platformName[nameLen] = 0;
 
-	GetDeviceInfo();
+  GetDeviceInfo();
 }
 
 template <typename T>
 void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name,
-		std::string str) {
-	cl_int err;
-	std::size_t paramValueSize;
-
-	err = clGetDeviceInfo(id, name, 0, NULL, &paramValueSize);
-	if (err != CL_SUCCESS)
-			{
-		LOG(ERROR) << "Failed to find OpenCL device info:" << str;
-		return;
-	}
-
-	std::string content;
-	T * info = (T *) alloca(sizeof(T) * paramValueSize);
-	err = clGetDeviceInfo(id, name, paramValueSize, info, NULL);
-	if (err != CL_SUCCESS)
-			{
-		LOG(ERROR) << "Failed to find OpenCL device info:" << str;
-		return;
-	}
-
-	switch (name) {
-		case CL_DEVICE_TYPE:
-			{
-			std::string deviceType;
-			appendBitfield < cl_device_type
-					> (
-					*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType);
-
-			appendBitfield < cl_device_type
-					> (
-					*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType);
-
-			appendBitfield < cl_device_type
-					> (
-					*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType);
-
-			appendBitfield < cl_device_type
-					> (
-					*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType);
-
-			LOG(INFO) << "\t " << str << ":\t" << deviceType;
-		}
-			break;
-		case CL_DEVICE_EXECUTION_CAPABILITIES:
-			{
-			std::string memType;
-			appendBitfield < cl_device_exec_capabilities
-					> (
-					*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType);
-
-			appendBitfield < cl_device_exec_capabilities
-					> (
-					*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType);
-
-			LOG(INFO) << "\t " << str << ":\t" << memType;
-
-		}
-			break;
-		case CL_DEVICE_QUEUE_PROPERTIES:
-			{
-			std::string memType;
-			appendBitfield < cl_device_exec_capabilities
-					> (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType);
-
-			appendBitfield < cl_device_exec_capabilities
-					> (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType);
-
-			LOG(INFO) << "\t " << str << ":\t" << memType;
-		}
-			break;
-		default:
-			LOG(INFO) << "\t" << str << ":\t" << *info;
-			break;
-	}
+    std::string str) {
+  cl_int err;
+  std::size_t paramValueSize;
+
+  err = clGetDeviceInfo(id, name, 0, NULL, &paramValueSize);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "Failed to find OpenCL device info:" << str;
+    return;
+  }
+
+  std::string content;
+  T * info = (T *) alloca(sizeof(T) * paramValueSize);
+  err = clGetDeviceInfo(id, name, paramValueSize, info, NULL);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "Failed to find OpenCL device info:" << str;
+    return;
+  }
+
+  switch (name) {
+  case CL_DEVICE_TYPE: {
+    std::string deviceType;
+    appendBitfield < cl_device_type
+        > (*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType);
+
+    appendBitfield < cl_device_type
+        > (*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType);
+
+    appendBitfield < cl_device_type
+        > (*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType);
+
+    appendBitfield < cl_device_type
+        > (*(reinterpret_cast<cl_device_type*>(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType);
+
+    LOG(INFO) << "\t " << str << ":\t" << deviceType;
+  }
+    break;
+  case CL_DEVICE_EXECUTION_CAPABILITIES: {
+    std::string memType;
+    appendBitfield < cl_device_exec_capabilities
+        > (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType);
+
+    appendBitfield < cl_device_exec_capabilities
+        > (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType);
+
+    LOG(INFO) << "\t " << str << ":\t" << memType;
+
+  }
+    break;
+  case CL_DEVICE_QUEUE_PROPERTIES: {
+    std::string memType;
+    appendBitfield < cl_device_exec_capabilities
+        > (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType);
+
+    appendBitfield < cl_device_exec_capabilities
+        > (*(reinterpret_cast<cl_device_exec_capabilities*>(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType);
+
+    LOG(INFO) << "\t " << str << ":\t" << memType;
+  }
+    break;
+  default:
+    LOG(INFO) << "\t" << str << ":\t" << *info;
+    break;
+  }
 
 }
 
 template <typename T>
-void Device::appendBitfield(T info, T value, std::string name, std::string &str)
-		{
-	if (info & value)
-			{
-		if (str.length() > 0)
-				{
-			str.append(" | ");
-		}
-		str.append(name);
-	}
+void Device::appendBitfield(T info, T value, std::string name,
+    std::string &str) {
+  if (info & value) {
+    if (str.length() > 0) {
+      str.append(" | ");
+    }
+    str.append(name);
+  }
 }
 
 }  // namespace caffe
diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp
index ba302ba8..fb512847 100644
--- a/src/caffe/internal_thread.cpp
+++ b/src/caffe/internal_thread.cpp
@@ -4,36 +4,36 @@
 namespace caffe {
 
 InternalThread::~InternalThread() {
-	WaitForInternalThreadToExit();
+  WaitForInternalThreadToExit();
 }
 
 bool InternalThread::is_started() const {
-	return thread_.get() != NULL && thread_->joinable();
+  return thread_.get() != NULL && thread_->joinable();
 }
 
 bool InternalThread::StartInternalThread() {
-	if (!WaitForInternalThreadToExit()) {
-		return false;
-	}
-	try {
-		thread_.reset(
-				new boost::thread(&InternalThread::InternalThreadEntry, this));
-	} catch (...) {
-		return false;
-	}
-	return true;
+  if (!WaitForInternalThreadToExit()) {
+    return false;
+  }
+  try {
+    thread_.reset(
+        new boost::thread(&InternalThread::InternalThreadEntry, this));
+  } catch (...) {
+    return false;
+  }
+  return true;
 }
 
 /** Will not return until the internal thread has exited. */
 bool InternalThread::WaitForInternalThreadToExit() {
-	if (is_started()) {
-		try {
-			thread_->join();
-		} catch (...) {
-			return false;
-		}
-	}
-	return true;
+  if (is_started()) {
+    try {
+      thread_->join();
+    } catch (...) {
+      return false;
+    }
+  }
+  return true;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index a720ee92..44233c98 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -18,24 +18,23 @@ namespace caffe {
 
 // Get convolution layer according to engine.
 template <typename Dtype>
-shared_ptr<Layer<Dtype> > GetConvolutionLayer(
-		const LayerParameter& param) {
-	ConvolutionParameter_Engine engine = param.convolution_param().engine();
-	if (engine == ConvolutionParameter_Engine_DEFAULT) {
-		engine = ConvolutionParameter_Engine_CAFFE;
+shared_ptr<Layer<Dtype> > GetConvolutionLayer(const LayerParameter& param) {
+  ConvolutionParameter_Engine engine = param.convolution_param().engine();
+  if (engine == ConvolutionParameter_Engine_DEFAULT) {
+    engine = ConvolutionParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-		engine = ConvolutionParameter_Engine_CUDNN;
+    engine = ConvolutionParameter_Engine_CUDNN;
 #endif
-	}
-	if (engine == ConvolutionParameter_Engine_CAFFE) {
-		return shared_ptr < Layer<Dtype> > (new ConvolutionLayer<Dtype>(param));
+  }
+  if (engine == ConvolutionParameter_Engine_CAFFE) {
+    return shared_ptr < Layer<Dtype> > (new ConvolutionLayer<Dtype>(param));
 #ifdef USE_CUDNN
-	} else if (engine == ConvolutionParameter_Engine_CUDNN) {
-		return shared_ptr<Layer<Dtype> >(new CuDNNConvolutionLayer<Dtype>(param));
+  } else if (engine == ConvolutionParameter_Engine_CUDNN) {
+    return shared_ptr<Layer<Dtype> >(new CuDNNConvolutionLayer<Dtype>(param));
 #endif
-	} else {
-		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-	}
+  } else {
+    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+  }
 }
 
 REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer);
@@ -43,29 +42,29 @@ REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer);
 // Get pooling layer according to engine.
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
-	PoolingParameter_Engine engine = param.pooling_param().engine();
-	if (engine == PoolingParameter_Engine_DEFAULT) {
-		engine = PoolingParameter_Engine_CAFFE;
+  PoolingParameter_Engine engine = param.pooling_param().engine();
+  if (engine == PoolingParameter_Engine_DEFAULT) {
+    engine = PoolingParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-		engine = PoolingParameter_Engine_CUDNN;
+    engine = PoolingParameter_Engine_CUDNN;
 #endif
-	}
-	if (engine == PoolingParameter_Engine_CAFFE) {
-		return shared_ptr < Layer<Dtype> > (new PoolingLayer<Dtype>(param));
+  }
+  if (engine == PoolingParameter_Engine_CAFFE) {
+    return shared_ptr < Layer<Dtype> > (new PoolingLayer<Dtype>(param));
 #ifdef USE_CUDNN
-	} else if (engine == PoolingParameter_Engine_CUDNN) {
-		PoolingParameter p_param = param.pooling_param();
-		if (p_param.pad() || p_param.pad_h() || p_param.pad_w() ||
-				param.top_size() > 1) {
-			LOG(INFO) << "CUDNN does not support padding or multiple tops. "
-			<< "Using Caffe's own pooling layer.";
-			return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
-		}
-		return shared_ptr<Layer<Dtype> >(new CuDNNPoolingLayer<Dtype>(param));
+  } else if (engine == PoolingParameter_Engine_CUDNN) {
+    PoolingParameter p_param = param.pooling_param();
+    if (p_param.pad() || p_param.pad_h() || p_param.pad_w() ||
+        param.top_size() > 1) {
+      LOG(INFO) << "CUDNN does not support padding or multiple tops. "
+      << "Using Caffe's own pooling layer.";
+      return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
+    }
+    return shared_ptr<Layer<Dtype> >(new CuDNNPoolingLayer<Dtype>(param));
 #endif
-	} else {
-		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-	}
+  } else {
+    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+  }
 }
 
 REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer);
@@ -73,22 +72,22 @@ REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer);
 // Get relu layer according to engine.
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetReLULayer(const LayerParameter& param) {
-	ReLUParameter_Engine engine = param.relu_param().engine();
-	if (engine == ReLUParameter_Engine_DEFAULT) {
-		engine = ReLUParameter_Engine_CAFFE;
+  ReLUParameter_Engine engine = param.relu_param().engine();
+  if (engine == ReLUParameter_Engine_DEFAULT) {
+    engine = ReLUParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-		engine = ReLUParameter_Engine_CUDNN;
+    engine = ReLUParameter_Engine_CUDNN;
 #endif
-	}
-	if (engine == ReLUParameter_Engine_CAFFE) {
-		return shared_ptr < Layer<Dtype> > (new ReLULayer<Dtype>(param));
+  }
+  if (engine == ReLUParameter_Engine_CAFFE) {
+    return shared_ptr < Layer<Dtype> > (new ReLULayer<Dtype>(param));
 #ifdef USE_CUDNN
-	} else if (engine == ReLUParameter_Engine_CUDNN) {
-		return shared_ptr<Layer<Dtype> >(new CuDNNReLULayer<Dtype>(param));
+  } else if (engine == ReLUParameter_Engine_CUDNN) {
+    return shared_ptr<Layer<Dtype> >(new CuDNNReLULayer<Dtype>(param));
 #endif
-	} else {
-		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-	}
+  } else {
+    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+  }
 }
 
 REGISTER_LAYER_CREATOR(ReLU, GetReLULayer);
@@ -96,22 +95,22 @@ REGISTER_LAYER_CREATOR(ReLU, GetReLULayer);
 // Get sigmoid layer according to engine.
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetSigmoidLayer(const LayerParameter& param) {
-	SigmoidParameter_Engine engine = param.sigmoid_param().engine();
-	if (engine == SigmoidParameter_Engine_DEFAULT) {
-		engine = SigmoidParameter_Engine_CAFFE;
+  SigmoidParameter_Engine engine = param.sigmoid_param().engine();
+  if (engine == SigmoidParameter_Engine_DEFAULT) {
+    engine = SigmoidParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-		engine = SigmoidParameter_Engine_CUDNN;
+    engine = SigmoidParameter_Engine_CUDNN;
 #endif
-	}
-	if (engine == SigmoidParameter_Engine_CAFFE) {
-		return shared_ptr < Layer<Dtype> > (new SigmoidLayer<Dtype>(param));
+  }
+  if (engine == SigmoidParameter_Engine_CAFFE) {
+    return shared_ptr < Layer<Dtype> > (new SigmoidLayer<Dtype>(param));
 #ifdef USE_CUDNN
-	} else if (engine == SigmoidParameter_Engine_CUDNN) {
-		return shared_ptr<Layer<Dtype> >(new CuDNNSigmoidLayer<Dtype>(param));
+  } else if (engine == SigmoidParameter_Engine_CUDNN) {
+    return shared_ptr<Layer<Dtype> >(new CuDNNSigmoidLayer<Dtype>(param));
 #endif
-	} else {
-		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-	}
+  } else {
+    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+  }
 }
 
 REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer);
@@ -119,22 +118,22 @@ REGISTER_LAYER_CREATOR(Sigmoid, GetSigmoidLayer);
 // Get softmax layer according to engine.
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetSoftmaxLayer(const LayerParameter& param) {
-	SoftmaxParameter_Engine engine = param.softmax_param().engine();
-	if (engine == SoftmaxParameter_Engine_DEFAULT) {
-		engine = SoftmaxParameter_Engine_CAFFE;
+  SoftmaxParameter_Engine engine = param.softmax_param().engine();
+  if (engine == SoftmaxParameter_Engine_DEFAULT) {
+    engine = SoftmaxParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-		engine = SoftmaxParameter_Engine_CUDNN;
+    engine = SoftmaxParameter_Engine_CUDNN;
 #endif
-	}
-	if (engine == SoftmaxParameter_Engine_CAFFE) {
-		return shared_ptr < Layer<Dtype> > (new SoftmaxLayer<Dtype>(param));
+  }
+  if (engine == SoftmaxParameter_Engine_CAFFE) {
+    return shared_ptr < Layer<Dtype> > (new SoftmaxLayer<Dtype>(param));
 #ifdef USE_CUDNN
-	} else if (engine == SoftmaxParameter_Engine_CUDNN) {
-		return shared_ptr<Layer<Dtype> >(new CuDNNSoftmaxLayer<Dtype>(param));
+  } else if (engine == SoftmaxParameter_Engine_CUDNN) {
+    return shared_ptr<Layer<Dtype> >(new CuDNNSoftmaxLayer<Dtype>(param));
 #endif
-	} else {
-		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-	}
+  } else {
+    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+  }
 }
 
 REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer);
@@ -142,22 +141,22 @@ REGISTER_LAYER_CREATOR(Softmax, GetSoftmaxLayer);
 // Get tanh layer according to engine.
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetTanHLayer(const LayerParameter& param) {
-	TanHParameter_Engine engine = param.tanh_param().engine();
-	if (engine == TanHParameter_Engine_DEFAULT) {
-		engine = TanHParameter_Engine_CAFFE;
+  TanHParameter_Engine engine = param.tanh_param().engine();
+  if (engine == TanHParameter_Engine_DEFAULT) {
+    engine = TanHParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-		engine = TanHParameter_Engine_CUDNN;
+    engine = TanHParameter_Engine_CUDNN;
 #endif
-	}
-	if (engine == TanHParameter_Engine_CAFFE) {
-		return shared_ptr < Layer<Dtype> > (new TanHLayer<Dtype>(param));
+  }
+  if (engine == TanHParameter_Engine_CAFFE) {
+    return shared_ptr < Layer<Dtype> > (new TanHLayer<Dtype>(param));
 #ifdef USE_CUDNN
-	} else if (engine == TanHParameter_Engine_CUDNN) {
-		return shared_ptr<Layer<Dtype> >(new CuDNNTanHLayer<Dtype>(param));
+  } else if (engine == TanHParameter_Engine_CUDNN) {
+    return shared_ptr<Layer<Dtype> >(new CuDNNTanHLayer<Dtype>(param));
 #endif
-	} else {
-		LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
-	}
+  } else {
+    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+  }
 }
 
 REGISTER_LAYER_CREATOR(TanH, GetTanHLayer);
@@ -165,15 +164,15 @@ REGISTER_LAYER_CREATOR(TanH, GetTanHLayer);
 #ifdef WITH_PYTHON_LAYER
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetPythonLayer(const LayerParameter& param) {
-	Py_Initialize();
-	try {
-		bp::object module = bp::import(param.python_param().module().c_str());
-		bp::object layer = module.attr(param.python_param().layer().c_str())(param);
-		return bp::extract<shared_ptr<PythonLayer<Dtype> > >(layer)();
-	} catch (bp::error_already_set) {
-		PyErr_Print();
-		throw;
-	}
+  Py_Initialize();
+  try {
+    bp::object module = bp::import(param.python_param().module().c_str());
+    bp::object layer = module.attr(param.python_param().layer().c_str())(param);
+    return bp::extract<shared_ptr<PythonLayer<Dtype> > >(layer)();
+  } catch (bp::error_already_set) {
+    PyErr_Print();
+    throw;
+  }
 }
 
 REGISTER_LAYER_CREATOR(Python, GetPythonLayer);
diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index 5dc99b75..945162af 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -8,53 +8,53 @@ namespace caffe {
 
 template <typename Dtype>
 void AbsValLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
-	CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not "
-			"allow in-place computation.";
+    const vector<Blob<Dtype>*>& top) {
+  NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+  CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not "
+      "allow in-place computation.";
 }
 
 template <typename Dtype>
-void AbsValLayer<Dtype>::Forward_cpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	const int count = top[0]->count();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	caffe_abs(count, bottom[0]->cpu_data(), top_data);
+void AbsValLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const int count = top[0]->count();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  caffe_abs(count, bottom[0]->cpu_data(), top_data);
 }
 
 template <typename Dtype>
 void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const int count = top[0]->count();
-	const Dtype* top_diff = top[0]->cpu_diff();
-	if (propagate_down[0]) {
-		const Dtype* bottom_data = bottom[0]->cpu_data();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		caffe_cpu_sign(count, bottom_data, bottom_diff);
-		caffe_mul(count, bottom_diff, top_diff, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const int count = top[0]->count();
+  const Dtype* top_diff = top[0]->cpu_diff();
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    caffe_cpu_sign(count, bottom_data, bottom_diff);
+    caffe_mul(count, bottom_diff, top_diff, bottom_diff);
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int count = top[0]->count();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
+    const vector<Blob<Dtype>*>& top) {
+  const int count = top[0]->count();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
 }
 
 template <typename Dtype>
 void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const int count = top[0]->count();
-	const Dtype* top_diff = top[0]->gpu_diff();
-	if (propagate_down[0]) {
-		const Dtype* bottom_data = bottom[0]->gpu_data();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		caffe_gpu_sign(count, bottom_data, bottom_diff);
-		caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const int count = top[0]->count();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    caffe_gpu_sign(count, bottom_data, bottom_diff);
+    caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
index a26839d4..4cfc96f8 100644
--- a/src/caffe/layers/accuracy_layer.cpp
+++ b/src/caffe/layers/accuracy_layer.cpp
@@ -11,78 +11,76 @@
 namespace caffe {
 
 template <typename Dtype>
-void AccuracyLayer<Dtype>::LayerSetUp(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	top_k_ = this->layer_param_.accuracy_param().top_k();
+void AccuracyLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  top_k_ = this->layer_param_.accuracy_param().top_k();
 
-	has_ignore_label_ =
-			this->layer_param_.accuracy_param().has_ignore_label();
-	if (has_ignore_label_) {
-		ignore_label_ = this->layer_param_.accuracy_param().ignore_label();
-	}
+  has_ignore_label_ = this->layer_param_.accuracy_param().has_ignore_label();
+  if (has_ignore_label_) {
+    ignore_label_ = this->layer_param_.accuracy_param().ignore_label();
+  }
 }
 
 template <typename Dtype>
-void AccuracyLayer<Dtype>::Reshape(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count())
-			<< "top_k must be less than or equal to the number of classes.";
-	label_axis_ =
-			bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis());
-	outer_num_ = bottom[0]->count(0, label_axis_);
-	inner_num_ = bottom[0]->count(label_axis_ + 1);
-	CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
-			<< "Number of labels must match number of predictions; "
-			<< "e.g., if label axis == 1 and prediction shape is (N, C, H, W), "
-			<< "label count (number of labels) must be N*H*W, "
-			<< "with integer values in {0, 1, ..., C-1}.";
-	vector<int> top_shape(0);  // Accuracy is a scalar; 0 axes.
-	top[0]->Reshape(top_shape);
+void AccuracyLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count())
+      << "top_k must be less than or equal to the number of classes.";
+  label_axis_ = bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.accuracy_param().axis());
+  outer_num_ = bottom[0]->count(0, label_axis_);
+  inner_num_ = bottom[0]->count(label_axis_ + 1);
+  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
+      << "Number of labels must match number of predictions; "
+      << "e.g., if label axis == 1 and prediction shape is (N, C, H, W), "
+      << "label count (number of labels) must be N*H*W, "
+      << "with integer values in {0, 1, ..., C-1}.";
+  vector<int> top_shape(0);  // Accuracy is a scalar; 0 axes.
+  top[0]->Reshape(top_shape);
 }
 
 template <typename Dtype>
 void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	Dtype accuracy = 0;
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	const Dtype* bottom_label = bottom[1]->cpu_data();
-	const int dim = bottom[0]->count() / outer_num_;
-	const int num_labels = bottom[0]->shape(label_axis_);
-	vector < Dtype > maxval(top_k_ + 1);
-	vector<int> max_id(top_k_ + 1);
-	int count = 0;
-	for (int i = 0; i < outer_num_; ++i) {
-		for (int j = 0; j < inner_num_; ++j) {
-			const int label_value =
-					static_cast<int>(bottom_label[i * inner_num_ + j]);
-			if (has_ignore_label_ && label_value == ignore_label_) {
-				continue;
-			}
-			DCHECK_GE(label_value, 0);
-			DCHECK_LT(label_value, num_labels);
-			// Top-k accuracy
-			std::vector < std::pair<Dtype, int> > bottom_data_vector;
-			for (int k = 0; k < num_labels; ++k) {
-				bottom_data_vector.push_back(std::make_pair(
-						bottom_data[i * dim + k * inner_num_ + j], k));
-			}
-			std::partial_sort(
-					bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
-					bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
-			// check if true label is in top k predictions
-			for (int k = 0; k < top_k_; k++) {
-				if (bottom_data_vector[k].second == label_value) {
-					++accuracy;
-					break;
-				}
-			}
-			++count;
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  Dtype accuracy = 0;
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* bottom_label = bottom[1]->cpu_data();
+  const int dim = bottom[0]->count() / outer_num_;
+  const int num_labels = bottom[0]->shape(label_axis_);
+  vector < Dtype > maxval(top_k_ + 1);
+  vector<int> max_id(top_k_ + 1);
+  int count = 0;
+  for (int i = 0; i < outer_num_; ++i) {
+    for (int j = 0; j < inner_num_; ++j) {
+      const int label_value = static_cast<int>(bottom_label[i * inner_num_ + j]);
+      if (has_ignore_label_ && label_value == ignore_label_) {
+        continue;
+      }
+      DCHECK_GE(label_value, 0);
+      DCHECK_LT(label_value, num_labels);
+      // Top-k accuracy
+      std::vector < std::pair<Dtype, int> > bottom_data_vector;
+      for (int k = 0; k < num_labels; ++k) {
+        bottom_data_vector.push_back(
+            std::make_pair(bottom_data[i * dim + k * inner_num_ + j], k));
+      }
+      std::partial_sort(bottom_data_vector.begin(),
+          bottom_data_vector.begin() + top_k_, bottom_data_vector.end(),
+          std::greater<std::pair<Dtype, int> >());
+      // check if true label is in top k predictions
+      for (int k = 0; k < top_k_; k++) {
+        if (bottom_data_vector[k].second == label_value) {
+          ++accuracy;
+          break;
+        }
+      }
+      ++count;
+    }
+  }
 
-	// LOG(INFO) << "Accuracy: " << accuracy;
-	top[0]->mutable_cpu_data()[0] = accuracy / count;
-	// Accuracy layer should not be used as a loss function.
+  // LOG(INFO) << "Accuracy: " << accuracy;
+  top[0]->mutable_cpu_data()[0] = accuracy / count;
+  // Accuracy layer should not be used as a loss function.
 }
 
 INSTANTIATE_CLASS (AccuracyLayer);
diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp
index 235e8371..7b37283d 100644
--- a/src/caffe/layers/argmax_layer.cpp
+++ b/src/caffe/layers/argmax_layer.cpp
@@ -10,51 +10,50 @@ namespace caffe {
 
 template <typename Dtype>
 void ArgMaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	out_max_val_ = this->layer_param_.argmax_param().out_max_val();
-	top_k_ = this->layer_param_.argmax_param().top_k();
-	CHECK_GE(top_k_, 1) << " top k must not be less than 1.";
-	CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num())
-			<< "top_k must be less than or equal to the number of classes.";
+    const vector<Blob<Dtype>*>& top) {
+  out_max_val_ = this->layer_param_.argmax_param().out_max_val();
+  top_k_ = this->layer_param_.argmax_param().top_k();
+  CHECK_GE(top_k_, 1) << " top k must not be less than 1.";
+  CHECK_LE(top_k_, bottom[0]->count() / bottom[0]->num())
+      << "top_k must be less than or equal to the number of classes.";
 }
 
 template <typename Dtype>
 void ArgMaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	if (out_max_val_) {
-		// Produces max_ind and max_val
-		top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1);
-	} else {
-		// Produces only max_ind
-		top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  if (out_max_val_) {
+    // Produces max_ind and max_val
+    top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1);
+  } else {
+    // Produces only max_ind
+    top[0]->Reshape(bottom[0]->num(), 1, top_k_, 1);
+  }
 }
 
 template <typename Dtype>
 void ArgMaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	int num = bottom[0]->num();
-	int dim = bottom[0]->count() / bottom[0]->num();
-	for (int i = 0; i < num; ++i) {
-		std::vector < std::pair<Dtype, int> > bottom_data_vector;
-		for (int j = 0; j < dim; ++j) {
-			bottom_data_vector.push_back(
-					std::make_pair(bottom_data[i * dim + j], j));
-		}
-		std::partial_sort(
-				bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
-				bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
-		for (int j = 0; j < top_k_; ++j) {
-			top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second;
-		}
-		if (out_max_val_) {
-			for (int j = 0; j < top_k_; ++j) {
-				top_data[top[0]->offset(i, 1, j)] = bottom_data_vector[j].first;
-			}
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  int num = bottom[0]->num();
+  int dim = bottom[0]->count() / bottom[0]->num();
+  for (int i = 0; i < num; ++i) {
+    std::vector < std::pair<Dtype, int> > bottom_data_vector;
+    for (int j = 0; j < dim; ++j) {
+      bottom_data_vector.push_back(std::make_pair(bottom_data[i * dim + j], j));
+    }
+    std::partial_sort(bottom_data_vector.begin(),
+        bottom_data_vector.begin() + top_k_, bottom_data_vector.end(),
+        std::greater<std::pair<Dtype, int> >());
+    for (int j = 0; j < top_k_; ++j) {
+      top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second;
+    }
+    if (out_max_val_) {
+      for (int j = 0; j < top_k_; ++j) {
+        top_data[top[0]->offset(i, 1, j)] = bottom_data_vector[j].first;
+      }
+    }
+  }
 }
 
 INSTANTIATE_CLASS (ArgMaxLayer);
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 149b1a21..ee0df02f 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -18,33 +18,31 @@ template <typename Dtype> cl_mem BaseConvolutionLayer<Dtype>::transMem = clCreat
 
 template <typename Dtype>
 void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) {
-	if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) {
-		ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size;
-		clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem);
-		ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context,
-				CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size,
-				NULL,
-				NULL);
-	}
-	if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) {
-		ConvolutionLayer < Dtype > ::trans_mem_size = trans_size;
-		clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem);
-		ConvolutionLayer < Dtype > ::transMem = clCreateBuffer(amdDevice.Context,
-				CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size,
-				NULL,
-				NULL);
-	}
+  if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) {
+    ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size;
+    clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem);
+    ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context,
+        CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size,
+        NULL, NULL);
+  }
+  if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) {
+    ConvolutionLayer < Dtype > ::trans_mem_size = trans_size;
+    clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem);
+    ConvolutionLayer < Dtype > ::transMem = clCreateBuffer(amdDevice.Context,
+        CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size,
+        NULL, NULL);
+  }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::ocl_setup() {
-	M_ = num_output_ / group_;
-	K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_;
-	N_ = height_out_ * width_out_;
+  M_ = num_output_ / group_;
+  K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_;
+  N_ = height_out_ * width_out_;
 #ifdef use_packing_scheme
-	size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
-	size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
-	Alloc_public_tmp_mem<Dtype>(subtop_size, trans_size);
+  size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype));
+  size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype));
+  Alloc_public_tmp_mem<Dtype>(subtop_size, trans_size);
 #endif
 }
 
@@ -54,428 +52,417 @@ BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer() {
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-			<< "corresponding to (num, channels, height, width)";
-	// Configure the kernel size, padding, stride, and inputs.
-	ConvolutionParameter conv_param = this->layer_param_.convolution_param();
-	CHECK(!conv_param.has_kernel_size() !=
-			!(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-			<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-	CHECK(conv_param.has_kernel_size() ||
-			(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-			<< "For non-square filters both kernel_h and kernel_w are required.";
-	CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
-			&& conv_param.has_pad_w())
-			|| (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
-			<< "pad is pad OR pad_h and pad_w are required.";
-	CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
-			&& conv_param.has_stride_w())
-			|| (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
-			<< "Stride is stride OR stride_h and stride_w are required.";
-	if (conv_param.has_kernel_size()) {
-		kernel_h_ = kernel_w_ = conv_param.kernel_size();
-	} else {
-		kernel_h_ = conv_param.kernel_h();
-		kernel_w_ = conv_param.kernel_w();
-	}
-	CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-	CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-	if (!conv_param.has_pad_h()) {
-		pad_h_ = pad_w_ = conv_param.pad();
-	} else {
-		pad_h_ = conv_param.pad_h();
-		pad_w_ = conv_param.pad_w();
-	}
-	if (!conv_param.has_stride_h()) {
-		stride_h_ = stride_w_ = conv_param.stride();
-	} else {
-		stride_h_ = conv_param.stride_h();
-		stride_w_ = conv_param.stride_w();
-	}
-	// Special case: im2col is the identity for 1x1 convolution with stride 1
-	// and no padding, so flag for skipping the buffer and transformation.
-	is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1
-			&& stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0;
-	// Configure output channels and groups.
-	channels_ = bottom[0]->channels();
-	num_output_ = this->layer_param_.convolution_param().num_output();
-	CHECK_GT(num_output_, 0);
-	group_ = this->layer_param_.convolution_param().group();
-	CHECK_EQ(channels_ % group_, 0);
-	CHECK_EQ(num_output_ % group_, 0)
-			<< "Number of output should be multiples of group.";
-	if (reverse_dimensions()) {
-		conv_out_channels_ = channels_;
-		conv_in_channels_ = num_output_;
-	} else {
-		conv_out_channels_ = num_output_;
-		conv_in_channels_ = channels_;
-	}
-
-	// Handle the parameters: weights and biases.
-	// - blobs_[0] holds the filter weights
-	// - blobs_[1] holds the biases (optional)
-	bias_term_ = this->layer_param_.convolution_param().bias_term();
-	if (this->blobs_.size() > 0) {
-		LOG(INFO) << "Skipping parameter initialization";
-	} else {
-		if (bias_term_) {
-			this->blobs_.resize(2);
-		} else {
-			this->blobs_.resize(1);
-		}
-		// Initialize and fill the weights:
-		// output channels x input channels per-group x kernel height x kernel width
-		this->blobs_[0].reset(new Blob<Dtype>(
-				conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_));
-		shared_ptr < Filler<Dtype> > weight_filler(GetFiller < Dtype > (
-				this->layer_param_.convolution_param().weight_filler()));
-		weight_filler->Fill(this->blobs_[0].get());
-		// If necessary, initialize and fill the biases.
-		if (bias_term_) {
-			vector<int> bias_shape(1, num_output_);
-			this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
-			shared_ptr < Filler<Dtype> > bias_filler(GetFiller < Dtype > (
-					this->layer_param_.convolution_param().bias_filler()));
-			bias_filler->Fill(this->blobs_[1].get());
-		}
-	}
-	// Propagate gradients to the parameters (as directed by backward pass).
-	this->param_propagate_down_.resize(this->blobs_.size(), true);
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+      << "corresponding to (num, channels, height, width)";
+  // Configure the kernel size, padding, stride, and inputs.
+  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+  CHECK(
+      !conv_param.has_kernel_size()
+          != !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+  CHECK(
+      conv_param.has_kernel_size()
+          || (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "For non-square filters both kernel_h and kernel_w are required.";
+  CHECK(
+      (!conv_param.has_pad() && conv_param.has_pad_h() && conv_param.has_pad_w())
+          || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK(
+      (!conv_param.has_stride() && conv_param.has_stride_h()
+          && conv_param.has_stride_w())
+          || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  if (conv_param.has_kernel_size()) {
+    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  } else {
+    kernel_h_ = conv_param.kernel_h();
+    kernel_w_ = conv_param.kernel_w();
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!conv_param.has_pad_h()) {
+    pad_h_ = pad_w_ = conv_param.pad();
+  } else {
+    pad_h_ = conv_param.pad_h();
+    pad_w_ = conv_param.pad_w();
+  }
+  if (!conv_param.has_stride_h()) {
+    stride_h_ = stride_w_ = conv_param.stride();
+  } else {
+    stride_h_ = conv_param.stride_h();
+    stride_w_ = conv_param.stride_w();
+  }
+  // Special case: im2col is the identity for 1x1 convolution with stride 1
+  // and no padding, so flag for skipping the buffer and transformation.
+  is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 && stride_h_ == 1 && stride_w_ == 1
+      && pad_h_ == 0 && pad_w_ == 0;
+  // Configure output channels and groups.
+  channels_ = bottom[0]->channels();
+  num_output_ = this->layer_param_.convolution_param().num_output();
+  CHECK_GT(num_output_, 0);
+  group_ = this->layer_param_.convolution_param().group();
+  CHECK_EQ(channels_ % group_, 0);
+  CHECK_EQ(num_output_ % group_, 0)
+      << "Number of output should be multiples of group.";
+  if (reverse_dimensions()) {
+    conv_out_channels_ = channels_;
+    conv_in_channels_ = num_output_;
+  } else {
+    conv_out_channels_ = num_output_;
+    conv_in_channels_ = channels_;
+  }
+
+  // Handle the parameters: weights and biases.
+  // - blobs_[0] holds the filter weights
+  // - blobs_[1] holds the biases (optional)
+  bias_term_ = this->layer_param_.convolution_param().bias_term();
+  if (this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else {
+    if (bias_term_) {
+      this->blobs_.resize(2);
+    } else {
+      this->blobs_.resize(1);
+    }
+    // Initialize and fill the weights:
+    // output channels x input channels per-group x kernel height x kernel width
+    this->blobs_[0].reset(
+        new Blob<Dtype>(conv_out_channels_, conv_in_channels_ / group_,
+            kernel_h_, kernel_w_));
+    shared_ptr < Filler<Dtype>
+        > weight_filler(
+            GetFiller < Dtype
+                > (this->layer_param_.convolution_param().weight_filler()));
+    weight_filler->Fill(this->blobs_[0].get());
+    // If necessary, initialize and fill the biases.
+    if (bias_term_) {
+      vector<int> bias_shape(1, num_output_);
+      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
+      shared_ptr < Filler<Dtype>
+          > bias_filler(
+              GetFiller < Dtype
+                  > (this->layer_param_.convolution_param().bias_filler()));
+      bias_filler->Fill(this->blobs_[1].get());
+    }
+  }
+  // Propagate gradients to the parameters (as directed by backward pass).
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-			<< "corresponding to (num, channels, height, width)";
-	num_ = bottom[0]->num();
-	height_ = bottom[0]->height();
-	width_ = bottom[0]->width();
-	CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
-			" convolution kernel.";
-	// TODO: generalize to handle inputs of different shapes.
-	for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
-		CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
-		CHECK_EQ(channels_, bottom[bottom_id]->channels())
-				<< "Inputs must have same channels.";
-		CHECK_EQ(height_, bottom[bottom_id]->height())
-				<< "Inputs must have same height.";
-		CHECK_EQ(width_, bottom[bottom_id]->width())
-				<< "Inputs must have same width.";
-	}
-	// Shape the tops.
-	compute_output_shape();
-	for (int top_id = 0; top_id < top.size(); ++top_id) {
-		top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
-	}
-	if (reverse_dimensions()) {
-		conv_in_height_ = height_out_;
-		conv_in_width_ = width_out_;
-		conv_out_spatial_dim_ = height_ * width_;
-	} else {
-		conv_in_height_ = height_;
-		conv_in_width_ = width_;
-		conv_out_spatial_dim_ = height_out_ * width_out_;
-	}
-	kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_;
-	weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_;
-	col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_;
-	output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
-	// The im2col result buffer will only hold one image at a time to avoid
-	// overly large memory usage. In the special case of 1x1 convolution
-	// it goes lazily unused to save memory.
-	if (reverse_dimensions()) {
-		col_buffer_.Reshape(1, kernel_dim_, height_, width_);
-	} else {
-		col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_);
-	}
-	// Set up the all ones "bias multiplier" for adding biases by BLAS
-	if (bias_term_) {
-		vector<int> bias_multiplier_shape(1, height_out_ * width_out_);
-		bias_multiplier_.Reshape(bias_multiplier_shape);
-		caffe_set(bias_multiplier_.count(), Dtype(1),
-				bias_multiplier_.mutable_cpu_data());
-	}
-	//initializa OpenCL kernels and cl_mem objects
-	ocl_setup();
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+      << "corresponding to (num, channels, height, width)";
+  num_ = bottom[0]->num();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
+      " convolution kernel.";
+  // TODO: generalize to handle inputs of different shapes.
+  for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
+    CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
+    CHECK_EQ(channels_, bottom[bottom_id]->channels())
+        << "Inputs must have same channels.";
+    CHECK_EQ(height_, bottom[bottom_id]->height())
+        << "Inputs must have same height.";
+    CHECK_EQ(width_, bottom[bottom_id]->width())
+        << "Inputs must have same width.";
+  }
+  // Shape the tops.
+  compute_output_shape();
+  for (int top_id = 0; top_id < top.size(); ++top_id) {
+    top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
+  }
+  if (reverse_dimensions()) {
+    conv_in_height_ = height_out_;
+    conv_in_width_ = width_out_;
+    conv_out_spatial_dim_ = height_ * width_;
+  } else {
+    conv_in_height_ = height_;
+    conv_in_width_ = width_;
+    conv_out_spatial_dim_ = height_out_ * width_out_;
+  }
+  kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_;
+  weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_;
+  col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_;
+  output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
+  // The im2col result buffer will only hold one image at a time to avoid
+  // overly large memory usage. In the special case of 1x1 convolution
+  // it goes lazily unused to save memory.
+  if (reverse_dimensions()) {
+    col_buffer_.Reshape(1, kernel_dim_, height_, width_);
+  } else {
+    col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_);
+  }
+  // Set up the all ones "bias multiplier" for adding biases by BLAS
+  if (bias_term_) {
+    vector<int> bias_multiplier_shape(1, height_out_ * width_out_);
+    bias_multiplier_.Reshape(bias_multiplier_shape);
+    caffe_set(bias_multiplier_.count(), Dtype(1),
+        bias_multiplier_.mutable_cpu_data());
+  }
+  //initializa OpenCL kernels and cl_mem objects
+  ocl_setup();
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
-		const Dtype* weights, Dtype* output, bool skip_im2col) {
-	const Dtype* col_buff = input;
-	if (!is_1x1_) {
-		if (!skip_im2col) {
-			conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
-		}
-		col_buff = col_buffer_.cpu_data();
-	}
-	for (int g = 0; g < group_; ++g) {
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, conv_out_channels_ /
-				group_, conv_out_spatial_dim_, kernel_dim_ / group_,
-				(Dtype) 1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
-				(Dtype) 0., output + output_offset_ * g);
-	}
+    const Dtype* weights, Dtype* output, bool skip_im2col) {
+  const Dtype* col_buff = input;
+  if (!is_1x1_) {
+    if (!skip_im2col) {
+      conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
+    }
+    col_buff = col_buffer_.cpu_data();
+  }
+  for (int g = 0; g < group_; ++g) {
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_
+            / group_, (Dtype) 1., weights + weight_offset_ * g, col_buff
+            + col_offset_ * g, (Dtype) 0., output + output_offset_ * g);
+  }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
-		const Dtype* bias) {
-	caffe_cpu_gemm < Dtype
-			> (CblasNoTrans, CblasNoTrans, num_output_,
-					height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(),
-					(Dtype) 1., output);
+    const Dtype* bias) {
+  caffe_cpu_gemm < Dtype
+      > (CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(), (Dtype) 1., output);
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_cpu_gemm(const Dtype* output,
-		const Dtype* weights, Dtype* input) {
-	Dtype* col_buff = col_buffer_.mutable_cpu_data();
-	if (is_1x1_) {
-		col_buff = input;
-	}
-	for (int g = 0; g < group_; ++g) {
-		caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, kernel_dim_ / group_,
-				conv_out_spatial_dim_, conv_out_channels_ / group_,
-				(Dtype) 1., weights + weight_offset_ * g, output + output_offset_ * g,
-				(Dtype) 0., col_buff + col_offset_ * g);
-	}
-	if (!is_1x1_) {
-		conv_col2im_cpu(col_buff, input);
-	}
+    const Dtype* weights, Dtype* input) {
+  Dtype* col_buff = col_buffer_.mutable_cpu_data();
+  if (is_1x1_) {
+    col_buff = input;
+  }
+  for (int g = 0; g < group_; ++g) {
+    caffe_cpu_gemm < Dtype
+        > (CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_
+            / group_, (Dtype) 1., weights + weight_offset_ * g, output
+            + output_offset_ * g, (Dtype) 0., col_buff + col_offset_ * g);
+  }
+  if (!is_1x1_) {
+    conv_col2im_cpu(col_buff, input);
+  }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
-		const Dtype* output, Dtype* weights) {
-	const Dtype* col_buff = input;
-	if (!is_1x1_) {
-		conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
-		col_buff = col_buffer_.cpu_data();
-	}
-	for (int g = 0; g < group_; ++g) {
-		caffe_cpu_gemm < Dtype
-				> (CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
-						kernel_dim_ / group_, conv_out_spatial_dim_,
-						(Dtype) 1., output + output_offset_ * g, col_buff + col_offset_ * g,
-						(Dtype) 1., weights + weight_offset_ * g);
-	}
+    const Dtype* output, Dtype* weights) {
+  const Dtype* col_buff = input;
+  if (!is_1x1_) {
+    conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
+    col_buff = col_buffer_.cpu_data();
+  }
+  for (int g = 0; g < group_; ++g) {
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_
+            / group_, conv_out_spatial_dim_, (Dtype) 1., output
+            + output_offset_ * g, col_buff + col_offset_ * g, (Dtype) 1., weights
+            + weight_offset_ * g);
+  }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_cpu_bias(Dtype* bias,
-		const Dtype* input) {
-	caffe_cpu_gemv < Dtype
-			> (CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
-					input, bias_multiplier_.cpu_data(), 1., bias);
+    const Dtype* input) {
+  caffe_cpu_gemv < Dtype
+      > (CblasNoTrans, num_output_, height_out_ * width_out_, 1., input, bias_multiplier_.cpu_data(), 1., bias);
 }
 
 #ifndef CPU_ONLY
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
-		const Dtype* weights, Dtype* output, bool skip_im2col) {
-	const Dtype* col_buff = input;
-	if (!is_1x1_) {
-		if (!skip_im2col) {
-			conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
-		}
-		col_buff = col_buffer_.gpu_data();
-	}
-
-	for (int g = 0; g < group_; ++g) {
-		caffe_gpu_gemm < Dtype
-				> (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans,
-						conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_
-								/ group_,
-						(Dtype) 1., weights, weight_offset_ * g, col_buff, col_offset_ * g,
-						(Dtype) 0., output, top_offset_ + output_offset_ * g);
-	}
+    const Dtype* weights, Dtype* output, bool skip_im2col) {
+  const Dtype* col_buff = input;
+  if (!is_1x1_) {
+    if (!skip_im2col) {
+      conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
+    }
+    col_buff = col_buffer_.gpu_data();
+  }
+
+  for (int g = 0; g < group_; ++g) {
+    caffe_gpu_gemm < Dtype
+        > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_
+            / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights, weight_offset_
+            * g, col_buff, col_offset_ * g, (Dtype) 0., output, top_offset_
+            + output_offset_ * g);
+  }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
-		const Dtype* bias) {
-	caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
-			height_out_ * width_out_, 1, (Dtype) 1., bias, 0,
-			reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-			(Dtype) 1., output, top_offset_);
+    const Dtype* bias) {
+  caffe_gpu_gemm < Dtype
+      > (CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype) 1., bias, 0, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0, (Dtype) 1., output, top_offset_);
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
-		const Dtype* weights, Dtype* input) {
-	Dtype* col_buff = col_buffer_.mutable_gpu_data();
-	if (is_1x1_) {
-		col_buff = input;
-	}
-	for (int g = 0; g < group_; ++g) {
-		caffe_gpu_gemm < Dtype
-				> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
-						/ group_, conv_out_spatial_dim_, conv_out_channels_ / group_,
-						(Dtype) 1., weights, weight_offset_ * g,
-						output, top_offset_ + output_offset_ * g,
-						(Dtype) 0., col_buff, col_offset_ * g);
-	}
-	if (!is_1x1_) {
-		conv_col2im_gpu(col_buff, input);
-	}
+    const Dtype* weights, Dtype* input) {
+  Dtype* col_buff = col_buffer_.mutable_gpu_data();
+  if (is_1x1_) {
+    col_buff = input;
+  }
+  for (int g = 0; g < group_; ++g) {
+    caffe_gpu_gemm < Dtype
+        > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
+            / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype) 1., weights, weight_offset_
+            * g, output, top_offset_ + output_offset_ * g, (Dtype) 0., col_buff, col_offset_
+            * g);
+  }
+  if (!is_1x1_) {
+    conv_col2im_gpu(col_buff, input);
+  }
 }
 
-
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
-		const Dtype* output, Dtype* weights) {
-	const Dtype* col_buff = input;
-	if (!is_1x1_) {
-		conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
-		col_buff = col_buffer_.gpu_data();
-	}
-	for (int g = 0; g < group_; ++g) {
-		caffe_gpu_gemm < Dtype
-				> (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_
-						/ group_, kernel_dim_ / group_, conv_out_spatial_dim_,
-						(Dtype) 1., output, top_offset_,
-						(Dtype*) col_buff, col_offset_ * g, (Dtype) 1.,
-						(Dtype*) weights, weight_offset_ * g);
-	}
+    const Dtype* output, Dtype* weights) {
+  const Dtype* col_buff = input;
+  if (!is_1x1_) {
+    conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
+    col_buff = col_buffer_.gpu_data();
+  }
+  for (int g = 0; g < group_; ++g) {
+    caffe_gpu_gemm < Dtype
+        > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_
+            / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype) 1., output, top_offset_, (Dtype*) col_buff, col_offset_
+            * g, (Dtype) 1., (Dtype*) weights, weight_offset_ * g);
+  }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
-		const Dtype* input) {
-	caffe_gpu_gemv < Dtype
-			> (CblasNoTrans, num_output_, N_,
-					(Dtype) 1., input, top_offset_, N_,
-					reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1,
-					bias, (size_t) 0, 1);
+    const Dtype* input) {
+  caffe_gpu_gemv < Dtype
+      > (CblasNoTrans, num_output_, N_, (Dtype) 1., input, top_offset_, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, bias, (size_t) 0, 1);
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
-		const Dtype* weight, Dtype* output, bool skip_im2col) {
-	cl_command_queue Queue;
-	const Dtype* col_buff = input;
-	if (!is_1x1_) {
-		if (!skip_im2col) {
-			conv_im2col_gpu_opt(input);
-		}
-		col_buff = col_buffer_.gpu_data();
-	} else {
-		caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff,
-				(Dtype*) transMem);
-	}
+    const Dtype* weight, Dtype* output, bool skip_im2col) {
+  cl_command_queue Queue;
+  const Dtype* col_buff = input;
+  if (!is_1x1_) {
+    if (!skip_im2col) {
+      conv_im2col_gpu_opt(input);
+    }
+    col_buff = col_buffer_.gpu_data();
+  } else {
+    caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff,
+        (Dtype*) transMem);
+  }
 #ifdef multiQ
-	for (int g = 0; g < group_; ++g) {
-		if(g == 0) Queue = amdDevice.CommandQueue;
-		else Queue = amdDevice.CommandQueue_helper;
-		caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-				(Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
-				(Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
-	}
-	if(group_ == 2) {
-		clFinish(amdDevice.CommandQueue);
-		clFinish(amdDevice.CommandQueue_helper);
-	}
+  for (int g = 0; g < group_; ++g) {
+    if(g == 0) Queue = amdDevice.CommandQueue;
+    else Queue = amdDevice.CommandQueue_helper;
+    caffe_gpu_gemm<Dtype>(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
+        (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g,
+        (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g);
+  }
+  if(group_ == 2) {
+    clFinish(amdDevice.CommandQueue);
+    clFinish(amdDevice.CommandQueue_helper);
+  }
 #else
-	Queue = amdDevice.CommandQueue;
-	for (int g = 0; g < group_; ++g) {
-		caffe_gpu_gemm < Dtype
-				> (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_,
-						(Dtype) 1., weight, weight_offset_ * g, (Dtype*) transMem, col_offset_
-								* g,
-						(Dtype) 0., (Dtype*) subTopMem, top_offset_opt * g);
-	}
+  Queue = amdDevice.CommandQueue;
+  for (int g = 0; g < group_; ++g) {
+    caffe_gpu_gemm < Dtype
+        > (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype) 1., weight, weight_offset_
+            * g, (Dtype*) transMem, col_offset_ * g, (Dtype) 0., (Dtype*) subTopMem, top_offset_opt
+            * g);
+  }
 #endif
-	transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_,
-			opt_num2);
+  transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_,
+      opt_num2);
 }
 
-
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias_opt(Dtype* output,
-		const Dtype* bias) {
-	for (int z = 0; z < opt_num2; z++)
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num_output_,
-				N_, 1, (Dtype) 1., bias, 0,
-				reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0,
-				(Dtype) 1., output, top_offset_ + num_output_ * N_ * z);
+    const Dtype* bias) {
+  for (int z = 0; z < opt_num2; z++)
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num_output_, N_, 1, (Dtype) 1., bias, 0, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), 0, (Dtype) 1., output, top_offset_
+            + num_output_ * N_ * z);
 }
 
-
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
-		const Dtype* weights, Dtype* input) {
-	cl_command_queue Queue;
-	if (is_1x1_) {
-		caffe_gpu_memcpy(
-				height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input,
-				(Dtype*) transMem);
-	}
-	for (int g = 0; g < group_; ++g) {
+    const Dtype* weights, Dtype* input) {
+  cl_command_queue Queue;
+  if (is_1x1_) {
+    caffe_gpu_memcpy(
+        height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input,
+        (Dtype*) transMem);
+  }
+  for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
-		if(g == 0) Queue = amdDevice.CommandQueue;
-		else Queue = amdDevice.CommandQueue_helper;
+    if(g == 0) Queue = amdDevice.CommandQueue;
+    else Queue = amdDevice.CommandQueue_helper;
 #else
-		Queue = amdDevice.CommandQueue;
+    Queue = amdDevice.CommandQueue;
 #endif
-		caffe_gpu_gemm < Dtype
-				> (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_,
-						(Dtype) 1., weights, weight_offset_ * g,
-						(Dtype*) subTopMem, top_offset_opt * g,
-						(Dtype) 0., (Dtype*) transMem, col_offset_ * g);
-	}
+    caffe_gpu_gemm < Dtype
+        > (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, (Dtype) 1., weights, weight_offset_
+            * g, (Dtype*) subTopMem, top_offset_opt * g, (Dtype) 0., (Dtype*) transMem, col_offset_
+            * g);
+  }
 #ifdef multiQ
-	if(group_ ==2) {
-		clFinish(amdDevice.CommandQueue);
-		clFinish(amdDevice.CommandQueue_helper);
-	}
+  if(group_ ==2) {
+    clFinish(amdDevice.CommandQueue);
+    clFinish(amdDevice.CommandQueue_helper);
+  }
 #endif
 
-	if (!is_1x1_) {
-		conv_col2im_gpu_opt(input);
-	} else {
-		caffe_gpu_memcpy(
-				height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype),
-				(Dtype*) transMem, input);
-	}
+  if (!is_1x1_) {
+    conv_col2im_gpu_opt(input);
+  } else {
+    caffe_gpu_memcpy(
+        height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype),
+        (Dtype*) transMem, input);
+  }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
-		const Dtype* output, Dtype* weights) {
-	cl_command_queue Queue;
-	if (!is_1x1_) {
-		conv_im2col_gpu_opt(input);
-	} else {
-		caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input,
-				(Dtype*) transMem);
-	}
-	opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
-			opt_num2);
-
-	for (int g = 0; g < group_; ++g) {
+    const Dtype* output, Dtype* weights) {
+  cl_command_queue Queue;
+  if (!is_1x1_) {
+    conv_im2col_gpu_opt(input);
+  } else {
+    caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input,
+        (Dtype*) transMem);
+  }
+  opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
+      opt_num2);
+
+  for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
-		if(g == 0) Queue = amdDevice.CommandQueue;
-		else Queue = amdDevice.CommandQueue_helper;
+    if(g == 0) Queue = amdDevice.CommandQueue;
+    else Queue = amdDevice.CommandQueue_helper;
 #else
-		Queue = amdDevice.CommandQueue;
+    Queue = amdDevice.CommandQueue;
 #endif
-		caffe_gpu_gemm < Dtype
-				> (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2,
-						(Dtype) 1., (Dtype*) subTopMem, top_offset_opt * g,
-						(Dtype*) transMem, col_offset_ * g, (Dtype) 1.,
-						(Dtype*) weights, weight_offset_ * g);
+    caffe_gpu_gemm < Dtype
+        > (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, (Dtype) 1., (Dtype*) subTopMem, top_offset_opt
+            * g, (Dtype*) transMem, col_offset_ * g, (Dtype) 1., (Dtype*) weights, weight_offset_
+            * g);
 #ifdef multiQ
-		if(group_ == 2) {
-			clFinish(amdDevice.CommandQueue);
-			clFinish(amdDevice.CommandQueue_helper);
-		}
+    if(group_ == 2) {
+      clFinish(amdDevice.CommandQueue);
+      clFinish(amdDevice.CommandQueue_helper);
+    }
 #endif
-	}
+  }
 }
 
 // end: code is written/modified by AMD
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index b0c0ebf2..d02e92c4 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -9,111 +9,107 @@ namespace caffe {
 
 template <typename Dtype>
 BaseDataLayer<Dtype>::BaseDataLayer(const LayerParameter& param)
-:
-		Layer<Dtype>(param),
-				transform_param_(param.transform_param()) {
+    : Layer<Dtype>(param), transform_param_(param.transform_param()) {
 }
 
 template <typename Dtype>
 void BaseDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	if (top.size() == 1) {
-		output_labels_ = false;
-	} else {
-		output_labels_ = true;
-	}
-	data_transformer_.reset(
-			new DataTransformer<Dtype>(transform_param_, this->phase_));
-	data_transformer_->InitRand();
-	// The subclasses should setup the size of bottom and top
-	DataLayerSetUp(bottom, top);
+    const vector<Blob<Dtype>*>& top) {
+  if (top.size() == 1) {
+    output_labels_ = false;
+  } else {
+    output_labels_ = true;
+  }
+  data_transformer_.reset(
+      new DataTransformer<Dtype>(transform_param_, this->phase_));
+  data_transformer_->InitRand();
+  // The subclasses should setup the size of bottom and top
+  DataLayerSetUp(bottom, top);
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::LayerSetUp(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	BaseDataLayer < Dtype > ::LayerSetUp(bottom, top);
-	// Now, start the prefetch thread. Before calling prefetch, we make two
-	// cpu_data calls so that the prefetch thread does not accidentally make
-	// simultaneous cudaMalloc calls when the main thread is running. In some
-	// GPUs this seems to cause failures if we do not so.
-	this->prefetch_data_.mutable_cpu_data();
-	if (this->output_labels_) {
-		this->prefetch_label_.mutable_cpu_data();
-	}
-	DLOG(INFO) << "Initializing prefetch";
-	this->CreatePrefetchThread();
-	DLOG(INFO) << "Prefetch initialized.";
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  BaseDataLayer < Dtype > ::LayerSetUp(bottom, top);
+  // Now, start the prefetch thread. Before calling prefetch, we make two
+  // cpu_data calls so that the prefetch thread does not accidentally make
+  // simultaneous cudaMalloc calls when the main thread is running. In some
+  // GPUs this seems to cause failures if we do not so.
+  this->prefetch_data_.mutable_cpu_data();
+  if (this->output_labels_) {
+    this->prefetch_label_.mutable_cpu_data();
+  }
+  DLOG(INFO) << "Initializing prefetch";
+  this->CreatePrefetchThread();
+  DLOG(INFO) << "Prefetch initialized.";
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::CreatePrefetchThread() {
-	this->data_transformer_->InitRand();
-	CHECK(StartInternalThread()) << "Thread execution failed";
+  this->data_transformer_->InitRand();
+  CHECK(StartInternalThread()) << "Thread execution failed";
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::JoinPrefetchThread() {
-	CHECK(WaitForInternalThreadToExit()) << "Thread joining failed";
+  CHECK(WaitForInternalThreadToExit()) << "Thread joining failed";
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	// First, join the thread
-	JoinPrefetchThread();
-
-	DLOG(INFO) << "Thread joined";
-	// Reshape to loaded data.
-	top[0]->ReshapeLike(prefetch_data_);
-	// Copy the data
-	caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
-			top[0]->mutable_cpu_data());
-	DLOG(INFO) << "Prefetch copied";
-	if (this->output_labels_) {
-		// Reshape to loaded labels.
-		top[1]->ReshapeLike(prefetch_label_);
-		// Copy the labels.
-		caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
-				top[1]->mutable_cpu_data());
-	}
-	// Start a new prefetch thread
-	DLOG(INFO) << "CreatePrefetchThread";
-	CreatePrefetchThread();
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  // First, join the thread
+  JoinPrefetchThread();
+
+  DLOG(INFO) << "Thread joined";
+  // Reshape to loaded data.
+  top[0]->ReshapeLike(prefetch_data_);
+  // Copy the data
+  caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
+      top[0]->mutable_cpu_data());
+  DLOG(INFO) << "Prefetch copied";
+  if (this->output_labels_) {
+    // Reshape to loaded labels.
+    top[1]->ReshapeLike(prefetch_label_);
+    // Copy the labels.
+    caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
+        top[1]->mutable_cpu_data());
+  }
+  // Start a new prefetch thread
+  DLOG(INFO) << "CreatePrefetchThread";
+  CreatePrefetchThread();
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
-		const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-
-	JoinPrefetchThread();
-	DLOG(INFO) << "Thread joined";
-
-	top[0]->ReshapeLike(this->prefetch_data_);
-	OCL_CHECK(
-			clEnqueueWriteBuffer(amdDevice.CommandQueue,
-					(cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0,
-					sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0,
-					NULL, NULL));
-	DLOG(INFO) << "Prefetch copied";
-	if (this->output_labels_) {
-		// Reshape to loaded labels.
-		top[1]->ReshapeLike(prefetch_label_);
-		OCL_CHECK(
-				clEnqueueWriteBuffer(amdDevice.CommandQueue,
-						(cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0,
-						sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(),
-						0,
-						NULL, NULL));
-	}
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+
+  JoinPrefetchThread();
+  DLOG(INFO) << "Thread joined";
+
+  top[0]->ReshapeLike(this->prefetch_data_);
+  OCL_CHECK(
+      clEnqueueWriteBuffer(amdDevice.CommandQueue,
+          (cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0,
+          sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0,
+          NULL, NULL));
+  DLOG(INFO) << "Prefetch copied";
+  if (this->output_labels_) {
+    // Reshape to loaded labels.
+    top[1]->ReshapeLike(prefetch_label_);
+    OCL_CHECK(
+        clEnqueueWriteBuffer(amdDevice.CommandQueue,
+            (cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0,
+            sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(),
+            0, NULL, NULL));
+  }
 
 #ifdef Track_data_transfer
 #endif
 
-	// Start a new prefetch thread
-	DLOG(INFO) << "CreatePrefetchThread";
-	CreatePrefetchThread();
+  // Start a new prefetch thread
+  DLOG(INFO) << "CreatePrefetchThread";
+  CreatePrefetchThread();
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index ad422a11..c2cce9e3 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -10,58 +10,56 @@ const float kBNLL_THRESHOLD = 50.;
 
 template <typename Dtype>
 void BNLLLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const int count = bottom[0]->count();
-	for (int i = 0; i < count; ++i) {
-		top_data[i] =
-				bottom_data[i] > 0 ?
-															bottom_data[i] + log(1. + exp(-bottom_data[i])) :
-															log(1. + exp(bottom_data[i]));
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  for (int i = 0; i < count; ++i) {
+    top_data[i] =
+        bottom_data[i] > 0 ?
+            bottom_data[i] + log(1. + exp(-bottom_data[i])) :
+            log(1. + exp(bottom_data[i]));
+  }
 }
 
 template <typename Dtype>
 void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* bottom_data = bottom[0]->cpu_data();
-		const Dtype* top_diff = top[0]->cpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		const int count = bottom[0]->count();
-		Dtype expval;
-		for (int i = 0; i < count; ++i) {
-			expval = exp(std::min(bottom_data[i], Dtype(kBNLL_THRESHOLD)));
-			bottom_diff[i] = top_diff[i] * expval / (expval + 1.);
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const int count = bottom[0]->count();
+    Dtype expval;
+    for (int i = 0; i < count; ++i) {
+      expval = exp(std::min(bottom_data[i], Dtype(kBNLL_THRESHOLD)));
+      bottom_diff[i] = top_diff[i] * expval / (expval + 1.);
+    }
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	BNLLForward(count, bottom_data, top_data);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  BNLLForward(count, bottom_data, top_data);
 }
 
 template <typename Dtype>
 void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* bottom_data = bottom[0]->gpu_data();
-		const Dtype* top_diff = top[0]->gpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		const int count = bottom[0]->count();
-		// NOLINT_NEXT_LINE(whitespace/operators)
-		BNLLBackward(count, top_diff, bottom_data, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    BNLLBackward(count, top_diff, bottom_data, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 28aac6b2..5a351009 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -8,133 +8,135 @@ namespace caffe {
 
 template <typename Dtype>
 void ConcatLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const ConcatParameter& concat_param = this->layer_param_.concat_param();
-	CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim()))
-			<< "Either axis or concat_dim should be specified; not both.";
+    const vector<Blob<Dtype>*>& top) {
+  const ConcatParameter& concat_param = this->layer_param_.concat_param();
+  CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim()))
+      << "Either axis or concat_dim should be specified; not both.";
 }
 
 template <typename Dtype>
 void ConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int num_axes = bottom[0]->num_axes();
-	const ConcatParameter& concat_param = this->layer_param_.concat_param();
-	if (concat_param.has_concat_dim()) {
-		concat_axis_ = static_cast<int>(concat_param.concat_dim());
-		// Don't allow negative indexing for concat_dim, a uint32 -- almost
-		// certainly unintended.
-		CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 "
-				<< "produced negative result; concat_dim must satisfy "
-				<< "0 <= concat_dim < " << kMaxBlobAxes;
-		CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range.";
-	} else {
-		concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis());
-	}
-	// Initialize with the first blob.
-	vector<int> top_shape = bottom[0]->shape();
-	num_concats_ = bottom[0]->count(0, concat_axis_);
-	concat_input_size_ = bottom[0]->count(concat_axis_ + 1);
-	int bottom_count_sum = bottom[0]->count();
-	for (int i = 1; i < bottom.size(); ++i) {
-		CHECK_EQ(num_axes, bottom[i]->num_axes())
-				<< "All inputs must have the same #axes.";
-		for (int j = 0; j < num_axes; ++j) {
-			if (j == concat_axis_) {
-				continue;
-			}
-			CHECK_EQ(top_shape[j], bottom[i]->shape(j))
-					<< "All inputs must have the same shape, except at concat_axis.";
-		}
-		bottom_count_sum += bottom[i]->count();
-		top_shape[concat_axis_] += bottom[i]->shape(concat_axis_);
-	}
-	top[0]->Reshape(top_shape);
-	CHECK_EQ(bottom_count_sum, top[0]->count());
+    const vector<Blob<Dtype>*>& top) {
+  const int num_axes = bottom[0]->num_axes();
+  const ConcatParameter& concat_param = this->layer_param_.concat_param();
+  if (concat_param.has_concat_dim()) {
+    concat_axis_ = static_cast<int>(concat_param.concat_dim());
+    // Don't allow negative indexing for concat_dim, a uint32 -- almost
+    // certainly unintended.
+    CHECK_GE(concat_axis_, 0) << "casting concat_dim from uint32 to int32 "
+        << "produced negative result; concat_dim must satisfy "
+        << "0 <= concat_dim < " << kMaxBlobAxes;
+    CHECK_LT(concat_axis_, num_axes) << "concat_dim out of range.";
+  } else {
+    concat_axis_ = bottom[0]->CanonicalAxisIndex(concat_param.axis());
+  }
+  // Initialize with the first blob.
+  vector<int> top_shape = bottom[0]->shape();
+  num_concats_ = bottom[0]->count(0, concat_axis_);
+  concat_input_size_ = bottom[0]->count(concat_axis_ + 1);
+  int bottom_count_sum = bottom[0]->count();
+  for (int i = 1; i < bottom.size(); ++i) {
+    CHECK_EQ(num_axes, bottom[i]->num_axes())
+        << "All inputs must have the same #axes.";
+    for (int j = 0; j < num_axes; ++j) {
+      if (j == concat_axis_) {
+        continue;
+      }
+      CHECK_EQ(top_shape[j], bottom[i]->shape(j))
+          << "All inputs must have the same shape, except at concat_axis.";
+    }
+    bottom_count_sum += bottom[i]->count();
+    top_shape[concat_axis_] += bottom[i]->shape(concat_axis_);
+  }
+  top[0]->Reshape(top_shape);
+  CHECK_EQ(bottom_count_sum, top[0]->count());
 }
 
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	int offset_concat_axis = 0;
-	const int top_concat_axis = top[0]->shape(concat_axis_);
-	for (int i = 0; i < bottom.size(); ++i) {
-		const Dtype* bottom_data = bottom[i]->cpu_data();
-		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-		for (int n = 0; n < num_concats_; ++n) {
-			caffe_copy(bottom_concat_axis * concat_input_size_,
-					bottom_data + n * bottom_concat_axis * concat_input_size_,
-					top_data + (n * top_concat_axis + offset_concat_axis)
-							* concat_input_size_);
-		}
-		offset_concat_axis += bottom_concat_axis;
-	}
+    const vector<Blob<Dtype>*>& top) {
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  int offset_concat_axis = 0;
+  const int top_concat_axis = top[0]->shape(concat_axis_);
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->cpu_data();
+    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+    for (int n = 0; n < num_concats_; ++n) {
+      caffe_copy(bottom_concat_axis * concat_input_size_,
+          bottom_data + n * bottom_concat_axis * concat_input_size_,
+          top_data
+              + (n * top_concat_axis + offset_concat_axis)
+                  * concat_input_size_);
+    }
+    offset_concat_axis += bottom_concat_axis;
+  }
 }
 
 template <typename Dtype>
 void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* top_diff = top[0]->cpu_diff();
-	int offset_concat_axis = 0;
-	const int top_concat_axis = top[0]->shape(concat_axis_);
-	for (int i = 0; i < bottom.size(); ++i) {
-		if (!propagate_down[i]) {
-			continue;
-		}
-		Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
-		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-		for (int n = 0; n < num_concats_; ++n) {
-			caffe_copy(bottom_concat_axis * concat_input_size_, top_diff +
-					(n * top_concat_axis + offset_concat_axis) * concat_input_size_,
-					bottom_diff + n * bottom_concat_axis * concat_input_size_);
-		}
-		offset_concat_axis += bottom_concat_axis;
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->cpu_diff();
+  int offset_concat_axis = 0;
+  const int top_concat_axis = top[0]->shape(concat_axis_);
+  for (int i = 0; i < bottom.size(); ++i) {
+    if (!propagate_down[i]) {
+      continue;
+    }
+    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+    for (int n = 0; n < num_concats_; ++n) {
+      caffe_copy(bottom_concat_axis * concat_input_size_,
+          top_diff
+              + (n * top_concat_axis + offset_concat_axis) * concat_input_size_,
+          bottom_diff + n * bottom_concat_axis * concat_input_size_);
+    }
+    offset_concat_axis += bottom_concat_axis;
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	if (bottom.size() == 1) {
-		return;
-	}
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	int offset_concat_axis = 0;
-	const int top_concat_axis = top[0]->shape(concat_axis_);
-	const bool kForward = true;
-	for (int i = 0; i < bottom.size(); ++i) {
-		const Dtype* bottom_data = bottom[i]->gpu_data();
-		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-		const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-		const int nthreads = bottom_concat_size * num_concats_;
-		Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_,
-				top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
-		offset_concat_axis += bottom_concat_axis;
-	}
+    const vector<Blob<Dtype>*>& top) {
+  if (bottom.size() == 1) {
+    return;
+  }
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  int offset_concat_axis = 0;
+  const int top_concat_axis = top[0]->shape(concat_axis_);
+  const bool kForward = true;
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
+    const int nthreads = bottom_concat_size * num_concats_;
+    Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_,
+        top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
+    offset_concat_axis += bottom_concat_axis;
+  }
 }
 
 template <typename Dtype>
 void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (bottom.size() == 1) {
-		return;
-	}
-	const Dtype* top_diff = top[0]->gpu_diff();
-	int offset_concat_axis = 0;
-	const int top_concat_axis = top[0]->shape(concat_axis_);
-	const bool kForward = false;
-	for (int i = 0; i < bottom.size(); ++i) {
-		const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-		if (propagate_down[i]) {
-			Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-			const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-			const int nthreads = bottom_concat_size * num_concats_;
-			Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_,
-					top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
-		}
-		offset_concat_axis += bottom_concat_axis;
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (bottom.size() == 1) {
+    return;
+  }
+  const Dtype* top_diff = top[0]->gpu_diff();
+  int offset_concat_axis = 0;
+  const int top_concat_axis = top[0]->shape(concat_axis_);
+  const bool kForward = false;
+  for (int i = 0; i < bottom.size(); ++i) {
+    const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
+    if (propagate_down[i]) {
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
+      const int nthreads = bottom_concat_size * num_concats_;
+      Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_,
+          top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
+    }
+    offset_concat_axis += bottom_concat_axis;
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index f6265726..a8e6f523 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -9,173 +9,151 @@
 namespace caffe {
 
 template <typename Dtype>
-void ContrastiveLossLayer<Dtype>::LayerSetUp(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::LayerSetUp(bottom, top);
-	CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
-	CHECK_EQ(bottom[0]->height(), 1);
-	CHECK_EQ(bottom[0]->width(), 1);
-	CHECK_EQ(bottom[1]->height(), 1);
-	CHECK_EQ(bottom[1]->width(), 1);
-	CHECK_EQ(bottom[2]->channels(), 1);
-	CHECK_EQ(bottom[2]->height(), 1);
-	CHECK_EQ(bottom[2]->width(), 1);
-	diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
-	diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
-	dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1);
-	// vector of ones used to sum along channels
-	summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1);
-	for (int i = 0; i < bottom[0]->channels(); ++i)
-		summer_vec_.mutable_cpu_data()[i] = Dtype(1);
+void ContrastiveLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::LayerSetUp(bottom, top);
+  CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
+  CHECK_EQ(bottom[0]->height(), 1);
+  CHECK_EQ(bottom[0]->width(), 1);
+  CHECK_EQ(bottom[1]->height(), 1);
+  CHECK_EQ(bottom[1]->width(), 1);
+  CHECK_EQ(bottom[2]->channels(), 1);
+  CHECK_EQ(bottom[2]->height(), 1);
+  CHECK_EQ(bottom[2]->width(), 1);
+  diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
+  diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
+  dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1);
+  // vector of ones used to sum along channels
+  summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1);
+  for (int i = 0; i < bottom[0]->channels(); ++i)
+    summer_vec_.mutable_cpu_data()[i] = Dtype(1);
 }
 
 template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_cpu(
-		const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	int count = bottom[0]->count();
-	caffe_sub(
-			count,
-			bottom[0]->cpu_data(),  // a
-			bottom[1]->cpu_data(),  // b
-			diff_.mutable_cpu_data());  // a_i-b_i
-	const int channels = bottom[0]->channels();
-	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-	bool legacy_version =
-			this->layer_param_.contrastive_loss_param().legacy_version();
-	Dtype loss(0.0);
-	for (int i = 0; i < bottom[0]->num(); ++i) {
-		dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
-				diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels));
-		if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
-			loss += dist_sq_.cpu_data()[i];
-		} else {  // dissimilar pairs
-			if (legacy_version) {
-				loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
-			} else {
-				Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
-				loss += dist * dist;
-			}
-		}
-	}
-	loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
-	top[0]->mutable_cpu_data()[0] = loss;
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  int count = bottom[0]->count();
+  caffe_sub(count, bottom[0]->cpu_data(),  // a
+      bottom[1]->cpu_data(),  // b
+      diff_.mutable_cpu_data());  // a_i-b_i
+  const int channels = bottom[0]->channels();
+  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
+  Dtype loss(0.0);
+  for (int i = 0; i < bottom[0]->num(); ++i) {
+    dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
+        diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels));
+    if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
+      loss += dist_sq_.cpu_data()[i];
+    } else {  // dissimilar pairs
+      if (legacy_version) {
+        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+      } else {
+        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
+        loss += dist * dist;
+      }
+    }
+  }
+  loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
 }
 
 template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-	bool legacy_version =
-			this->layer_param_.contrastive_loss_param().legacy_version();
-	for (int i = 0; i < 2; ++i) {
-		if (propagate_down[i]) {
-			const Dtype sign = (i == 0) ? 1 : -1;
-			const Dtype alpha = sign * top[0]->cpu_diff()[0] /
-					static_cast<Dtype>(bottom[i]->num());
-			int num = bottom[i]->num();
-			int channels = bottom[i]->channels();
-			for (int j = 0; j < num; ++j) {
-				Dtype* bout = bottom[i]->mutable_cpu_diff();
-				if (static_cast<int>(bottom[2]->cpu_data()[j])) {  // similar pairs
-					caffe_cpu_axpby(
-							channels,
-							alpha,
-							diff_.cpu_data() + (j * channels),
-							Dtype(0.0),
-							bout + (j * channels));
-				} else {  // dissimilar pairs
-					Dtype mdist(0.0);
-					Dtype beta(0.0);
-					if (legacy_version) {
-						mdist = margin - dist_sq_.cpu_data()[j];
-						beta = -alpha;
-					} else {
-						Dtype dist = sqrt(dist_sq_.cpu_data()[j]);
-						mdist = margin - dist;
-						beta = -alpha * mdist / (dist + Dtype(1e-4));
-					}
-					if (mdist > Dtype(0.0)) {
-						caffe_cpu_axpby(
-								channels,
-								beta,
-								diff_.cpu_data() + (j * channels),
-								Dtype(0.0),
-								bout + (j * channels));
-					} else {
-						caffe_set(channels, Dtype(0), bout + (j * channels));
-					}
-				}
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
+  for (int i = 0; i < 2; ++i) {
+    if (propagate_down[i]) {
+      const Dtype sign = (i == 0) ? 1 : -1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0]
+          / static_cast<Dtype>(bottom[i]->num());
+      int num = bottom[i]->num();
+      int channels = bottom[i]->channels();
+      for (int j = 0; j < num; ++j) {
+        Dtype* bout = bottom[i]->mutable_cpu_diff();
+        if (static_cast<int>(bottom[2]->cpu_data()[j])) {  // similar pairs
+          caffe_cpu_axpby(channels, alpha, diff_.cpu_data() + (j * channels),
+              Dtype(0.0), bout + (j * channels));
+        } else {  // dissimilar pairs
+          Dtype mdist(0.0);
+          Dtype beta(0.0);
+          if (legacy_version) {
+            mdist = margin - dist_sq_.cpu_data()[j];
+            beta = -alpha;
+          } else {
+            Dtype dist = sqrt(dist_sq_.cpu_data()[j]);
+            mdist = margin - dist;
+            beta = -alpha * mdist / (dist + Dtype(1e-4));
+          }
+          if (mdist > Dtype(0.0)) {
+            caffe_cpu_axpby(channels, beta, diff_.cpu_data() + (j * channels),
+                Dtype(0.0), bout + (j * channels));
+          } else {
+            caffe_set(channels, Dtype(0), bout + (j * channels));
+          }
+        }
+      }
+    }
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_gpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	const int count = bottom[0]->count();
-	caffe_gpu_sub(
-			count,
-			bottom[0]->gpu_data(),  // a
-			bottom[1]->gpu_data(),  // b
-			diff_.mutable_gpu_data());  // a_i-b_i
-	caffe_gpu_powx(
-			count,
-			diff_.mutable_gpu_data(),  // a_i-b_i
-			Dtype(2),
-			diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
-	caffe_gpu_gemv(
-			CblasNoTrans,
-			bottom[0]->num(),
-			bottom[0]->channels(),
-			Dtype(1.0),
-			diff_sq_.gpu_data(),  // (a_i-b_i)^2
-			summer_vec_.gpu_data(),
-			Dtype(0.0),
-			dist_sq_.mutable_gpu_data());  // \Sum (a_i-b_i)^2
-	Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-	bool legacy_version =
-			this->layer_param_.contrastive_loss_param().legacy_version();
-	Dtype loss(0.0);
-	for (int i = 0; i < bottom[0]->num(); ++i) {
-		if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
-			loss += dist_sq_.cpu_data()[i];
-		} else {  // dissimilar pairs
-			if (legacy_version) {
-				loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
-			} else {
-				Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
-				loss += dist * dist;
-			}
-		}
-	}
-	loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
-	top[0]->mutable_cpu_data()[0] = loss;
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const int count = bottom[0]->count();
+  caffe_gpu_sub(count, bottom[0]->gpu_data(),  // a
+      bottom[1]->gpu_data(),  // b
+      diff_.mutable_gpu_data());  // a_i-b_i
+  caffe_gpu_powx(count, diff_.mutable_gpu_data(),  // a_i-b_i
+      Dtype(2), diff_sq_.mutable_gpu_data());  // (a_i-b_i)^2
+  caffe_gpu_gemv(CblasNoTrans, bottom[0]->num(), bottom[0]->channels(),
+      Dtype(1.0), diff_sq_.gpu_data(),  // (a_i-b_i)^2
+      summer_vec_.gpu_data(), Dtype(0.0), dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2
+  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+  bool legacy_version =
+      this->layer_param_.contrastive_loss_param().legacy_version();
+  Dtype loss(0.0);
+  for (int i = 0; i < bottom[0]->num(); ++i) {
+    if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs
+      loss += dist_sq_.cpu_data()[i];
+    } else {  // dissimilar pairs
+      if (legacy_version) {
+        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));
+      } else {
+        Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0);
+        loss += dist * dist;
+      }
+    }
+  }
+  loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
 }
 
 template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	for (int i = 0; i < 2; ++i) {
-		if (propagate_down[i]) {
-			const int count = bottom[0]->count();
-			const int channels = bottom[0]->channels();
-			Dtype margin = this->layer_param_.contrastive_loss_param().margin();
-			const bool legacy_version =
-					this->layer_param_.contrastive_loss_param().legacy_version();
-			const Dtype sign = (i == 0) ? 1 : -1;
-			const Dtype alpha = sign * top[0]->cpu_diff()[0] /
-					static_cast<Dtype>(bottom[0]->num());
-			// NOLINT_NEXT_LINE(whitespace/operators)
-			CLLBackward(count, channels, margin, legacy_version, alpha,
-					bottom[2]->gpu_data(),  // pair similarity 0 or 1
-					diff_.gpu_data(),  // the cached eltwise difference between a and b
-					dist_sq_.gpu_data(),  // the cached square distance between a and b
-					bottom[i]->mutable_gpu_diff());
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < 2; ++i) {
+    if (propagate_down[i]) {
+      const int count = bottom[0]->count();
+      const int channels = bottom[0]->channels();
+      Dtype margin = this->layer_param_.contrastive_loss_param().margin();
+      const bool legacy_version =
+          this->layer_param_.contrastive_loss_param().legacy_version();
+      const Dtype sign = (i == 0) ? 1 : -1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0]
+          / static_cast<Dtype>(bottom[0]->num());
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      CLLBackward(count, channels, margin, legacy_version, alpha,
+          bottom[2]->gpu_data(),  // pair similarity 0 or 1
+          diff_.gpu_data(),  // the cached eltwise difference between a and b
+          dist_sq_.gpu_data(),  // the cached square distance between a and b
+          bottom[i]->mutable_gpu_diff());
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 0a989f69..9c250c42 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -9,230 +9,221 @@ namespace caffe {
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::compute_output_shape() {
-	this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
-			/ this->stride_h_ + 1;
-	this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
-			/ this->stride_w_ + 1;
+  this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
+      / this->stride_h_ + 1;
+  this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
+      / this->stride_w_ + 1;
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* weight = this->blobs_[0]->cpu_data();
-	for (int i = 0; i < bottom.size(); ++i) {
-		const Dtype* bottom_data = bottom[i]->cpu_data();
-		Dtype* top_data = top[i]->mutable_cpu_data();
-		for (int n = 0; n < this->num_; ++n) {
-			this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-					top_data + top[i]->offset(n));
-			if (this->bias_term_) {
-				const Dtype* bias = this->blobs_[1]->cpu_data();
-				this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
-			}
-		}
-	}
-
-	// CHECK_BLOB_DATA(top[0],20, "top[0]");
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->cpu_data();
+    Dtype* top_data = top[i]->mutable_cpu_data();
+    for (int n = 0; n < this->num_; ++n) {
+      this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
+          top_data + top[i]->offset(n));
+      if (this->bias_term_) {
+        const Dtype* bias = this->blobs_[1]->cpu_data();
+        this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
+      }
+    }
+  }
+
+  // CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* weight = this->blobs_[0]->cpu_data();
-	Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-	for (int i = 0; i < top.size(); ++i) {
-		const Dtype* top_diff = top[i]->cpu_diff();
-		const Dtype* bottom_data = bottom[i]->cpu_data();
-		Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
-		// Bias gradient, if necessary.
-		if (this->bias_term_ && this->param_propagate_down_[1]) {
-			Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
-			for (int n = 0; n < this->num_; ++n) {
-				this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
-			}
-		}
-		if (this->param_propagate_down_[0] || propagate_down[i]) {
-			for (int n = 0; n < this->num_; ++n) {
-				// gradient w.r.t. weight. Note that we will accumulate diffs.
-				if (this->param_propagate_down_[0]) {
-					this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n),
-							top_diff + top[i]->offset(n), weight_diff);
-				}
-				// gradient w.r.t. bottom data, if necessary.
-				if (propagate_down[i]) {
-					this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight,
-							bottom_diff + bottom[i]->offset(n));
-				}
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->cpu_diff();
+    const Dtype* bottom_data = bottom[i]->cpu_data();
+    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
+      for (int n = 0; n < this->num_; ++n) {
+        this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      for (int n = 0; n < this->num_; ++n) {
+        // gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n),
+              top_diff + top[i]->offset(n), weight_diff);
+        }
+        // gradient w.r.t. bottom data, if necessary.
+        if (propagate_down[i]) {
+          this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight,
+              bottom_diff + bottom[i]->offset(n));
+        }
+      }
+    }
+  }
 
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	if (use_packing_scheme && global_packing_N > 1)
-		Forward_gpu_opt2(bottom, top);
-	else
-		Forward_gpu_org(bottom, top);
+    const vector<Blob<Dtype>*>& top) {
+  if (use_packing_scheme && global_packing_N > 1)
+    Forward_gpu_opt2(bottom, top);
+  else
+    Forward_gpu_org(bottom, top);
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (use_packing_scheme && global_packing_N > 1)
-		Backward_gpu_opt2(top, propagate_down, bottom);
-	else
-		Backward_gpu_org(top, propagate_down, bottom);
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (use_packing_scheme && global_packing_N > 1)
+    Backward_gpu_opt2(top, propagate_down, bottom);
+  else
+    Backward_gpu_org(top, propagate_down, bottom);
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_opt2(
-		const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* weight = this->blobs_[0]->gpu_data();
-	for (int i = 0; i < bottom.size(); ++i) {
-		const Dtype* bottom_data = bottom[i]->gpu_data();
-		//CHECK_BLOB_DATA(bottom[i],10,"bottom");
-
-		Dtype* top_data = top[i]->mutable_gpu_data();
-		this->opt_num2 = global_packing_N;
-		this->weight_offset_ = this->M_ * this->K_;
-		for (int n = 0; n < this->num_; n += this->opt_num2) {
-			this->opt_num2 =
-					this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2;
-			//intermediate variables to pass offset
-			this->top_offset_opt = this->M_ * this->N_ * this->opt_num2;
-			this->top_offset_ = top[i]->offset(n);
-			this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
-			this->bottom_offset_ = bottom[i]->offset(n);
-			this->forward_gpu_gemm_opt(bottom_data, weight,
-					top_data);
-			if (this->bias_term_) {
-				const Dtype* bias = this->blobs_[1]->gpu_data();
-				this->forward_gpu_bias_opt(top_data, bias);
-			}
-		}
-	}
-
-	//CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
-	//CHECK_BLOB_DATA(top[0],20, "top[0]");
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    //CHECK_BLOB_DATA(bottom[i],10,"bottom");
+
+    Dtype* top_data = top[i]->mutable_gpu_data();
+    this->opt_num2 = global_packing_N;
+    this->weight_offset_ = this->M_ * this->K_;
+    for (int n = 0; n < this->num_; n += this->opt_num2) {
+      this->opt_num2 =
+          this->opt_num2 > (this->num_ - n) ? (this->num_ - n) : this->opt_num2;
+      //intermediate variables to pass offset
+      this->top_offset_opt = this->M_ * this->N_ * this->opt_num2;
+      this->top_offset_ = top[i]->offset(n);
+      this->col_offset_ = this->K_ * this->N_ * this->opt_num2;
+      this->bottom_offset_ = bottom[i]->offset(n);
+      this->forward_gpu_gemm_opt(bottom_data, weight, top_data);
+      if (this->bias_term_) {
+        const Dtype* bias = this->blobs_[1]->gpu_data();
+        this->forward_gpu_bias_opt(top_data, bias);
+      }
+    }
+  }
+
+  //CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+  //CHECK_BLOB_DATA(top[0],20, "top[0]");
 
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_org(
-		const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* weight = this->blobs_[0]->gpu_data();
-	for (int i = 0; i < bottom.size(); ++i) {
-		const Dtype* bottom_data = bottom[i]->gpu_data();
-		//CHECK_BLOB_DATA(bottom[i],10,"bottom");
-
-		Dtype* top_data = top[i]->mutable_gpu_data();
-		for (int n = 0; n < this->num_; ++n) {
-			//two intermediate variables to pass offset
-			this->bottom_offset_ = bottom[i]->offset(n);
-			this->top_offset_ = top[i]->offset(n);
-			this->forward_gpu_gemm(bottom_data, weight,
-					top_data);
-
-			if (this->bias_term_) {
-				const Dtype* bias = this->blobs_[1]->gpu_data();
-				this->forward_gpu_bias(top_data, bias);
-			}
-		}
-	}
-
-	// CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
-	//CHECK_BLOB_DATA(top[0],20, "top[0]");
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    //CHECK_BLOB_DATA(bottom[i],10,"bottom");
+
+    Dtype* top_data = top[i]->mutable_gpu_data();
+    for (int n = 0; n < this->num_; ++n) {
+      //two intermediate variables to pass offset
+      this->bottom_offset_ = bottom[i]->offset(n);
+      this->top_offset_ = top[i]->offset(n);
+      this->forward_gpu_gemm(bottom_data, weight, top_data);
+
+      if (this->bias_term_) {
+        const Dtype* bias = this->blobs_[1]->gpu_data();
+        this->forward_gpu_bias(top_data, bias);
+      }
+    }
+  }
+
+  // CHECK_BLOB_DATA(this->blobs_[0],20, "weights");
+  //CHECK_BLOB_DATA(top[0],20, "top[0]");
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* weight = this->blobs_[0]->gpu_data();
-	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-	for (int i = 0; i < top.size(); ++i) {
-		const Dtype* top_diff = top[i]->gpu_diff();
-
-		// Bias gradient, if necessary.
-		if (this->bias_term_ && this->param_propagate_down_[1]) {
-			Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-			ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
-			for (int n = 0; n < this->num_; ++n) {
-				this->top_offset_ = top[i]->offset(n);
-				this->backward_gpu_bias(bias_diff, top_diff);
-			}
-		}
-		if (this->param_propagate_down_[0] || propagate_down[i]) {
-			const Dtype* bottom_data = bottom[i]->gpu_data();
-			Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-			this->weight_offset_ = this->M_ * this->K_;
-			this->opt_num2 = global_packing_N;
-			for (int n = 0; n < this->num_; n += this->opt_num2) {
-				this->opt_num2 =
-						this->opt_num2 > (this->num_ - n) ?
-																								(this->num_ - n) :
-																								this->opt_num2;
-				this->top_offset_ = top[i]->offset(n);
-				this->bottom_offset_ = bottom[i]->offset(n);
-				this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
-				this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
-				// gradient w.r.t. weight. Note that we will accumulate diffs.
-				if (this->param_propagate_down_[0]) {
-					this->weight_gpu_gemm_opt(bottom_data,
-							top_diff, weight_diff);
-				}
-				// gradient w.r.t. bottom data, if necessary.
-				if (propagate_down[i]) {
-					this->backward_gpu_gemm_opt(top_diff, weight,
-							bottom_diff);
-				}
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+      ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
+      for (int n = 0; n < this->num_; ++n) {
+        this->top_offset_ = top[i]->offset(n);
+        this->backward_gpu_bias(bias_diff, top_diff);
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      this->weight_offset_ = this->M_ * this->K_;
+      this->opt_num2 = global_packing_N;
+      for (int n = 0; n < this->num_; n += this->opt_num2) {
+        this->opt_num2 =
+            this->opt_num2 > (this->num_ - n) ?
+                (this->num_ - n) : this->opt_num2;
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        this->col_offset_ = this->K_ * (this->N_ * this->opt_num2);
+        this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2);
+        // gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          this->weight_gpu_gemm_opt(bottom_data, top_diff, weight_diff);
+        }
+        // gradient w.r.t. bottom data, if necessary.
+        if (propagate_down[i]) {
+          this->backward_gpu_gemm_opt(top_diff, weight, bottom_diff);
+        }
+      }
+    }
+  }
 
 }
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* weight = this->blobs_[0]->gpu_data();
-	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-	for (int i = 0; i < top.size(); ++i) {
-		const Dtype* top_diff = top[i]->gpu_diff();
-
-		// Bias gradient, if necessary.
-		if (this->bias_term_ && this->param_propagate_down_[1]) {
-			Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-			for (int n = 0; n < this->num_; ++n) {
-				//
-				this->top_offset_ = top[i]->offset(n);
-				this->bottom_offset_ = bottom[i]->offset(n);
-				this->backward_gpu_bias(bias_diff, top_diff);
-			}
-		}
-		if (this->param_propagate_down_[0] || propagate_down[i]) {
-			const Dtype* bottom_data = bottom[i]->gpu_data();
-			Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-			for (int n = 0; n < this->num_; ++n) {
-				this->top_offset_ = top[i]->offset(n);
-				this->bottom_offset_ = bottom[i]->offset(n);
-				// gradient w.r.t. weight. Note that we will accumulate diffs.
-				if (this->param_propagate_down_[0]) {
-					this->weight_gpu_gemm(bottom_data,
-							top_diff, weight_diff);
-				}
-				// gradient w.r.t. bottom data, if necessary.
-				if (propagate_down[i]) {
-					this->backward_gpu_gemm(top_diff, weight,
-							bottom_diff);
-				}
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+      for (int n = 0; n < this->num_; ++n) {
+        //
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        this->backward_gpu_bias(bias_diff, top_diff);
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      for (int n = 0; n < this->num_; ++n) {
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        // gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          this->weight_gpu_gemm(bottom_data, top_diff, weight_diff);
+        }
+        // gradient w.r.t. bottom data, if necessary.
+        if (propagate_down[i]) {
+          this->backward_gpu_gemm(top_diff, weight, bottom_diff);
+        }
+      }
+    }
+  }
 
 }
 // end: code written/modified by AMD
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index e9ee5221..fdae75a0 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -18,108 +18,108 @@ namespace caffe {
 
 template <typename Dtype>
 DataLayer<Dtype>::~DataLayer<Dtype>() {
-	this->JoinPrefetchThread();
+  this->JoinPrefetchThread();
 }
 
 template <typename Dtype>
 void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	// Initialize DB
-	db_.reset(db::GetDB(this->layer_param_.data_param().backend()));
-	db_->Open(this->layer_param_.data_param().source(), db::READ);
-	cursor_.reset(db_->NewCursor());
+    const vector<Blob<Dtype>*>& top) {
+  // Initialize DB
+  db_.reset(db::GetDB(this->layer_param_.data_param().backend()));
+  db_->Open(this->layer_param_.data_param().source(), db::READ);
+  cursor_.reset(db_->NewCursor());
 
-	// Check if we should randomly skip a few data points
-	if (this->layer_param_.data_param().rand_skip()) {
-		unsigned int skip = caffe_rng_rand() %
-				this->layer_param_.data_param().rand_skip();
-		LOG(INFO) << "Skipping first " << skip << " data points.";
-		while (skip-- > 0) {
-			cursor_->Next();
-		}
-	}
-	// Read a data point, to initialize the prefetch and top blobs.
-	Datum datum;
-	datum.ParseFromString(cursor_->value());
-	// Use data_transformer to infer the expected blob shape from datum.
-	vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
-	this->transformed_data_.Reshape(top_shape);
-	// Reshape top[0] and prefetch_data according to the batch_size.
-	top_shape[0] = this->layer_param_.data_param().batch_size();
-	this->prefetch_data_.Reshape(top_shape);
-	top[0]->ReshapeLike(this->prefetch_data_);
-	this->prefetch_data_.set_data_layer();
+  // Check if we should randomly skip a few data points
+  if (this->layer_param_.data_param().rand_skip()) {
+    unsigned int skip = caffe_rng_rand()
+        % this->layer_param_.data_param().rand_skip();
+    LOG(INFO) << "Skipping first " << skip << " data points.";
+    while (skip-- > 0) {
+      cursor_->Next();
+    }
+  }
+  // Read a data point, to initialize the prefetch and top blobs.
+  Datum datum;
+  datum.ParseFromString(cursor_->value());
+  // Use data_transformer to infer the expected blob shape from datum.
+  vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
+  this->transformed_data_.Reshape(top_shape);
+  // Reshape top[0] and prefetch_data according to the batch_size.
+  top_shape[0] = this->layer_param_.data_param().batch_size();
+  this->prefetch_data_.Reshape(top_shape);
+  top[0]->ReshapeLike(this->prefetch_data_);
+  this->prefetch_data_.set_data_layer();
 
-	LOG(INFO) << "output data size: " << top[0]->num() << ","
-			<< top[0]->channels() << "," << top[0]->height() << ","
-			<< top[0]->width();
-	// label
-	if (this->output_labels_) {
-		vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
-		top[1]->Reshape(label_shape);
-		this->prefetch_label_.Reshape(label_shape);
-		this->prefetch_label_.set_data_layer();
-	}
+  LOG(INFO) << "output data size: " << top[0]->num() << ","
+      << top[0]->channels() << "," << top[0]->height() << ","
+      << top[0]->width();
+  // label
+  if (this->output_labels_) {
+    vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
+    top[1]->Reshape(label_shape);
+    this->prefetch_label_.Reshape(label_shape);
+    this->prefetch_label_.set_data_layer();
+  }
 }
 
 // This function is used to create a thread that prefetches the data.
 template <typename Dtype>
 void DataLayer<Dtype>::InternalThreadEntry() {
-	CPUTimer batch_timer;
-	batch_timer.Start();
-	double read_time = 0;
-	double trans_time = 0;
-	CPUTimer timer;
-	CHECK(this->prefetch_data_.count());
-	CHECK(this->transformed_data_.count());
+  CPUTimer batch_timer;
+  batch_timer.Start();
+  double read_time = 0;
+  double trans_time = 0;
+  CPUTimer timer;
+  CHECK(this->prefetch_data_.count());
+  CHECK(this->transformed_data_.count());
 
-	// Reshape according to the first datum of each batch
-	// on single input batches allows for inputs of varying dimension.
-	const int batch_size = this->layer_param_.data_param().batch_size();
-	Datum datum;
-	datum.ParseFromString(cursor_->value());
-	// Use data_transformer to infer the expected blob shape from datum.
-	vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
-	this->transformed_data_.Reshape(top_shape);
-	// Reshape prefetch_data according to the batch_size.
-	top_shape[0] = batch_size;
-	this->prefetch_data_.Reshape(top_shape);
+  // Reshape according to the first datum of each batch
+  // on single input batches allows for inputs of varying dimension.
+  const int batch_size = this->layer_param_.data_param().batch_size();
+  Datum datum;
+  datum.ParseFromString(cursor_->value());
+  // Use data_transformer to infer the expected blob shape from datum.
+  vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
+  this->transformed_data_.Reshape(top_shape);
+  // Reshape prefetch_data according to the batch_size.
+  top_shape[0] = batch_size;
+  this->prefetch_data_.Reshape(top_shape);
 
-	Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
-	Dtype* top_label = NULL;  // suppress warnings about uninitialized variables
+  Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
+  Dtype* top_label = NULL;  // suppress warnings about uninitialized variables
 
-	if (this->output_labels_) {
-		top_label = this->prefetch_label_.mutable_cpu_data();
-	}
-	timer.Start();
-	for (int item_id = 0; item_id < batch_size; ++item_id) {
-		// get a datum
-		Datum datum;
-		datum.ParseFromString(cursor_->value());
-		read_time += timer.MicroSeconds();
-		timer.Start();
-		// Apply data transformations (mirror, scale, crop...)
-		int offset = this->prefetch_data_.offset(item_id);
-		this->transformed_data_.set_cpu_data(top_data + offset);
-		this->data_transformer_->Transform(datum, &(this->transformed_data_));
-		// Copy label.
-		if (this->output_labels_) {
-			top_label[item_id] = datum.label();
-		}
-		trans_time += timer.MicroSeconds();
-		timer.Start();
-		// go to the next item.
-		cursor_->Next();
-		if (!cursor_->valid()) {
-			DLOG(INFO) << "Restarting data prefetching from start.";
-			cursor_->SeekToFirst();
-		}
-	}
-	timer.Stop();
-	batch_timer.Stop();
-	DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
-	DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
-	DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
+  if (this->output_labels_) {
+    top_label = this->prefetch_label_.mutable_cpu_data();
+  }
+  timer.Start();
+  for (int item_id = 0; item_id < batch_size; ++item_id) {
+    // get a datum
+    Datum datum;
+    datum.ParseFromString(cursor_->value());
+    read_time += timer.MicroSeconds();
+    timer.Start();
+    // Apply data transformations (mirror, scale, crop...)
+    int offset = this->prefetch_data_.offset(item_id);
+    this->transformed_data_.set_cpu_data(top_data + offset);
+    this->data_transformer_->Transform(datum, &(this->transformed_data_));
+    // Copy label.
+    if (this->output_labels_) {
+      top_label[item_id] = datum.label();
+    }
+    trans_time += timer.MicroSeconds();
+    timer.Start();
+    // go to the next item.
+    cursor_->Next();
+    if (!cursor_->valid()) {
+      DLOG(INFO) << "Restarting data prefetching from start.";
+      cursor_->SeekToFirst();
+    }
+  }
+  timer.Stop();
+  batch_timer.Stop();
+  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
+  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
+  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
 }
 
 INSTANTIATE_CLASS (DataLayer);
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index 402a787e..8ee81c9f 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -10,119 +10,119 @@ namespace caffe {
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::compute_output_shape() {
-	this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_
-			- 2 * this->pad_h_;
-	this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_
-			- 2 * this->pad_w_;
+  this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_
+      - 2 * this->pad_h_;
+  this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_
+      - 2 * this->pad_w_;
 }
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* weight = this->blobs_[0]->cpu_data();
-	for (int i = 0; i < bottom.size(); ++i) {
-		const Dtype* bottom_data = bottom[i]->cpu_data();
-		Dtype* top_data = top[i]->mutable_cpu_data();
-		for (int n = 0; n < this->num_; ++n) {
-			this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-					top_data + top[i]->offset(n));
-			if (this->bias_term_) {
-				const Dtype* bias = this->blobs_[1]->cpu_data();
-				this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
-			}
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->cpu_data();
+    Dtype* top_data = top[i]->mutable_cpu_data();
+    for (int n = 0; n < this->num_; ++n) {
+      this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
+          top_data + top[i]->offset(n));
+      if (this->bias_term_) {
+        const Dtype* bias = this->blobs_[1]->cpu_data();
+        this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
+      }
+    }
+  }
 }
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* weight = this->blobs_[0]->cpu_data();
-	Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-	for (int i = 0; i < top.size(); ++i) {
-		const Dtype* top_diff = top[i]->cpu_diff();
-		const Dtype* bottom_data = bottom[i]->cpu_data();
-		Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
-		// Bias gradient, if necessary.
-		if (this->bias_term_ && this->param_propagate_down_[1]) {
-			Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
-			for (int n = 0; n < this->num_; ++n) {
-				this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
-			}
-		}
-		if (this->param_propagate_down_[0] || propagate_down[i]) {
-			for (int n = 0; n < this->num_; ++n) {
-				// Gradient w.r.t. weight. Note that we will accumulate diffs.
-				if (this->param_propagate_down_[0]) {
-					this->weight_cpu_gemm(top_diff + top[i]->offset(n),
-							bottom_data + bottom[i]->offset(n), weight_diff);
-				}
-				// Gradient w.r.t. bottom data, if necessary, reusing the column buffer
-				// we might have just computed above.
-				if (propagate_down[i]) {
-					this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight,
-							bottom_diff + bottom[i]->offset(n),
-							this->param_propagate_down_[0]);
-				}
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->cpu_diff();
+    const Dtype* bottom_data = bottom[i]->cpu_data();
+    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
+      for (int n = 0; n < this->num_; ++n) {
+        this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      for (int n = 0; n < this->num_; ++n) {
+        // Gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          this->weight_cpu_gemm(top_diff + top[i]->offset(n),
+              bottom_data + bottom[i]->offset(n), weight_diff);
+        }
+        // Gradient w.r.t. bottom data, if necessary, reusing the column buffer
+        // we might have just computed above.
+        if (propagate_down[i]) {
+          this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight,
+              bottom_diff + bottom[i]->offset(n),
+              this->param_propagate_down_[0]);
+        }
+      }
+    }
+  }
 }
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* weight = this->blobs_[0]->gpu_data();
-	for (int i = 0; i < bottom.size(); ++i) {
-		const Dtype* bottom_data = bottom[i]->gpu_data();
-		Dtype* top_data = top[i]->mutable_gpu_data();
-		for (int n = 0; n < this->num_; ++n) {
-			this->bottom_offset_ = bottom[i]->offset(n);
-			this->top_offset_ = top[i]->offset(n);
-			this->backward_gpu_gemm(bottom_data, weight, top_data);
-			if (this->bias_term_) {
-				const Dtype* bias = this->blobs_[1]->gpu_data();
-				this->forward_gpu_bias(top_data, bias);
-			}
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    Dtype* top_data = top[i]->mutable_gpu_data();
+    for (int n = 0; n < this->num_; ++n) {
+      this->bottom_offset_ = bottom[i]->offset(n);
+      this->top_offset_ = top[i]->offset(n);
+      this->backward_gpu_gemm(bottom_data, weight, top_data);
+      if (this->bias_term_) {
+        const Dtype* bias = this->blobs_[1]->gpu_data();
+        this->forward_gpu_bias(top_data, bias);
+      }
+    }
+  }
 }
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* weight = this->blobs_[0]->gpu_data();
-	Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-	for (int i = 0; i < top.size(); ++i) {
-		const Dtype* top_diff = top[i]->gpu_diff();
-		const Dtype* bottom_data = bottom[i]->gpu_data();
-		Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-		// Bias gradient, if necessary.
-		if (this->bias_term_ && this->param_propagate_down_[1]) {
-			Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-			for (int n = 0; n < this->num_; ++n) {
-				this->top_offset_ = top[i]->offset(n);
-				this->bottom_offset_ = bottom[i]->offset(n);
-				this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
-			}
-		}
-		if (this->param_propagate_down_[0] || propagate_down[i]) {
-			for (int n = 0; n < this->num_; ++n) {
-				this->top_offset_ = top[i]->offset(n);
-				this->bottom_offset_ = bottom[i]->offset(n);
-				// gradient w.r.t. weight. Note that we will accumulate diffs.
-				if (this->param_propagate_down_[0]) {
-					this->weight_gpu_gemm(top_diff + top[i]->offset(n),
-							bottom_data + bottom[i]->offset(n), weight_diff);
-				}
-				// gradient w.r.t. bottom data, if necessary.
-				if (propagate_down[i]) {
-					this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-							bottom_diff + bottom[i]->offset(n));
-				}
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+    // Bias gradient, if necessary.
+    if (this->bias_term_ && this->param_propagate_down_[1]) {
+      Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+      for (int n = 0; n < this->num_; ++n) {
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
+      }
+    }
+    if (this->param_propagate_down_[0] || propagate_down[i]) {
+      for (int n = 0; n < this->num_; ++n) {
+        this->top_offset_ = top[i]->offset(n);
+        this->bottom_offset_ = bottom[i]->offset(n);
+        // gradient w.r.t. weight. Note that we will accumulate diffs.
+        if (this->param_propagate_down_[0]) {
+          this->weight_gpu_gemm(top_diff + top[i]->offset(n),
+              bottom_data + bottom[i]->offset(n), weight_diff);
+        }
+        // gradient w.r.t. bottom data, if necessary.
+        if (propagate_down[i]) {
+          this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
+              bottom_diff + bottom[i]->offset(n));
+        }
+      }
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 6692f238..f717fdbb 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -12,69 +12,67 @@ namespace caffe {
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::ocl_setup(int bottom_count) {
-	MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			bottom_count * sizeof(int), NULL, NULL);
+  MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      bottom_count * sizeof(int), NULL, NULL);
 }
 
 template <typename Dtype>
 DropoutLayer<Dtype>::~DropoutLayer() {
-	OCL_CHECK (clReleaseMemObject(MaskMem) );
-	}template <typename Dtype>
+  OCL_CHECK (clReleaseMemObject(MaskMem) );}template <typename Dtype>
 void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
-	threshold_ = this->layer_param_.dropout_param().dropout_ratio();
-	DCHECK(threshold_ > 0.);
-	DCHECK(threshold_ < 1.);
-	scale_ = 1. / (1. - threshold_);
-	uint_thres_ = static_cast<unsigned int>(UINT_MAX * threshold_);
-	ocl_setup(bottom[0]->count());
+    const vector<Blob<Dtype>*>& top) {
+  NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+  threshold_ = this->layer_param_.dropout_param().dropout_ratio();
+  DCHECK(threshold_ > 0.);
+  DCHECK(threshold_ < 1.);
+  scale_ = 1. / (1. - threshold_);
+  uint_thres_ = static_cast<unsigned int>(UINT_MAX * threshold_);
+  ocl_setup(bottom[0]->count());
 }
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	NeuronLayer < Dtype > ::Reshape(bottom, top);
-	// Set up the cache for random number generation
-	rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-			bottom[0]->height(), bottom[0]->width());
+    const vector<Blob<Dtype>*>& top) {
+  NeuronLayer < Dtype > ::Reshape(bottom, top);
+  // Set up the cache for random number generation
+  rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+      bottom[0]->height(), bottom[0]->width());
 }
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	unsigned int* mask = rand_vec_.mutable_cpu_data();
-	const int count = bottom[0]->count();
-	if (this->phase_ == TRAIN) {
-		// Create random numbers
-		caffe_rng_bernoulli(count, 1. - threshold_, mask);
-		for (int i = 0; i < count; ++i) {
-			top_data[i] = bottom_data[i] * mask[i] * scale_;
-		}
-	} else {
-		caffe_copy(bottom[0]->count(), bottom_data, top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  unsigned int* mask = rand_vec_.mutable_cpu_data();
+  const int count = bottom[0]->count();
+  if (this->phase_ == TRAIN) {
+    // Create random numbers
+    caffe_rng_bernoulli(count, 1. - threshold_, mask);
+    for (int i = 0; i < count; ++i) {
+      top_data[i] = bottom_data[i] * mask[i] * scale_;
+    }
+  } else {
+    caffe_copy(bottom[0]->count(), bottom_data, top_data);
+  }
 }
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* top_diff = top[0]->cpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		if (this->phase_ == TRAIN) {
-			const unsigned int* mask = rand_vec_.cpu_data();
-			const int count = bottom[0]->count();
-			for (int i = 0; i < count; ++i) {
-				bottom_diff[i] = top_diff[i] * mask[i] * scale_;
-			}
-		} else {
-			caffe_copy(top[0]->count(), top_diff, bottom_diff);
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    if (this->phase_ == TRAIN) {
+      const unsigned int* mask = rand_vec_.cpu_data();
+      const int count = bottom[0]->count();
+      for (int i = 0; i < count; ++i) {
+        bottom_diff[i] = top_diff[i] * mask[i] * scale_;
+      }
+    } else {
+      caffe_copy(top[0]->count(), top_diff, bottom_diff);
+    }
+  }
 }
 
 #define CHECK_GLOBAL_INT_MEM_DATA(global_mem, count, num, marker)\
@@ -97,49 +95,48 @@ do{ \
 // begin: code is written/modified by AMD
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	if (this->phase_ == TRAIN) {
-		//unsigned int* mask =
-		//  static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  if (this->phase_ == TRAIN) {
+    //unsigned int* mask =
+    //  static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
 #ifdef use_cpu_generator_dropout 
-		unsigned int* mask_cpu =
-		static_cast<unsigned int*>(rand_vec_.mutable_cpu_data());
-		caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu);
-		OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
-		DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
+    unsigned int* mask_cpu =
+    static_cast<unsigned int*>(rand_vec_.mutable_cpu_data());
+    caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu);
+    OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
+    DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
 #else
-		caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1.,
-				threshold_);
-		DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_,
-				top_data);
+    caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1.,
+        threshold_);
+    DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_,
+        top_data);
 #endif
-	} else {
-             if(bottom_data != top_data)
-		caffe_gpu_copy(count, bottom_data, top_data);
-	}
-CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask");
+  } else {
+    if (bottom_data != top_data)
+      caffe_gpu_copy(count, bottom_data, top_data);
+  }
+  CHECK_GLOBAL_INT_MEM_DATA((int* )MaskMem, bottom[0]->count(), 20, "Mask");
 }
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* top_diff = top[0]->gpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		if (this->phase_ == TRAIN) {
-			const int count = bottom[0]->count();
-			DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_,
-					(Dtype) scale_, bottom_diff);
-		} else {
-			caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
-		}
-               CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask");
-               CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff");
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    if (this->phase_ == TRAIN) {
+      const int count = bottom[0]->count();
+      DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_,
+          (Dtype) scale_, bottom_diff);
+    } else {
+      caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
+    }
+    CHECK_GLOBAL_INT_MEM_DATA((int* )MaskMem, bottom[0]->count(), 20, "Mask");
+    CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff");
+  }
 }
 // end: code is written/modified by AMD
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp
index a5225ea6..f13f3be1 100644
--- a/src/caffe/layers/dummy_data_layer.cpp
+++ b/src/caffe/layers/dummy_data_layer.cpp
@@ -8,105 +8,106 @@ namespace caffe {
 
 template <typename Dtype>
 void DummyDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int num_top = top.size();
-	const DummyDataParameter& param = this->layer_param_.dummy_data_param();
-	const int num_data_filler = param.data_filler_size();
-	CHECK(num_data_filler == 0 || num_data_filler == 1 ||
-			num_data_filler == num_top)
-			<< "Number of data fillers must be 0, 1 or equal to the number of tops: "
-			<< num_top << "; you specified " << num_data_filler << " data fillers.";
+    const vector<Blob<Dtype>*>& top) {
+  const int num_top = top.size();
+  const DummyDataParameter& param = this->layer_param_.dummy_data_param();
+  const int num_data_filler = param.data_filler_size();
+  CHECK(
+      num_data_filler == 0 || num_data_filler == 1
+          || num_data_filler == num_top)
+      << "Number of data fillers must be 0, 1 or equal to the number of tops: "
+      << num_top << "; you specified " << num_data_filler << " data fillers.";
 
-	const bool legacy_dims = param.num_size() || param.channels_size() ||
-			param.height_size() || param.width_size();
-	if (legacy_dims) {
-		CHECK_EQ(0, param.shape_size())
-				<< "Both shape and legacy fields were specified";
-		// Using deprecated 4D output dim specifiers.
-		CHECK(param.num_size() == 1 || param.num_size() == num_top)
-				<< "Must specify 'num' once, or once per top blob "
-				<< "(" << num_top << "); specified " << param.num_size() << ".";
-		CHECK(param.channels_size() == 1 || param.channels_size() == num_top)
-				<< "Must specify 'channels' once, or once per top blob "
-				<< "(" << num_top << "); specified " << param.channels_size() << ".";
-		CHECK(param.height_size() == 1 || param.height_size() == num_top)
-				<< "Must specify 'height' once, or once per top blob "
-				<< "(" << num_top << "); specified " << param.height_size() << ".";
-		CHECK(param.width_size() == 1 || param.width_size() == num_top)
-				<< "Must specify 'width' once, or once per top blob "
-				<< "(" << num_top << "); specified " << param.width_size() << ".";
-	} else {
-		CHECK(param.shape_size() == 1 || param.shape_size() == num_top)
-				<< "Must specify 'shape' once, or once per top blob "
-				<< "(" << num_top << "); specified " << param.shape_size() << ".";
-	}
-	// refill_[i] tells Forward i whether or not to actually refill top Blob i.
-	// If refill_[i] is false, Forward does nothing for Blob i. We use this to
-	// avoid wastefully refilling "constant" Blobs in every forward pass.
-	// We first fill refill_ in with the INVERSE of its final values.
-	// The first time we run Forward from the LayerSetUp method, we'll fill only
-	// Blobs for which refill_ is normally false.  These Blobs will never be
-	// filled again.
-	refill_.clear();
-	fillers_.clear();
-	if (num_data_filler <= 1) {
-		FillerParameter filler_param;
-		if (num_data_filler == 0) {
-			filler_param.set_type("constant");
-			filler_param.set_value(0);
-		} else {
-			filler_param.CopyFrom(param.data_filler(0));
-		}
-		// Refill on each iteration iff not using a constant filler,
-		// but use the inverse of this rule for the first run.
-		refill_.resize(1);
-		refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0);
-		fillers_.resize(1);
-		fillers_[0].reset(GetFiller < Dtype > (filler_param));
-	} else {
-		refill_.resize(num_top);
-		fillers_.resize(num_top);
-		for (int i = 0; i < num_top; ++i) {
-			fillers_[i].reset(GetFiller < Dtype > (param.data_filler(i)));
-			// Refill on each iteration iff not using a constant filler,
-			// but use the inverse of this rule for the first run.
-			refill_[i] =
-					(strcmp(param.data_filler(i).type().c_str(), "constant") == 0);
-		}
-	}
-	for (int i = 0; i < num_top; ++i) {
-		if (legacy_dims) {
-			const int num = (param.num_size() == 1) ? param.num(0) : param.num(i);
-			const int channels =
-					(param.channels_size() == 1) ? param.channels(0) : param.channels(i);
-			const int height =
-					(param.height_size() == 1) ? param.height(0) : param.height(i);
-			const int width =
-					(param.width_size() == 1) ? param.width(0) : param.width(i);
-			top[i]->Reshape(num, channels, height, width);
-		} else {
-			const int shape_index = (param.shape_size() == 1) ? 0 : i;
-			top[i]->Reshape(param.shape(shape_index));
-		}
-	}
-	// Run Forward once, with refill_ inverted, to fill the constant Blobs.
-	this->Forward(bottom, top);
-	// Invert the inverted refill_ values to refill the desired (non-constant)
-	// Blobs in every usual forward pass.
-	for (int i = 0; i < refill_.size(); ++i) {
-		refill_[i] = !refill_[i];
-	}
+  const bool legacy_dims = param.num_size() || param.channels_size()
+      || param.height_size() || param.width_size();
+  if (legacy_dims) {
+    CHECK_EQ(0, param.shape_size())
+        << "Both shape and legacy fields were specified";
+    // Using deprecated 4D output dim specifiers.
+    CHECK(param.num_size() == 1 || param.num_size() == num_top)
+        << "Must specify 'num' once, or once per top blob " << "(" << num_top
+        << "); specified " << param.num_size() << ".";
+    CHECK(param.channels_size() == 1 || param.channels_size() == num_top)
+        << "Must specify 'channels' once, or once per top blob " << "("
+        << num_top << "); specified " << param.channels_size() << ".";
+    CHECK(param.height_size() == 1 || param.height_size() == num_top)
+        << "Must specify 'height' once, or once per top blob " << "(" << num_top
+        << "); specified " << param.height_size() << ".";
+    CHECK(param.width_size() == 1 || param.width_size() == num_top)
+        << "Must specify 'width' once, or once per top blob " << "(" << num_top
+        << "); specified " << param.width_size() << ".";
+  } else {
+    CHECK(param.shape_size() == 1 || param.shape_size() == num_top)
+        << "Must specify 'shape' once, or once per top blob " << "(" << num_top
+        << "); specified " << param.shape_size() << ".";
+  }
+  // refill_[i] tells Forward i whether or not to actually refill top Blob i.
+  // If refill_[i] is false, Forward does nothing for Blob i. We use this to
+  // avoid wastefully refilling "constant" Blobs in every forward pass.
+  // We first fill refill_ in with the INVERSE of its final values.
+  // The first time we run Forward from the LayerSetUp method, we'll fill only
+  // Blobs for which refill_ is normally false.  These Blobs will never be
+  // filled again.
+  refill_.clear();
+  fillers_.clear();
+  if (num_data_filler <= 1) {
+    FillerParameter filler_param;
+    if (num_data_filler == 0) {
+      filler_param.set_type("constant");
+      filler_param.set_value(0);
+    } else {
+      filler_param.CopyFrom(param.data_filler(0));
+    }
+    // Refill on each iteration iff not using a constant filler,
+    // but use the inverse of this rule for the first run.
+    refill_.resize(1);
+    refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0);
+    fillers_.resize(1);
+    fillers_[0].reset(GetFiller < Dtype > (filler_param));
+  } else {
+    refill_.resize(num_top);
+    fillers_.resize(num_top);
+    for (int i = 0; i < num_top; ++i) {
+      fillers_[i].reset(GetFiller < Dtype > (param.data_filler(i)));
+      // Refill on each iteration iff not using a constant filler,
+      // but use the inverse of this rule for the first run.
+      refill_[i] =
+          (strcmp(param.data_filler(i).type().c_str(), "constant") == 0);
+    }
+  }
+  for (int i = 0; i < num_top; ++i) {
+    if (legacy_dims) {
+      const int num = (param.num_size() == 1) ? param.num(0) : param.num(i);
+      const int channels =
+          (param.channels_size() == 1) ? param.channels(0) : param.channels(i);
+      const int height =
+          (param.height_size() == 1) ? param.height(0) : param.height(i);
+      const int width =
+          (param.width_size() == 1) ? param.width(0) : param.width(i);
+      top[i]->Reshape(num, channels, height, width);
+    } else {
+      const int shape_index = (param.shape_size() == 1) ? 0 : i;
+      top[i]->Reshape(param.shape(shape_index));
+    }
+  }
+  // Run Forward once, with refill_ inverted, to fill the constant Blobs.
+  this->Forward(bottom, top);
+  // Invert the inverted refill_ values to refill the desired (non-constant)
+  // Blobs in every usual forward pass.
+  for (int i = 0; i < refill_.size(); ++i) {
+    refill_[i] = !refill_[i];
+  }
 }
 
 template <typename Dtype>
 void DummyDataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	for (int i = 0; i < top.size(); ++i) {
-		const int filler_id = (fillers_.size() > 1) ? i : 0;
-		if (refill_[filler_id]) {
-			fillers_[filler_id]->Fill(top[i]);
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  for (int i = 0; i < top.size(); ++i) {
+    const int filler_id = (fillers_.size() > 1) ? i : 0;
+    if (refill_[filler_id]) {
+      fillers_[filler_id]->Fill(top[i]);
+    }
+  }
 }
 
 INSTANTIATE_CLASS (DummyDataLayer);
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index b904ad39..e2e5e1ab 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -9,236 +9,236 @@ namespace caffe {
 
 template <typename Dtype>
 void EltwiseLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK(this->layer_param().eltwise_param().coeff_size() == 0
-			|| this->layer_param().eltwise_param().coeff_size() == bottom.size()) <<
-			"Eltwise Layer takes one coefficient per bottom blob.";
-	CHECK(!(this->layer_param().eltwise_param().operation()
-			== EltwiseParameter_EltwiseOp_PROD
-			&& this->layer_param().eltwise_param().coeff_size())) <<
-			"Eltwise layer only takes coefficients for summation.";
-	op_ = this->layer_param_.eltwise_param().operation();
-	// Blob-wise coefficients for the elementwise operation.
-	coeffs_ = vector < Dtype > (bottom.size(), 1);
-	if (this->layer_param().eltwise_param().coeff_size()) {
-		for (int i = 0; i < bottom.size(); ++i) {
-			coeffs_[i] = this->layer_param().eltwise_param().coeff(i);
-		}
-	}
-	stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad();
+    const vector<Blob<Dtype>*>& top) {
+  CHECK(
+      this->layer_param().eltwise_param().coeff_size() == 0
+          || this->layer_param().eltwise_param().coeff_size() == bottom.size())
+      << "Eltwise Layer takes one coefficient per bottom blob.";
+  CHECK(
+      !(this->layer_param().eltwise_param().operation()
+          == EltwiseParameter_EltwiseOp_PROD
+          && this->layer_param().eltwise_param().coeff_size()))
+      << "Eltwise layer only takes coefficients for summation.";
+  op_ = this->layer_param_.eltwise_param().operation();
+  // Blob-wise coefficients for the elementwise operation.
+  coeffs_ = vector < Dtype > (bottom.size(), 1);
+  if (this->layer_param().eltwise_param().coeff_size()) {
+    for (int i = 0; i < bottom.size(); ++i) {
+      coeffs_[i] = this->layer_param().eltwise_param().coeff(i);
+    }
+  }
+  stable_prod_grad_ = this->layer_param_.eltwise_param().stable_prod_grad();
 }
 
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	for (int i = 1; i < bottom.size(); ++i) {
-		CHECK(bottom[i]->shape() == bottom[0]->shape());
-	}
-	top[0]->ReshapeLike(*bottom[0]);
-	// If max operation, we will initialize the vector index part.
-	if (this->layer_param_.eltwise_param().operation() ==
-			EltwiseParameter_EltwiseOp_MAX && top.size() == 1) {
-		max_idx_.Reshape(bottom[0]->shape());
-	}
+    const vector<Blob<Dtype>*>& top) {
+  for (int i = 1; i < bottom.size(); ++i) {
+    CHECK(bottom[i]->shape() == bottom[0]->shape());
+  }
+  top[0]->ReshapeLike(*bottom[0]);
+  // If max operation, we will initialize the vector index part.
+  if (this->layer_param_.eltwise_param().operation()
+      == EltwiseParameter_EltwiseOp_MAX && top.size() == 1) {
+    max_idx_.Reshape(bottom[0]->shape());
+  }
 }
 
 template <typename Dtype>
-void EltwiseLayer<Dtype>::Forward_cpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	int* mask = NULL;
-	const Dtype* bottom_data_a = NULL;
-	const Dtype* bottom_data_b = NULL;
-	const int count = top[0]->count();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	switch (op_) {
-		case EltwiseParameter_EltwiseOp_PROD:
-			caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data);
-			for (int i = 2; i < bottom.size(); ++i) {
-				caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data);
-			}
-			break;
-		case EltwiseParameter_EltwiseOp_SUM:
-			caffe_set(count, Dtype(0), top_data);
-			// TODO(shelhamer) does BLAS optimize to sum for coeff = 1?
-			for (int i = 0; i < bottom.size(); ++i) {
-				caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data);
-			}
-			break;
-		case EltwiseParameter_EltwiseOp_MAX:
-			// Initialize
-			mask = max_idx_.mutable_cpu_data();
-			caffe_set(count, -1, mask);
-			caffe_set(count, Dtype(-FLT_MAX), top_data);
-			// bottom 0 & 1
-			bottom_data_a = bottom[0]->cpu_data();
-			bottom_data_b = bottom[1]->cpu_data();
-			for (int idx = 0; idx < count; ++idx) {
-				if (bottom_data_a[idx] > bottom_data_b[idx]) {
-					top_data[idx] = bottom_data_a[idx];  // maxval
-					mask[idx] = 0;  // maxid
-				} else {
-					top_data[idx] = bottom_data_b[idx];  // maxval
-					mask[idx] = 1;  // maxid
-				}
-			}
-			// bottom 2++
-			for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) {
-				bottom_data_b = bottom[blob_idx]->cpu_data();
-				for (int idx = 0; idx < count; ++idx) {
-					if (bottom_data_b[idx] > top_data[idx]) {
-						top_data[idx] = bottom_data_b[idx];  // maxval
-						mask[idx] = blob_idx;  // maxid
-					}
-				}
-			}
-			break;
-		default:
-			LOG(FATAL) << "Unknown elementwise operation.";
-	}
+void EltwiseLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  int* mask = NULL;
+  const Dtype* bottom_data_a = NULL;
+  const Dtype* bottom_data_b = NULL;
+  const int count = top[0]->count();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  switch (op_) {
+  case EltwiseParameter_EltwiseOp_PROD:
+    caffe_mul(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), top_data);
+    for (int i = 2; i < bottom.size(); ++i) {
+      caffe_mul(count, top_data, bottom[i]->cpu_data(), top_data);
+    }
+    break;
+  case EltwiseParameter_EltwiseOp_SUM:
+    caffe_set(count, Dtype(0), top_data);
+    // TODO(shelhamer) does BLAS optimize to sum for coeff = 1?
+    for (int i = 0; i < bottom.size(); ++i) {
+      caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data);
+    }
+    break;
+  case EltwiseParameter_EltwiseOp_MAX:
+    // Initialize
+    mask = max_idx_.mutable_cpu_data();
+    caffe_set(count, -1, mask);
+    caffe_set(count, Dtype(-FLT_MAX), top_data);
+    // bottom 0 & 1
+    bottom_data_a = bottom[0]->cpu_data();
+    bottom_data_b = bottom[1]->cpu_data();
+    for (int idx = 0; idx < count; ++idx) {
+      if (bottom_data_a[idx] > bottom_data_b[idx]) {
+        top_data[idx] = bottom_data_a[idx];  // maxval
+        mask[idx] = 0;  // maxid
+      } else {
+        top_data[idx] = bottom_data_b[idx];  // maxval
+        mask[idx] = 1;  // maxid
+      }
+    }
+    // bottom 2++
+    for (int blob_idx = 2; blob_idx < bottom.size(); ++blob_idx) {
+      bottom_data_b = bottom[blob_idx]->cpu_data();
+      for (int idx = 0; idx < count; ++idx) {
+        if (bottom_data_b[idx] > top_data[idx]) {
+          top_data[idx] = bottom_data_b[idx];  // maxval
+          mask[idx] = blob_idx;  // maxid
+        }
+      }
+    }
+    break;
+  default:
+    LOG(FATAL) << "Unknown elementwise operation.";
+  }
 }
 
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const int* mask = NULL;
-	const int count = top[0]->count();
-	const Dtype* top_data = top[0]->cpu_data();
-	const Dtype* top_diff = top[0]->cpu_diff();
-	for (int i = 0; i < bottom.size(); ++i) {
-		if (propagate_down[i]) {
-			const Dtype* bottom_data = bottom[i]->cpu_data();
-			Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
-			switch (op_) {
-				case EltwiseParameter_EltwiseOp_PROD:
-					if (stable_prod_grad_) {
-						bool initialized = false;
-						for (int j = 0; j < bottom.size(); ++j) {
-							if (i == j) {
-								continue;
-							}
-							if (!initialized) {
-								caffe_copy(count, bottom[j]->cpu_data(), bottom_diff);
-								initialized = true;
-							} else {
-								caffe_mul(count, bottom[j]->cpu_data(), bottom_diff,
-										bottom_diff);
-							}
-						}
-					} else {
-						caffe_div(count, top_data, bottom_data, bottom_diff);
-					}
-					caffe_mul(count, bottom_diff, top_diff, bottom_diff);
-					break;
-				case EltwiseParameter_EltwiseOp_SUM:
-					if (coeffs_[i] == Dtype(1)) {
-						caffe_copy(count, top_diff, bottom_diff);
-					} else {
-						caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff);
-					}
-					break;
-				case EltwiseParameter_EltwiseOp_MAX:
-					mask = max_idx_.cpu_data();
-					for (int index = 0; index < count; ++index) {
-						Dtype gradient = 0;
-						if (mask[index] == i) {
-							gradient += top_diff[index];
-						}
-						bottom_diff[index] = gradient;
-					}
-					break;
-				default:
-					LOG(FATAL) << "Unknown elementwise operation.";
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const int* mask = NULL;
+  const int count = top[0]->count();
+  const Dtype* top_data = top[0]->cpu_data();
+  const Dtype* top_diff = top[0]->cpu_diff();
+  for (int i = 0; i < bottom.size(); ++i) {
+    if (propagate_down[i]) {
+      const Dtype* bottom_data = bottom[i]->cpu_data();
+      Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+      switch (op_) {
+      case EltwiseParameter_EltwiseOp_PROD:
+        if (stable_prod_grad_) {
+          bool initialized = false;
+          for (int j = 0; j < bottom.size(); ++j) {
+            if (i == j) {
+              continue;
+            }
+            if (!initialized) {
+              caffe_copy(count, bottom[j]->cpu_data(), bottom_diff);
+              initialized = true;
+            } else {
+              caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, bottom_diff);
+            }
+          }
+        } else {
+          caffe_div(count, top_data, bottom_data, bottom_diff);
+        }
+        caffe_mul(count, bottom_diff, top_diff, bottom_diff);
+        break;
+      case EltwiseParameter_EltwiseOp_SUM:
+        if (coeffs_[i] == Dtype(1)) {
+          caffe_copy(count, top_diff, bottom_diff);
+        } else {
+          caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff);
+        }
+        break;
+      case EltwiseParameter_EltwiseOp_MAX:
+        mask = max_idx_.cpu_data();
+        for (int index = 0; index < count; ++index) {
+          Dtype gradient = 0;
+          if (mask[index] == i) {
+            gradient += top_diff[index];
+          }
+          bottom_diff[index] = gradient;
+        }
+        break;
+      default:
+        LOG(FATAL) << "Unknown elementwise operation.";
+      }
+    }
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	int* mask = NULL;
-	const int count = top[0]->count();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	switch (op_) {
-		case EltwiseParameter_EltwiseOp_PROD:
-			caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
-					top_data);
-			for (int i = 2; i < bottom.size(); ++i) {
-				caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data);
-			}
-			break;
-		case EltwiseParameter_EltwiseOp_SUM:
-			caffe_gpu_set(count, Dtype(0.), top_data);
-			// TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
-			for (int i = 0; i < bottom.size(); ++i) {
-				caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data);
-			}
-			break;
-		case EltwiseParameter_EltwiseOp_MAX:
-			mask = max_idx_.mutable_gpu_data();
-			// NOLINT_NEXT_LINE(whitespace/operators)
-			MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0,
-					top_data, mask);
-			for (int i = 2; i < bottom.size(); ++i) {
-				// NOLINT_NEXT_LINE(whitespace/operators)
-				MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data,
-						mask);
-			}
-			break;
-		default:
-			LOG(FATAL) << "Unknown elementwise operation.";
-	}
+    const vector<Blob<Dtype>*>& top) {
+  int* mask = NULL;
+  const int count = top[0]->count();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  switch (op_) {
+  case EltwiseParameter_EltwiseOp_PROD:
+    caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
+        top_data);
+    for (int i = 2; i < bottom.size(); ++i) {
+      caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data);
+    }
+    break;
+  case EltwiseParameter_EltwiseOp_SUM:
+    caffe_gpu_set(count, Dtype(0.), top_data);
+    // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
+    for (int i = 0; i < bottom.size(); ++i) {
+      caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data);
+    }
+    break;
+  case EltwiseParameter_EltwiseOp_MAX:
+    mask = max_idx_.mutable_gpu_data();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data,
+        mask);
+    for (int i = 2; i < bottom.size(); ++i) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask);
+    }
+    break;
+  default:
+    LOG(FATAL) << "Unknown elementwise operation.";
+  }
 }
 
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const int* mask = NULL;
-	const int count = top[0]->count();
-	const Dtype* top_data = top[0]->gpu_data();
-	const Dtype* top_diff = top[0]->gpu_diff();
-	for (int i = 0; i < bottom.size(); ++i) {
-		if (propagate_down[i]) {
-			const Dtype* bottom_data = bottom[i]->gpu_data();
-			Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
-			switch (op_) {
-				case EltwiseParameter_EltwiseOp_PROD:
-					if (stable_prod_grad_) {
-						bool initialized = false;
-						for (int j = 0; j < bottom.size(); ++j) {
-							if (i == j) {
-								continue;
-							}
-							if (!initialized) {
-								caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff);
-								initialized = true;
-							} else {
-								caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
-										bottom_diff);
-							}
-						}
-					} else {
-						caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
-					}
-					caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
-					break;
-				case EltwiseParameter_EltwiseOp_SUM:
-					if (coeffs_[i] == Dtype(1.)) {
-						caffe_gpu_copy(count, top_diff, bottom_diff);
-					} else {
-						caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
-					}
-					break;
-				case EltwiseParameter_EltwiseOp_MAX:
-					mask = max_idx_.gpu_data();
-					MaxBackward(count, top_diff, i, mask, bottom_diff);
-					break;
-				default:
-					LOG(FATAL) << "Unknown elementwise operation.";
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const int* mask = NULL;
+  const int count = top[0]->count();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  for (int i = 0; i < bottom.size(); ++i) {
+    if (propagate_down[i]) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      switch (op_) {
+      case EltwiseParameter_EltwiseOp_PROD:
+        if (stable_prod_grad_) {
+          bool initialized = false;
+          for (int j = 0; j < bottom.size(); ++j) {
+            if (i == j) {
+              continue;
+            }
+            if (!initialized) {
+              caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff);
+              initialized = true;
+            } else {
+              caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff,
+                  bottom_diff);
+            }
+          }
+        } else {
+          caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
+        }
+        caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
+        break;
+      case EltwiseParameter_EltwiseOp_SUM:
+        if (coeffs_[i] == Dtype(1.)) {
+          caffe_gpu_copy(count, top_diff, bottom_diff);
+        } else {
+          caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
+        }
+        break;
+      case EltwiseParameter_EltwiseOp_MAX:
+        mask = max_idx_.gpu_data();
+        MaxBackward(count, top_diff, i, mask, bottom_diff);
+        break;
+      default:
+        LOG(FATAL) << "Unknown elementwise operation.";
+      }
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index 9107f119..fce99953 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -8,76 +8,68 @@
 namespace caffe {
 
 template <typename Dtype>
-void EuclideanLossLayer<Dtype>::Reshape(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::Reshape(bottom, top);
-	CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
-			<< "Inputs must have the same dimension.";
-	diff_.ReshapeLike(*bottom[0]);
+void EuclideanLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::Reshape(bottom, top);
+  CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
+      << "Inputs must have the same dimension.";
+  diff_.ReshapeLike(*bottom[0]);
 }
 
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	int count = bottom[0]->count();
-	caffe_sub(
-			count,
-			bottom[0]->cpu_data(),
-			bottom[1]->cpu_data(),
-			diff_.mutable_cpu_data());
-	Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data());
-	Dtype loss = dot / bottom[0]->num() / Dtype(2);
-	top[0]->mutable_cpu_data()[0] = loss;
+    const vector<Blob<Dtype>*>& top) {
+  int count = bottom[0]->count();
+  caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(),
+      diff_.mutable_cpu_data());
+  Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data());
+  Dtype loss = dot / bottom[0]->num() / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
 }
 
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	for (int i = 0; i < 2; ++i) {
-		if (propagate_down[i]) {
-			const Dtype sign = (i == 0) ? 1 : -1;
-			const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
-			caffe_cpu_axpby(
-					bottom[i]->count(),              // count
-					alpha,                              // alpha
-					diff_.cpu_data(),                   // a
-					Dtype(0),                           // beta
-					bottom[i]->mutable_cpu_diff());  // b
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < 2; ++i) {
+    if (propagate_down[i]) {
+      const Dtype sign = (i == 0) ? 1 : -1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
+      caffe_cpu_axpby(bottom[i]->count(),              // count
+          alpha,                              // alpha
+          diff_.cpu_data(),                   // a
+          Dtype(0),                           // beta
+          bottom[i]->mutable_cpu_diff());  // b
+    }
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	int count = bottom[0]->count();
-	caffe_gpu_sub(
-			count,
-			bottom[0]->gpu_data(),
-			bottom[1]->gpu_data(),
-			diff_.mutable_gpu_data());
-	Dtype dot;
-	caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
-	Dtype loss = dot / bottom[0]->num() / Dtype(2);
-	top[0]->mutable_cpu_data()[0] = loss;
+    const vector<Blob<Dtype>*>& top) {
+  int count = bottom[0]->count();
+  caffe_gpu_sub(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
+      diff_.mutable_gpu_data());
+  Dtype dot;
+  caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot);
+  Dtype loss = dot / bottom[0]->num() / Dtype(2);
+  top[0]->mutable_cpu_data()[0] = loss;
 }
 
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	for (int i = 0; i < 2; ++i) {
-		if (propagate_down[i]) {
-			const Dtype sign = (i == 0) ? 1 : -1;
-			const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
-			caffe_gpu_axpby(
-					bottom[i]->count(),              // count
-					alpha,                              // alpha
-					diff_.gpu_data(),                   // a
-					Dtype(0),                           // beta
-					bottom[i]->mutable_gpu_diff());  // b
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < 2; ++i) {
+    if (propagate_down[i]) {
+      const Dtype sign = (i == 0) ? 1 : -1;
+      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
+      caffe_gpu_axpby(bottom[i]->count(),              // count
+          alpha,                              // alpha
+          diff_.gpu_data(),                   // a
+          Dtype(0),                           // beta
+          bottom[i]->mutable_gpu_diff());  // b
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index 087da677..25bcd0a0 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -9,90 +9,90 @@ namespace caffe {
 
 template <typename Dtype>
 void ExpLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
-	const Dtype base = this->layer_param_.exp_param().base();
-	if (base != Dtype(-1)) {
-		CHECK_GT(base, 0) << "base must be strictly positive.";
-	}
-	// If base == -1, interpret the base as e and set log_base = 1 exactly.
-	// Otherwise, calculate its log explicitly.
-	const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
-	CHECK(!isnan(log_base))
-			<< "NaN result: log(base) = log(" << base << ") = " << log_base;
-	CHECK(!isinf(log_base))
-			<< "Inf result: log(base) = log(" << base << ") = " << log_base;
-	const Dtype input_scale = this->layer_param_.exp_param().scale();
-	const Dtype input_shift = this->layer_param_.exp_param().shift();
-	inner_scale_ = log_base * input_scale;
-	outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift);
+    const vector<Blob<Dtype>*>& top) {
+  NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+  const Dtype base = this->layer_param_.exp_param().base();
+  if (base != Dtype(-1)) {
+    CHECK_GT(base, 0) << "base must be strictly positive.";
+  }
+  // If base == -1, interpret the base as e and set log_base = 1 exactly.
+  // Otherwise, calculate its log explicitly.
+  const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
+  CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = "
+      << log_base;
+  CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = "
+      << log_base;
+  const Dtype input_scale = this->layer_param_.exp_param().scale();
+  const Dtype input_shift = this->layer_param_.exp_param().shift();
+  inner_scale_ = log_base * input_scale;
+  outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : pow(base, input_shift);
 }
 
 template <typename Dtype>
 void ExpLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int count = bottom[0]->count();
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	if (inner_scale_ == Dtype(1)) {
-		caffe_exp(count, bottom_data, top_data);
-	} else {
-		caffe_cpu_scale(count, inner_scale_, bottom_data, top_data);
-		caffe_exp(count, top_data, top_data);
-	}
-	if (outer_scale_ != Dtype(1)) {
-		caffe_scal(count, outer_scale_, top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const int count = bottom[0]->count();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  if (inner_scale_ == Dtype(1)) {
+    caffe_exp(count, bottom_data, top_data);
+  } else {
+    caffe_cpu_scale(count, inner_scale_, bottom_data, top_data);
+    caffe_exp(count, top_data, top_data);
+  }
+  if (outer_scale_ != Dtype(1)) {
+    caffe_scal(count, outer_scale_, top_data);
+  }
 }
 
 template <typename Dtype>
 void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	const int count = bottom[0]->count();
-	const Dtype* top_data = top[0]->cpu_data();
-	const Dtype* top_diff = top[0]->cpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	caffe_mul(count, top_data, top_diff, bottom_diff);
-	if (inner_scale_ != Dtype(1)) {
-		caffe_scal(count, inner_scale_, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  const int count = bottom[0]->count();
+  const Dtype* top_data = top[0]->cpu_data();
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  caffe_mul(count, top_data, top_diff, bottom_diff);
+  if (inner_scale_ != Dtype(1)) {
+    caffe_scal(count, inner_scale_, bottom_diff);
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int count = bottom[0]->count();
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	if (inner_scale_ == Dtype(1)) {
-		caffe_gpu_exp(count, bottom_data, top_data);
-	} else {
-		caffe_gpu_scale(count, inner_scale_, bottom_data, top_data);
-		caffe_gpu_exp(count, top_data, top_data);
-	}
-	if (outer_scale_ != Dtype(1)) {
-		caffe_gpu_scal(count, outer_scale_, top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const int count = bottom[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  if (inner_scale_ == Dtype(1)) {
+    caffe_gpu_exp(count, bottom_data, top_data);
+  } else {
+    caffe_gpu_scale(count, inner_scale_, bottom_data, top_data);
+    caffe_gpu_exp(count, top_data, top_data);
+  }
+  if (outer_scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, outer_scale_, top_data);
+  }
 }
 
 template <typename Dtype>
 void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	const int count = bottom[0]->count();
-	const Dtype* top_data = top[0]->gpu_data();
-	const Dtype* top_diff = top[0]->gpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-	caffe_gpu_mul(count, top_data, top_diff, bottom_diff);
-	if (inner_scale_ != Dtype(1)) {
-		caffe_gpu_scal(count, inner_scale_, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  const int count = bottom[0]->count();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  caffe_gpu_mul(count, top_data, top_diff, bottom_diff);
+  if (inner_scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, inner_scale_, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index 05dc2783..fc3ca142 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -9,172 +9,172 @@ namespace caffe {
 
 template <typename Dtype>
 void FilterLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(top.size(), bottom.size() - 1);
-	first_reshape_ = true;
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(top.size(), bottom.size() - 1);
+  first_reshape_ = true;
 }
 
 template <typename Dtype>
 void FilterLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	// bottom[0...k-1] are the blobs to filter
-	// bottom[last] is the "selector_blob"
-	int selector_index = bottom.size() - 1;
-	for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) {
-		CHECK_EQ(bottom[selector_index]->shape(i), 1)
-				<< "Selector blob dimensions must be singletons (1), except the first";
-	}
-	for (int i = 0; i < bottom.size() - 1; ++i) {
-		CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) <<
-				"Each bottom should have the same 0th dimension as the selector blob";
-	}
+    const vector<Blob<Dtype>*>& top) {
+  // bottom[0...k-1] are the blobs to filter
+  // bottom[last] is the "selector_blob"
+  int selector_index = bottom.size() - 1;
+  for (int i = 1; i < bottom[selector_index]->num_axes(); ++i) {
+    CHECK_EQ(bottom[selector_index]->shape(i), 1)
+        << "Selector blob dimensions must be singletons (1), except the first";
+  }
+  for (int i = 0; i < bottom.size() - 1; ++i) {
+    CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0))
+        << "Each bottom should have the same 0th dimension as the selector blob";
+  }
 
-	const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data();
-	indices_to_forward_.clear();
+  const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data();
+  indices_to_forward_.clear();
 
-	// look for non-zero elements in bottom[0]. Items of each bottom that
-	// have the same index as the items in bottom[0] with value == non-zero
-	// will be forwarded
-	for (int item_id = 0; item_id < bottom[selector_index]->shape(0); ++item_id) {
-		// we don't need an offset because item size == 1
-		const Dtype* tmp_data_selector = bottom_data_selector + item_id;
-		if (*tmp_data_selector) {
-			indices_to_forward_.push_back(item_id);
-		}
-	}
-	// only filtered items will be forwarded
-	int new_tops_num = indices_to_forward_.size();
-	// init
-	if (first_reshape_) {
-		new_tops_num = bottom[0]->shape(0);
-		first_reshape_ = false;
-	}
-	for (int t = 0; t < top.size(); ++t) {
-		int num_axes = bottom[t]->num_axes();
-		vector<int> shape_top(num_axes);
-		shape_top[0] = new_tops_num;
-		for (int ts = 1; ts < num_axes; ++ts)
-			shape_top[ts] = bottom[t]->shape(ts);
-		top[t]->Reshape(shape_top);
-	}
+  // look for non-zero elements in bottom[0]. Items of each bottom that
+  // have the same index as the items in bottom[0] with value == non-zero
+  // will be forwarded
+  for (int item_id = 0; item_id < bottom[selector_index]->shape(0); ++item_id) {
+    // we don't need an offset because item size == 1
+    const Dtype* tmp_data_selector = bottom_data_selector + item_id;
+    if (*tmp_data_selector) {
+      indices_to_forward_.push_back(item_id);
+    }
+  }
+  // only filtered items will be forwarded
+  int new_tops_num = indices_to_forward_.size();
+  // init
+  if (first_reshape_) {
+    new_tops_num = bottom[0]->shape(0);
+    first_reshape_ = false;
+  }
+  for (int t = 0; t < top.size(); ++t) {
+    int num_axes = bottom[t]->num_axes();
+    vector<int> shape_top(num_axes);
+    shape_top[0] = new_tops_num;
+    for (int ts = 1; ts < num_axes; ++ts)
+      shape_top[ts] = bottom[t]->shape(ts);
+    top[t]->Reshape(shape_top);
+  }
 }
 
 template <typename Dtype>
 void FilterLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	int new_tops_num = indices_to_forward_.size();
-	// forward all filtered items for all bottoms but the Selector (bottom[last])
-	for (int t = 0; t < top.size(); ++t) {
-		const Dtype* bottom_data = bottom[t]->cpu_data();
-		Dtype* top_data = top[t]->mutable_cpu_data();
-		int dim = bottom[t]->count() / bottom[t]->shape(0);
-		for (int n = 0; n < new_tops_num; ++n) {
-			int data_offset_top = n * dim;
-			int data_offset_bottom = indices_to_forward_[n] * bottom[t]->count(1);
-			caffe_copy(dim, bottom_data + data_offset_bottom,
-					top_data + data_offset_top);
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  int new_tops_num = indices_to_forward_.size();
+  // forward all filtered items for all bottoms but the Selector (bottom[last])
+  for (int t = 0; t < top.size(); ++t) {
+    const Dtype* bottom_data = bottom[t]->cpu_data();
+    Dtype* top_data = top[t]->mutable_cpu_data();
+    int dim = bottom[t]->count() / bottom[t]->shape(0);
+    for (int n = 0; n < new_tops_num; ++n) {
+      int data_offset_top = n * dim;
+      int data_offset_bottom = indices_to_forward_[n] * bottom[t]->count(1);
+      caffe_copy(dim, bottom_data + data_offset_bottom,
+          top_data + data_offset_top);
+    }
+  }
 }
 
 template <typename Dtype>
 void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[bottom.size() - 1]) {
-		LOG(FATAL) << this->type()
-				<< "Layer cannot backpropagate to filter index inputs";
-	}
-	for (int i = 0; i < top.size(); i++) {
-		// bottom[last] is the selector and never needs backpropagation
-		// so we can iterate over top vector because top.size() == bottom.size() -1
-		if (propagate_down[i]) {
-			const int dim = top[i]->count() / top[i]->shape(0);
-			int next_to_backward_offset = 0;
-			int batch_offset = 0;
-			int data_offset_bottom = 0;
-			int data_offset_top = 0;
-			for (int n = 0; n < bottom[i]->shape(0); n++) {
-				data_offset_bottom = n * dim;
-				if (next_to_backward_offset >= indices_to_forward_.size()) {
-					// we already visited all items that were been forwarded, so
-					// just set to zero remaining ones
-					caffe_set(dim, Dtype(0),
-							bottom[i]->mutable_cpu_diff() + data_offset_bottom);
-				} else {
-					batch_offset = indices_to_forward_[next_to_backward_offset];
-					if (n != batch_offset) {  // this data was not been forwarded
-						caffe_set(dim, Dtype(0),
-								bottom[i]->mutable_cpu_diff() + data_offset_bottom);
-					} else {  // this data was been forwarded
-						data_offset_top = next_to_backward_offset * dim;
-						next_to_backward_offset++;  // point to next forwarded item index
-						caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top,
-								bottom[i]->mutable_cpu_diff() + data_offset_bottom);
-					}
-				}
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[bottom.size() - 1]) {
+    LOG(FATAL) << this->type()
+        << "Layer cannot backpropagate to filter index inputs";
+  }
+  for (int i = 0; i < top.size(); i++) {
+    // bottom[last] is the selector and never needs backpropagation
+    // so we can iterate over top vector because top.size() == bottom.size() -1
+    if (propagate_down[i]) {
+      const int dim = top[i]->count() / top[i]->shape(0);
+      int next_to_backward_offset = 0;
+      int batch_offset = 0;
+      int data_offset_bottom = 0;
+      int data_offset_top = 0;
+      for (int n = 0; n < bottom[i]->shape(0); n++) {
+        data_offset_bottom = n * dim;
+        if (next_to_backward_offset >= indices_to_forward_.size()) {
+          // we already visited all items that were been forwarded, so
+          // just set to zero remaining ones
+          caffe_set(dim, Dtype(0),
+              bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+        } else {
+          batch_offset = indices_to_forward_[next_to_backward_offset];
+          if (n != batch_offset) {  // this data was not been forwarded
+            caffe_set(dim, Dtype(0),
+                bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+          } else {  // this data was been forwarded
+            data_offset_top = next_to_backward_offset * dim;
+            next_to_backward_offset++;  // point to next forwarded item index
+            caffe_copy(dim, top[i]->mutable_cpu_diff() + data_offset_top,
+                bottom[i]->mutable_cpu_diff() + data_offset_bottom);
+          }
+        }
+      }
+    }
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	int new_tops_num = indices_to_forward_.size();
-	// forward all filtered items for all bottoms but the Selector (bottom[last])
-	for (int t = 0; t < top.size(); ++t) {
-		const Dtype* bottom_data = bottom[t]->gpu_data();
-		Dtype* top_data = top[t]->mutable_gpu_data();
-		int dim = bottom[t]->count() / bottom[t]->shape(0);
-		for (int n = 0; n < new_tops_num; ++n) {
-			int data_offset_top = n * dim;
-			int data_offset_bottom = indices_to_forward_[n] * dim;
-			caffe_copy(dim, bottom_data + data_offset_bottom,
-					top_data + data_offset_top);
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  int new_tops_num = indices_to_forward_.size();
+  // forward all filtered items for all bottoms but the Selector (bottom[last])
+  for (int t = 0; t < top.size(); ++t) {
+    const Dtype* bottom_data = bottom[t]->gpu_data();
+    Dtype* top_data = top[t]->mutable_gpu_data();
+    int dim = bottom[t]->count() / bottom[t]->shape(0);
+    for (int n = 0; n < new_tops_num; ++n) {
+      int data_offset_top = n * dim;
+      int data_offset_bottom = indices_to_forward_[n] * dim;
+      caffe_copy(dim, bottom_data + data_offset_bottom,
+          top_data + data_offset_top);
+    }
+  }
 }
 
 template <typename Dtype>
 void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[bottom.size() - 1]) {
-		LOG(FATAL) << this->type()
-				<< "Layer cannot backpropagate to filter index inputs";
-	}
-	for (int i = 0; i < top.size(); ++i) {
-		// bottom[last] is the selector and never needs backpropagation
-		// so we can iterate over top vector because top.size() == bottom.size() -1
-		if (propagate_down[i]) {
-			const int dim = top[i]->count() / top[i]->shape(0);
-			int next_to_backward_offset = 0;
-			int batch_offset = 0;
-			int data_offset_bottom = 0;
-			int data_offset_top = 0;
-			for (int n = 0; n < bottom[i]->shape(0); ++n) {
-				if (next_to_backward_offset >= indices_to_forward_.size()) {
-					// we already visited all items that were been forwarded, so
-					// just set to zero remaining ones
-					data_offset_bottom = n * dim;
-					caffe_gpu_set(dim, Dtype(0),
-							bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-				} else {
-					batch_offset = indices_to_forward_[next_to_backward_offset];
-					data_offset_bottom = n * dim;
-					if (n != batch_offset) {  // this data was not been forwarded
-						caffe_gpu_set(dim, Dtype(0),
-								bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-					} else {  // this data was been forwarded
-						data_offset_top = next_to_backward_offset * dim;
-						++next_to_backward_offset;  // point to next forwarded item index
-						caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top,
-								bottom[i]->mutable_gpu_diff() + data_offset_bottom);
-					}
-				}
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[bottom.size() - 1]) {
+    LOG(FATAL) << this->type()
+        << "Layer cannot backpropagate to filter index inputs";
+  }
+  for (int i = 0; i < top.size(); ++i) {
+    // bottom[last] is the selector and never needs backpropagation
+    // so we can iterate over top vector because top.size() == bottom.size() -1
+    if (propagate_down[i]) {
+      const int dim = top[i]->count() / top[i]->shape(0);
+      int next_to_backward_offset = 0;
+      int batch_offset = 0;
+      int data_offset_bottom = 0;
+      int data_offset_top = 0;
+      for (int n = 0; n < bottom[i]->shape(0); ++n) {
+        if (next_to_backward_offset >= indices_to_forward_.size()) {
+          // we already visited all items that were been forwarded, so
+          // just set to zero remaining ones
+          data_offset_bottom = n * dim;
+          caffe_gpu_set(dim, Dtype(0),
+              bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+        } else {
+          batch_offset = indices_to_forward_[next_to_backward_offset];
+          data_offset_bottom = n * dim;
+          if (n != batch_offset) {  // this data was not been forwarded
+            caffe_gpu_set(dim, Dtype(0),
+                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+          } else {  // this data was been forwarded
+            data_offset_top = next_to_backward_offset * dim;
+            ++next_to_backward_offset;  // point to next forwarded item index
+            caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top,
+                bottom[i]->mutable_gpu_diff() + data_offset_bottom);
+          }
+        }
+      }
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp
index e79e9406..997f213d 100644
--- a/src/caffe/layers/flatten_layer.cpp
+++ b/src/caffe/layers/flatten_layer.cpp
@@ -8,34 +8,34 @@ namespace caffe {
 
 template <typename Dtype>
 void FlattenLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int start_axis = bottom[0]->CanonicalAxisIndex(
-			this->layer_param_.flatten_param().axis());
-	const int end_axis = bottom[0]->CanonicalAxisIndex(
-			this->layer_param_.flatten_param().end_axis());
-	vector<int> top_shape;
-	for (int i = 0; i < start_axis; ++i) {
-		top_shape.push_back(bottom[0]->shape(i));
-	}
-	const int flattened_dim = bottom[0]->count(start_axis, end_axis + 1);
-	top_shape.push_back(flattened_dim);
-	for (int i = end_axis + 1; i < bottom[0]->num_axes(); ++i) {
-		top_shape.push_back(bottom[0]->shape(i));
-	}
-	top[0]->Reshape(top_shape);
-	CHECK_EQ(top[0]->count(), bottom[0]->count());
+    const vector<Blob<Dtype>*>& top) {
+  const int start_axis = bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.flatten_param().axis());
+  const int end_axis = bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.flatten_param().end_axis());
+  vector<int> top_shape;
+  for (int i = 0; i < start_axis; ++i) {
+    top_shape.push_back(bottom[0]->shape(i));
+  }
+  const int flattened_dim = bottom[0]->count(start_axis, end_axis + 1);
+  top_shape.push_back(flattened_dim);
+  for (int i = end_axis + 1; i < bottom[0]->num_axes(); ++i) {
+    top_shape.push_back(bottom[0]->shape(i));
+  }
+  top[0]->Reshape(top_shape);
+  CHECK_EQ(top[0]->count(), bottom[0]->count());
 }
 
 template <typename Dtype>
 void FlattenLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	top[0]->ShareData(*bottom[0]);
+    const vector<Blob<Dtype>*>& top) {
+  top[0]->ShareData(*bottom[0]);
 }
 
 template <typename Dtype>
 void FlattenLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	bottom[0]->ShareDiff(*top[0]);
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  bottom[0]->ShareDiff(*top[0]);
 }
 
 INSTANTIATE_CLASS (FlattenLayer);
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 6c6d8dec..2d7d405e 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -27,175 +27,174 @@ HDF5DataLayer<Dtype>::~HDF5DataLayer<Dtype>() {
 // Load data and label from HDF5 filename into the class property blobs.
 template <typename Dtype>
 void HDF5DataLayer<Dtype>::LoadHDF5FileData(const char* filename) {
-	DLOG(INFO) << "Loading HDF5 file: " << filename;
-	hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
-	if (file_id < 0) {
-		LOG(FATAL) << "Failed opening HDF5 file: " << filename;
-	}
-
-	int top_size = this->layer_param_.top_size();
-	hdf_blobs_.resize(top_size);
-
-	const int MIN_DATA_DIM = 1;
-	const int MAX_DATA_DIM = INT_MAX;
-
-	for (int i = 0; i < top_size; ++i) {
-		hdf_blobs_[i] = shared_ptr < Blob<Dtype> > (new Blob<Dtype>());
-		hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(),
-				MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get());
-	}
-
-	herr_t status = H5Fclose(file_id);
-	CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename;
-
-	// MinTopBlobs==1 guarantees at least one top blob
-	CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis.";
-	const int num = hdf_blobs_[0]->shape(0);
-	for (int i = 1; i < top_size; ++i) {
-		CHECK_EQ(hdf_blobs_[i]->shape(0), num);
-	}
-	// Default to identity permutation.
-	data_permutation_.clear();
-	data_permutation_.resize(hdf_blobs_[0]->shape(0));
-	for (int i = 0; i < hdf_blobs_[0]->shape(0); i++)
-		data_permutation_[i] = i;
-
-	// Shuffle if needed.
-	if (this->layer_param_.hdf5_data_param().shuffle()) {
-		std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-		DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0)
-				<< " rows (shuffled)";
-	} else {
-		DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows";
-	}
+  DLOG(INFO) << "Loading HDF5 file: " << filename;
+  hid_t file_id = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
+  if (file_id < 0) {
+    LOG(FATAL) << "Failed opening HDF5 file: " << filename;
+  }
+
+  int top_size = this->layer_param_.top_size();
+  hdf_blobs_.resize(top_size);
+
+  const int MIN_DATA_DIM = 1;
+  const int MAX_DATA_DIM = INT_MAX;
+
+  for (int i = 0; i < top_size; ++i) {
+    hdf_blobs_[i] = shared_ptr < Blob<Dtype> > (new Blob<Dtype>());
+    hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(),
+        MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get());
+  }
+
+  herr_t status = H5Fclose(file_id);
+  CHECK_GE(status, 0) << "Failed to close HDF5 file: " << filename;
+
+  // MinTopBlobs==1 guarantees at least one top blob
+  CHECK_GE(hdf_blobs_[0]->num_axes(), 1) << "Input must have at least 1 axis.";
+  const int num = hdf_blobs_[0]->shape(0);
+  for (int i = 1; i < top_size; ++i) {
+    CHECK_EQ(hdf_blobs_[i]->shape(0), num);
+  }
+  // Default to identity permutation.
+  data_permutation_.clear();
+  data_permutation_.resize(hdf_blobs_[0]->shape(0));
+  for (int i = 0; i < hdf_blobs_[0]->shape(0); i++)
+    data_permutation_[i] = i;
+
+  // Shuffle if needed.
+  if (this->layer_param_.hdf5_data_param().shuffle()) {
+    std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+    DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0)
+        << " rows (shuffled)";
+  } else {
+    DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows";
+  }
 }
 
 template <typename Dtype>
 void HDF5DataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	// Refuse transformation parameters since HDF5 is totally generic.
-	CHECK(!this->layer_param_.has_transform_param()) <<
-			this->type() << " does not transform data.";
-	// Read the source to parse the filenames.
-	const string& source = this->layer_param_.hdf5_data_param().source();
-	LOG(INFO) << "Loading list of HDF5 filenames from: " << source;
-	hdf_filenames_.clear();
-	std::ifstream source_file(source.c_str());
-	if (source_file.is_open()) {
-		std::string line;
-		while (source_file >> line) {
-			hdf_filenames_.push_back(line);
-		}
-	} else {
-		LOG(FATAL) << "Failed to open source file: " << source;
-	}
-	source_file.close();
-	num_files_ = hdf_filenames_.size();
-	current_file_ = 0;
-	LOG(INFO) << "Number of HDF5 files: " << num_files_;
-	CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in "
-			<< source;
-
-	file_permutation_.clear();
-	file_permutation_.resize(num_files_);
-	// Default to identity permutation.
-	for (int i = 0; i < num_files_; i++) {
-		file_permutation_[i] = i;
-	}
-
-	// Shuffle if needed.
-	if (this->layer_param_.hdf5_data_param().shuffle()) {
-		std::random_shuffle(file_permutation_.begin(), file_permutation_.end());
-	}
-
-	// Load the first HDF5 file and initialize the line counter.
-	LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str());
-	current_row_ = 0;
-
-	// Reshape blobs.
-	const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-	const int top_size = this->layer_param_.top_size();
-	vector<int> top_shape;
-	for (int i = 0; i < top_size; ++i) {
-		top_shape.resize(hdf_blobs_[i]->num_axes());
-		top_shape[0] = batch_size;
-		for (int j = 1; j < top_shape.size(); ++j) {
-			top_shape[j] = hdf_blobs_[i]->shape(j);
-		}
-		top[i]->Reshape(top_shape);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  // Refuse transformation parameters since HDF5 is totally generic.
+  CHECK(!this->layer_param_.has_transform_param()) << this->type()
+      << " does not transform data.";
+  // Read the source to parse the filenames.
+  const string& source = this->layer_param_.hdf5_data_param().source();
+  LOG(INFO) << "Loading list of HDF5 filenames from: " << source;
+  hdf_filenames_.clear();
+  std::ifstream source_file(source.c_str());
+  if (source_file.is_open()) {
+    std::string line;
+    while (source_file >> line) {
+      hdf_filenames_.push_back(line);
+    }
+  } else {
+    LOG(FATAL) << "Failed to open source file: " << source;
+  }
+  source_file.close();
+  num_files_ = hdf_filenames_.size();
+  current_file_ = 0;
+  LOG(INFO) << "Number of HDF5 files: " << num_files_;
+  CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in "
+      << source;
+
+  file_permutation_.clear();
+  file_permutation_.resize(num_files_);
+  // Default to identity permutation.
+  for (int i = 0; i < num_files_; i++) {
+    file_permutation_[i] = i;
+  }
+
+  // Shuffle if needed.
+  if (this->layer_param_.hdf5_data_param().shuffle()) {
+    std::random_shuffle(file_permutation_.begin(), file_permutation_.end());
+  }
+
+  // Load the first HDF5 file and initialize the line counter.
+  LoadHDF5FileData(hdf_filenames_[file_permutation_[current_file_]].c_str());
+  current_row_ = 0;
+
+  // Reshape blobs.
+  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
+  const int top_size = this->layer_param_.top_size();
+  vector<int> top_shape;
+  for (int i = 0; i < top_size; ++i) {
+    top_shape.resize(hdf_blobs_[i]->num_axes());
+    top_shape[0] = batch_size;
+    for (int j = 1; j < top_shape.size(); ++j) {
+      top_shape[j] = hdf_blobs_[i]->shape(j);
+    }
+    top[i]->Reshape(top_shape);
+  }
 }
 
 template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-	for (int i = 0; i < batch_size; ++i, ++current_row_) {
-		if (current_row_ == hdf_blobs_[0]->shape(0)) {
-			if (num_files_ > 1) {
-				++current_file_;
-				if (current_file_ == num_files_) {
-					current_file_ = 0;
-					if (this->layer_param_.hdf5_data_param().shuffle()) {
-						std::random_shuffle(file_permutation_.begin(),
-								file_permutation_.end());
-					}
-					DLOG(INFO) << "Looping around to first file.";
-				}
-				LoadHDF5FileData(
-						hdf_filenames_[file_permutation_[current_file_]].c_str());
-			}
-			current_row_ = 0;
-			if (this->layer_param_.hdf5_data_param().shuffle())
-				std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-		}
-		for (int j = 0; j < this->layer_param_.top_size(); ++j) {
-			int data_dim = top[j]->count() / top[j]->shape(0);
-			caffe_copy(data_dim,
-					&hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-							* data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
+  for (int i = 0; i < batch_size; ++i, ++current_row_) {
+    if (current_row_ == hdf_blobs_[0]->shape(0)) {
+      if (num_files_ > 1) {
+        ++current_file_;
+        if (current_file_ == num_files_) {
+          current_file_ = 0;
+          if (this->layer_param_.hdf5_data_param().shuffle()) {
+            std::random_shuffle(file_permutation_.begin(),
+                file_permutation_.end());
+          }
+          DLOG(INFO) << "Looping around to first file.";
+        }
+        LoadHDF5FileData(
+            hdf_filenames_[file_permutation_[current_file_]].c_str());
+      }
+      current_row_ = 0;
+      if (this->layer_param_.hdf5_data_param().shuffle())
+        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+    }
+    for (int j = 0; j < this->layer_param_.top_size(); ++j) {
+      int data_dim = top[j]->count() / top[j]->shape(0);
+      caffe_copy(data_dim,
+          &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim],
+          &top[j]->mutable_cpu_data()[i * data_dim]);
+    }
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-	for (int i = 0; i < batch_size; ++i, ++current_row_) {
-		if (current_row_ == hdf_blobs_[0]->shape(0)) {
-			if (num_files_ > 1) {
-				current_file_ += 1;
-				if (current_file_ == num_files_) {
-					current_file_ = 0;
-					if (this->layer_param_.hdf5_data_param().shuffle()) {
-						std::random_shuffle(file_permutation_.begin(),
-								file_permutation_.end());
-					}
-					DLOG(INFO) << "Looping around to first file.";
-				}
-				LoadHDF5FileData(
-						hdf_filenames_[file_permutation_[current_file_]].c_str());
-			}
-			current_row_ = 0;
-			if (this->layer_param_.hdf5_data_param().shuffle())
-				std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-		}
-		for (int j = 0; j < this->layer_param_.top_size(); ++j) {
-			int data_dim = top[j]->count() / top[j]->shape(0);
-			OCL_CHECK(
-					clEnqueueWriteBuffer(amdDevice.CommandQueue,
-							(cl_mem) top[j]->mutable_gpu_data(), CL_TRUE,
-							i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim,
-							&hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-									* data_dim],
-							0, NULL, NULL));
-			//caffe_copy(data_dim,
-			//    &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
-			//      * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
+  for (int i = 0; i < batch_size; ++i, ++current_row_) {
+    if (current_row_ == hdf_blobs_[0]->shape(0)) {
+      if (num_files_ > 1) {
+        current_file_ += 1;
+        if (current_file_ == num_files_) {
+          current_file_ = 0;
+          if (this->layer_param_.hdf5_data_param().shuffle()) {
+            std::random_shuffle(file_permutation_.begin(),
+                file_permutation_.end());
+          }
+          DLOG(INFO) << "Looping around to first file.";
+        }
+        LoadHDF5FileData(
+            hdf_filenames_[file_permutation_[current_file_]].c_str());
+      }
+      current_row_ = 0;
+      if (this->layer_param_.hdf5_data_param().shuffle())
+        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+    }
+    for (int j = 0; j < this->layer_param_.top_size(); ++j) {
+      int data_dim = top[j]->count() / top[j]->shape(0);
+      OCL_CHECK(
+          clEnqueueWriteBuffer(amdDevice.CommandQueue,
+              (cl_mem) top[j]->mutable_gpu_data(), CL_TRUE,
+              i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim,
+              &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
+                  * data_dim], 0, NULL, NULL));
+      //caffe_copy(data_dim,
+      //    &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
+      //      * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index a8c062bc..f9215a3d 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -13,94 +13,94 @@ namespace caffe {
 
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	file_name_ = this->layer_param_.hdf5_output_param().file_name();
-	file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT,
-			H5P_DEFAULT);
-	CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_;
-	file_opened_ = true;
+    const vector<Blob<Dtype>*>& top) {
+  file_name_ = this->layer_param_.hdf5_output_param().file_name();
+  file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT,
+      H5P_DEFAULT);
+  CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_;
+  file_opened_ = true;
 }
 
 template <typename Dtype>
 HDF5OutputLayer<Dtype>::~HDF5OutputLayer<Dtype>() {
-	if (file_opened_) {
-		herr_t status = H5Fclose(file_id_);
-		CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_;
-	}
+  if (file_opened_) {
+    herr_t status = H5Fclose(file_id_);
+    CHECK_GE(status, 0) << "Failed to close HDF5 file " << file_name_;
+  }
 }
 
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::SaveBlobs() {
-	// TODO: no limit on the number of blobs
-	LOG(INFO) << "Saving HDF5 file " << file_name_;
-	CHECK_EQ(data_blob_.num(), label_blob_.num()) <<
-			"data blob and label blob must have the same batch size";
-	hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_);
-	hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_);
-	LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows";
+  // TODO: no limit on the number of blobs
+  LOG(INFO) << "Saving HDF5 file " << file_name_;
+  CHECK_EQ(data_blob_.num(), label_blob_.num())
+      << "data blob and label blob must have the same batch size";
+  hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_);
+  hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_);
+  LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows";
 }
 
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_GE(bottom.size(), 2);
-	CHECK_EQ(bottom[0]->num(), bottom[1]->num());
-	data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-			bottom[0]->height(), bottom[0]->width());
-	label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
-			bottom[1]->height(), bottom[1]->width());
-	const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
-	const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_GE(bottom.size(), 2);
+  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
+  data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+      bottom[0]->height(), bottom[0]->width());
+  label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
+      bottom[1]->height(), bottom[1]->width());
+  const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
+  const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
 
-	for (int i = 0; i < bottom[0]->num(); ++i) {
-		caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim],
-				&data_blob_.mutable_cpu_data()[i * data_datum_dim]);
-		caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim],
-				&label_blob_.mutable_cpu_data()[i * label_datum_dim]);
-	}
-	SaveBlobs();
+  for (int i = 0; i < bottom[0]->num(); ++i) {
+    caffe_copy(data_datum_dim, &bottom[0]->cpu_data()[i * data_datum_dim],
+        &data_blob_.mutable_cpu_data()[i * data_datum_dim]);
+    caffe_copy(label_datum_dim, &bottom[1]->cpu_data()[i * label_datum_dim],
+        &label_blob_.mutable_cpu_data()[i * label_datum_dim]);
+  }
+  SaveBlobs();
 }
 
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	return;
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  return;
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_GE(bottom.size(), 2);
-	CHECK_EQ(bottom[0]->num(), bottom[1]->num());
-	data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-			bottom[0]->height(), bottom[0]->width());
-	label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
-			bottom[1]->height(), bottom[1]->width());
-	const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
-	const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_GE(bottom.size(), 2);
+  CHECK_EQ(bottom[0]->num(), bottom[1]->num());
+  data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+      bottom[0]->height(), bottom[0]->width());
+  label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(),
+      bottom[1]->height(), bottom[1]->width());
+  const int data_datum_dim = bottom[0]->count() / bottom[0]->num();
+  const int label_datum_dim = bottom[1]->count() / bottom[1]->num();
 
-	for (int i = 0; i < bottom[0]->num(); ++i) {
-		OCL_CHECK(
-				clEnqueueReadBuffer(amdDevice.CommandQueue,
-						(cl_mem) bottom[0]->gpu_data(), CL_TRUE,
-						i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim,
-						&data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL));
-		OCL_CHECK(
-				clEnqueueReadBuffer(amdDevice.CommandQueue,
-						(cl_mem) bottom[1]->gpu_data(), CL_TRUE,
-						i * label_datum_dim * sizeof(Dtype),
-						sizeof(Dtype) * label_datum_dim,
-						&label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL,
-						NULL));
-	}
-	SaveBlobs();
+  for (int i = 0; i < bottom[0]->num(); ++i) {
+    OCL_CHECK(
+        clEnqueueReadBuffer(amdDevice.CommandQueue,
+            (cl_mem) bottom[0]->gpu_data(), CL_TRUE,
+            i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim,
+            &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL));
+    OCL_CHECK(
+        clEnqueueReadBuffer(amdDevice.CommandQueue,
+            (cl_mem) bottom[1]->gpu_data(), CL_TRUE,
+            i * label_datum_dim * sizeof(Dtype),
+            sizeof(Dtype) * label_datum_dim,
+            &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL,
+            NULL));
+  }
+  SaveBlobs();
 }
 
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	return;
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  return;
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp
index d415bd64..b2259859 100644
--- a/src/caffe/layers/hinge_loss_layer.cpp
+++ b/src/caffe/layers/hinge_loss_layer.cpp
@@ -12,68 +12,68 @@ namespace caffe {
 
 template <typename Dtype>
 void HingeLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	const Dtype* label = bottom[1]->cpu_data();
-	int num = bottom[0]->num();
-	int count = bottom[0]->count();
-	int dim = count / num;
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  const Dtype* label = bottom[1]->cpu_data();
+  int num = bottom[0]->num();
+  int count = bottom[0]->count();
+  int dim = count / num;
 
-	caffe_copy(count, bottom_data, bottom_diff);
-	for (int i = 0; i < num; ++i) {
-		bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
-	}
-	for (int i = 0; i < num; ++i) {
-		for (int j = 0; j < dim; ++j) {
-			bottom_diff[i * dim + j] = std::max(
-					Dtype(0), 1 + bottom_diff[i * dim + j]);
-		}
-	}
-	Dtype* loss = top[0]->mutable_cpu_data();
-	switch (this->layer_param_.hinge_loss_param().norm()) {
-		case HingeLossParameter_Norm_L1:
-			loss[0] = caffe_cpu_asum(count, bottom_diff) / num;
-			break;
-		case HingeLossParameter_Norm_L2:
-			loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num;
-			break;
-		default:
-			LOG(FATAL) << "Unknown Norm";
-	}
+  caffe_copy(count, bottom_data, bottom_diff);
+  for (int i = 0; i < num; ++i) {
+    bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
+  }
+  for (int i = 0; i < num; ++i) {
+    for (int j = 0; j < dim; ++j) {
+      bottom_diff[i * dim + j] = std::max(Dtype(0),
+          1 + bottom_diff[i * dim + j]);
+    }
+  }
+  Dtype* loss = top[0]->mutable_cpu_data();
+  switch (this->layer_param_.hinge_loss_param().norm()) {
+  case HingeLossParameter_Norm_L1:
+    loss[0] = caffe_cpu_asum(count, bottom_diff) / num;
+    break;
+  case HingeLossParameter_Norm_L2:
+    loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num;
+    break;
+  default:
+    LOG(FATAL) << "Unknown Norm";
+  }
 }
 
 template <typename Dtype>
 void HingeLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[1]) {
-		LOG(FATAL) << this->type()
-				<< " Layer cannot backpropagate to label inputs.";
-	}
-	if (propagate_down[0]) {
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		const Dtype* label = bottom[1]->cpu_data();
-		int num = bottom[0]->num();
-		int count = bottom[0]->count();
-		int dim = count / num;
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+        << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const Dtype* label = bottom[1]->cpu_data();
+    int num = bottom[0]->num();
+    int count = bottom[0]->count();
+    int dim = count / num;
 
-		for (int i = 0; i < num; ++i) {
-			bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
-		}
+    for (int i = 0; i < num; ++i) {
+      bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
+    }
 
-		const Dtype loss_weight = top[0]->cpu_diff()[0];
-		switch (this->layer_param_.hinge_loss_param().norm()) {
-			case HingeLossParameter_Norm_L1:
-				caffe_cpu_sign(count, bottom_diff, bottom_diff);
-				caffe_scal(count, loss_weight / num, bottom_diff);
-				break;
-			case HingeLossParameter_Norm_L2:
-				caffe_scal(count, loss_weight * 2 / num, bottom_diff);
-				break;
-			default:
-				LOG(FATAL) << "Unknown Norm";
-		}
-	}
+    const Dtype loss_weight = top[0]->cpu_diff()[0];
+    switch (this->layer_param_.hinge_loss_param().norm()) {
+    case HingeLossParameter_Norm_L1:
+      caffe_cpu_sign(count, bottom_diff, bottom_diff);
+      caffe_scal(count, loss_weight / num, bottom_diff);
+      break;
+    case HingeLossParameter_Norm_L2:
+      caffe_scal(count, loss_weight * 2 / num, bottom_diff);
+      break;
+    default:
+      LOG(FATAL) << "Unknown Norm";
+    }
+  }
 }
 
 INSTANTIATE_CLASS (HingeLossLayer);
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index a8ddc7fe..886782b9 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -9,104 +9,106 @@ namespace caffe {
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	ConvolutionParameter conv_param = this->layer_param_.convolution_param();
-	CHECK(!conv_param.has_kernel_size() !=
-			!(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-			<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-	CHECK(conv_param.has_kernel_size() ||
-			(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-			<< "For non-square filters both kernel_h and kernel_w are required.";
-	CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
-			&& conv_param.has_pad_w())
-			|| (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
-			<< "pad is pad OR pad_h and pad_w are required.";
-	CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
-			&& conv_param.has_stride_w())
-			|| (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
-			<< "Stride is stride OR stride_h and stride_w are required.";
-	if (conv_param.has_kernel_size()) {
-		kernel_h_ = kernel_w_ = conv_param.kernel_size();
-	} else {
-		kernel_h_ = conv_param.kernel_h();
-		kernel_w_ = conv_param.kernel_w();
-	}
-	CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-	CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-	if (!conv_param.has_pad_h()) {
-		pad_h_ = pad_w_ = conv_param.pad();
-	} else {
-		pad_h_ = conv_param.pad_h();
-		pad_w_ = conv_param.pad_w();
-	}
-	if (!conv_param.has_stride_h()) {
-		stride_h_ = stride_w_ = conv_param.stride();
-	} else {
-		stride_h_ = conv_param.stride_h();
-		stride_w_ = conv_param.stride_w();
-	}
+    const vector<Blob<Dtype>*>& top) {
+  ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+  CHECK(
+      !conv_param.has_kernel_size()
+          != !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+  CHECK(
+      conv_param.has_kernel_size()
+          || (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
+      << "For non-square filters both kernel_h and kernel_w are required.";
+  CHECK(
+      (!conv_param.has_pad() && conv_param.has_pad_h() && conv_param.has_pad_w())
+          || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK(
+      (!conv_param.has_stride() && conv_param.has_stride_h()
+          && conv_param.has_stride_w())
+          || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  if (conv_param.has_kernel_size()) {
+    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  } else {
+    kernel_h_ = conv_param.kernel_h();
+    kernel_w_ = conv_param.kernel_w();
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!conv_param.has_pad_h()) {
+    pad_h_ = pad_w_ = conv_param.pad();
+  } else {
+    pad_h_ = conv_param.pad_h();
+    pad_w_ = conv_param.pad_w();
+  }
+  if (!conv_param.has_stride_h()) {
+    stride_h_ = stride_w_ = conv_param.stride();
+  } else {
+    stride_h_ = conv_param.stride_h();
+    stride_w_ = conv_param.stride_w();
+  }
 }
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-			<< "corresponding to (num, channels, height, width)";
-	channels_ = bottom[0]->channels();
-	height_ = bottom[0]->height();
-	width_ = bottom[0]->width();
-	top[0]->Reshape(
-			bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
-			(height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
-			(width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+      << "corresponding to (num, channels, height, width)";
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  top[0]->Reshape(bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
+      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
+      (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
 }
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	for (int n = 0; n < bottom[0]->num(); ++n) {
-		im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-				width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-				stride_h_, stride_w_, top_data + top[0]->offset(n));
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int n = 0; n < bottom[0]->num(); ++n) {
+    im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, width_,
+        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
+        top_data + top[0]->offset(n));
+  }
 }
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* top_diff = top[0]->cpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	for (int n = 0; n < top[0]->num(); ++n) {
-		col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-				kernel_h_, kernel_w_, pad_h_, pad_w_,
-				stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  for (int n = 0; n < top[0]->num(); ++n) {
+    col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
+        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
+        bottom_diff + bottom[0]->offset(n));
+  }
 }
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	for (int n = 0; n < bottom[0]->num(); ++n) {
-		im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_,
-				width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-				stride_h_, stride_w_, top_data, top[0]->offset(n));
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  for (int n = 0; n < bottom[0]->num(); ++n) {
+    im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, width_,
+        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, top_data,
+        top[0]->offset(n));
+  }
 }
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* top_diff = top[0]->gpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-	for (int n = 0; n < top[0]->num(); ++n) {
-		col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_,
-				kernel_h_, kernel_w_, pad_h_, pad_w_,
-				stride_h_, stride_w_, bottom_diff, bottom[0]->offset(n));
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  for (int n = 0; n < top[0]->num(); ++n) {
+    col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_,
+        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, bottom_diff,
+        bottom[0]->offset(n));
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp
index 24ac8ffc..21957551 100644
--- a/src/caffe/layers/image_data_layer.cpp
+++ b/src/caffe/layers/image_data_layer.cpp
@@ -17,140 +17,141 @@ namespace caffe {
 
 template <typename Dtype>
 ImageDataLayer<Dtype>::~ImageDataLayer<Dtype>() {
-	this->JoinPrefetchThread();
+  this->JoinPrefetchThread();
 }
 
 template <typename Dtype>
 void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int new_height = this->layer_param_.image_data_param().new_height();
-	const int new_width = this->layer_param_.image_data_param().new_width();
-	const bool is_color = this->layer_param_.image_data_param().is_color();
-	string root_folder = this->layer_param_.image_data_param().root_folder();
-
-	CHECK((new_height == 0 && new_width == 0) ||
-			(new_height > 0 && new_width > 0)) << "Current implementation requires "
-			"new_height and new_width to be set at the same time.";
-	// Read the file with filenames and labels
-	const string& source = this->layer_param_.image_data_param().source();
-	LOG(INFO) << "Opening file " << source;
-	std::ifstream infile(source.c_str());
-	string filename;
-	int label;
-	while (infile >> filename >> label) {
-		lines_.push_back(std::make_pair(filename, label));
-	}
-
-	if (this->layer_param_.image_data_param().shuffle()) {
-		// randomly shuffle data
-		LOG(INFO) << "Shuffling data";
-		const unsigned int prefetch_rng_seed = caffe_rng_rand();
-		prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
-		ShuffleImages();
-	}
-	LOG(INFO) << "A total of " << lines_.size() << " images.";
-
-	lines_id_ = 0;
-	// Check if we would need to randomly skip a few data points
-	if (this->layer_param_.image_data_param().rand_skip()) {
-		unsigned int skip = caffe_rng_rand() %
-				this->layer_param_.image_data_param().rand_skip();
-		LOG(INFO) << "Skipping first " << skip << " data points.";
-		CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
-		lines_id_ = skip;
-	}
-	// Read an image, and use it to initialize the top blob.
-	cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-			new_height, new_width, is_color);
-	// Use data_transformer to infer the expected blob shape from a cv_image.
-	vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
-	this->transformed_data_.Reshape(top_shape);
-	// Reshape prefetch_data and top[0] according to the batch_size.
-	const int batch_size = this->layer_param_.image_data_param().batch_size();
-	top_shape[0] = batch_size;
-	this->prefetch_data_.Reshape(top_shape);
-	top[0]->ReshapeLike(this->prefetch_data_);
-
-	LOG(INFO) << "output data size: " << top[0]->num() << ","
-			<< top[0]->channels() << "," << top[0]->height() << ","
-			<< top[0]->width();
-	// label
-	vector<int> label_shape(1, batch_size);
-	top[1]->Reshape(label_shape);
-	this->prefetch_label_.Reshape(label_shape);
+    const vector<Blob<Dtype>*>& top) {
+  const int new_height = this->layer_param_.image_data_param().new_height();
+  const int new_width = this->layer_param_.image_data_param().new_width();
+  const bool is_color = this->layer_param_.image_data_param().is_color();
+  string root_folder = this->layer_param_.image_data_param().root_folder();
+
+  CHECK(
+      (new_height == 0 && new_width == 0) || (new_height > 0 && new_width > 0))
+      << "Current implementation requires "
+          "new_height and new_width to be set at the same time.";
+  // Read the file with filenames and labels
+  const string& source = this->layer_param_.image_data_param().source();
+  LOG(INFO) << "Opening file " << source;
+  std::ifstream infile(source.c_str());
+  string filename;
+  int label;
+  while (infile >> filename >> label) {
+    lines_.push_back(std::make_pair(filename, label));
+  }
+
+  if (this->layer_param_.image_data_param().shuffle()) {
+    // randomly shuffle data
+    LOG(INFO) << "Shuffling data";
+    const unsigned int prefetch_rng_seed = caffe_rng_rand();
+    prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
+    ShuffleImages();
+  }
+  LOG(INFO) << "A total of " << lines_.size() << " images.";
+
+  lines_id_ = 0;
+  // Check if we would need to randomly skip a few data points
+  if (this->layer_param_.image_data_param().rand_skip()) {
+    unsigned int skip = caffe_rng_rand()
+        % this->layer_param_.image_data_param().rand_skip();
+    LOG(INFO) << "Skipping first " << skip << " data points.";
+    CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
+    lines_id_ = skip;
+  }
+  // Read an image, and use it to initialize the top blob.
+  cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
+      new_height, new_width, is_color);
+  // Use data_transformer to infer the expected blob shape from a cv_image.
+  vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
+  this->transformed_data_.Reshape(top_shape);
+  // Reshape prefetch_data and top[0] according to the batch_size.
+  const int batch_size = this->layer_param_.image_data_param().batch_size();
+  top_shape[0] = batch_size;
+  this->prefetch_data_.Reshape(top_shape);
+  top[0]->ReshapeLike(this->prefetch_data_);
+
+  LOG(INFO) << "output data size: " << top[0]->num() << ","
+      << top[0]->channels() << "," << top[0]->height() << ","
+      << top[0]->width();
+  // label
+  vector<int> label_shape(1, batch_size);
+  top[1]->Reshape(label_shape);
+  this->prefetch_label_.Reshape(label_shape);
 }
 
 template <typename Dtype>
 void ImageDataLayer<Dtype>::ShuffleImages() {
-	caffe::rng_t* prefetch_rng =
-			static_cast<caffe::rng_t*>(prefetch_rng_->generator());
-	shuffle(lines_.begin(), lines_.end(), prefetch_rng);
+  caffe::rng_t* prefetch_rng =
+      static_cast<caffe::rng_t*>(prefetch_rng_->generator());
+  shuffle(lines_.begin(), lines_.end(), prefetch_rng);
 }
 
 // This function is used to create a thread that prefetches the data.
 template <typename Dtype>
 void ImageDataLayer<Dtype>::InternalThreadEntry() {
-	CPUTimer batch_timer;
-	batch_timer.Start();
-	double read_time = 0;
-	double trans_time = 0;
-	CPUTimer timer;
-	CHECK(this->prefetch_data_.count());
-	CHECK(this->transformed_data_.count());
-	ImageDataParameter image_data_param = this->layer_param_.image_data_param();
-	const int batch_size = image_data_param.batch_size();
-	const int new_height = image_data_param.new_height();
-	const int new_width = image_data_param.new_width();
-	const bool is_color = image_data_param.is_color();
-	string root_folder = image_data_param.root_folder();
-
-	// Reshape according to the first image of each batch
-	// on single input batches allows for inputs of varying dimension.
-	cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-			new_height, new_width, is_color);
-	// Use data_transformer to infer the expected blob shape from a cv_img.
-	vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
-	this->transformed_data_.Reshape(top_shape);
-	// Reshape prefetch_data according to the batch_size.
-	top_shape[0] = batch_size;
-	this->prefetch_data_.Reshape(top_shape);
-
-	Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data();
-	Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data();
-
-	// datum scales
-	const int lines_size = lines_.size();
-	for (int item_id = 0; item_id < batch_size; ++item_id) {
-		// get a blob
-		timer.Start();
-		CHECK_GT(lines_size, lines_id_);
-		cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
-				new_height, new_width, is_color);
-		CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
-		read_time += timer.MicroSeconds();
-		timer.Start();
-		// Apply transformations (mirror, crop...) to the image
-		int offset = this->prefetch_data_.offset(item_id);
-		this->transformed_data_.set_cpu_data(prefetch_data + offset);
-		this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
-		trans_time += timer.MicroSeconds();
-
-		prefetch_label[item_id] = lines_[lines_id_].second;
-		// go to the next iter
-		lines_id_++;
-		if (lines_id_ >= lines_size) {
-			// We have reached the end. Restart from the first.
-			DLOG(INFO) << "Restarting data prefetching from start.";
-			lines_id_ = 0;
-			if (this->layer_param_.image_data_param().shuffle()) {
-				ShuffleImages();
-			}
-		}
-	}
-	batch_timer.Stop();
-	DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
-	DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
-	DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
+  CPUTimer batch_timer;
+  batch_timer.Start();
+  double read_time = 0;
+  double trans_time = 0;
+  CPUTimer timer;
+  CHECK(this->prefetch_data_.count());
+  CHECK(this->transformed_data_.count());
+  ImageDataParameter image_data_param = this->layer_param_.image_data_param();
+  const int batch_size = image_data_param.batch_size();
+  const int new_height = image_data_param.new_height();
+  const int new_width = image_data_param.new_width();
+  const bool is_color = image_data_param.is_color();
+  string root_folder = image_data_param.root_folder();
+
+  // Reshape according to the first image of each batch
+  // on single input batches allows for inputs of varying dimension.
+  cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
+      new_height, new_width, is_color);
+  // Use data_transformer to infer the expected blob shape from a cv_img.
+  vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
+  this->transformed_data_.Reshape(top_shape);
+  // Reshape prefetch_data according to the batch_size.
+  top_shape[0] = batch_size;
+  this->prefetch_data_.Reshape(top_shape);
+
+  Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data();
+  Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data();
+
+  // datum scales
+  const int lines_size = lines_.size();
+  for (int item_id = 0; item_id < batch_size; ++item_id) {
+    // get a blob
+    timer.Start();
+    CHECK_GT(lines_size, lines_id_);
+    cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
+        new_height, new_width, is_color);
+    CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
+    read_time += timer.MicroSeconds();
+    timer.Start();
+    // Apply transformations (mirror, crop...) to the image
+    int offset = this->prefetch_data_.offset(item_id);
+    this->transformed_data_.set_cpu_data(prefetch_data + offset);
+    this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
+    trans_time += timer.MicroSeconds();
+
+    prefetch_label[item_id] = lines_[lines_id_].second;
+    // go to the next iter
+    lines_id_++;
+    if (lines_id_ >= lines_size) {
+      // We have reached the end. Restart from the first.
+      DLOG(INFO) << "Restarting data prefetching from start.";
+      lines_id_ = 0;
+      if (this->layer_param_.image_data_param().shuffle()) {
+        ShuffleImages();
+      }
+    }
+  }
+  batch_timer.Stop();
+  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
+  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
+  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
 }
 
 INSTANTIATE_CLASS (ImageDataLayer);
diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp
index 21414224..ffd2ab97 100644
--- a/src/caffe/layers/infogain_loss_layer.cpp
+++ b/src/caffe/layers/infogain_loss_layer.cpp
@@ -11,97 +11,96 @@
 namespace caffe {
 
 template <typename Dtype>
-void InfogainLossLayer<Dtype>::LayerSetUp(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::LayerSetUp(bottom, top);
-	if (bottom.size() < 3) {
-		CHECK(this->layer_param_.infogain_loss_param().has_source())
-				<< "Infogain matrix source must be specified.";
-		BlobProto blob_proto;
-		ReadProtoFromBinaryFile(
-				this->layer_param_.infogain_loss_param().source(), &blob_proto);
-		infogain_.FromProto(blob_proto);
-	}
+void InfogainLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::LayerSetUp(bottom, top);
+  if (bottom.size() < 3) {
+    CHECK(this->layer_param_.infogain_loss_param().has_source())
+        << "Infogain matrix source must be specified.";
+    BlobProto blob_proto;
+    ReadProtoFromBinaryFile(this->layer_param_.infogain_loss_param().source(),
+        &blob_proto);
+    infogain_.FromProto(blob_proto);
+  }
 }
 
 template <typename Dtype>
-void InfogainLossLayer<Dtype>::Reshape(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::Reshape(bottom, top);
-	Blob < Dtype > *infogain = NULL;
-	if (bottom.size() < 3) {
-		infogain = &infogain_;
-	} else {
-		infogain = bottom[2];
-	}
-	CHECK_EQ(bottom[1]->channels(), 1);
-	CHECK_EQ(bottom[1]->height(), 1);
-	CHECK_EQ(bottom[1]->width(), 1);
-	const int num = bottom[0]->num();
-	const int dim = bottom[0]->count() / num;
-	CHECK_EQ(infogain->num(), 1);
-	CHECK_EQ(infogain->channels(), 1);
-	CHECK_EQ(infogain->height(), dim);
-	CHECK_EQ(infogain->width(), dim);
+void InfogainLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::Reshape(bottom, top);
+  Blob < Dtype > *infogain = NULL;
+  if (bottom.size() < 3) {
+    infogain = &infogain_;
+  } else {
+    infogain = bottom[2];
+  }
+  CHECK_EQ(bottom[1]->channels(), 1);
+  CHECK_EQ(bottom[1]->height(), 1);
+  CHECK_EQ(bottom[1]->width(), 1);
+  const int num = bottom[0]->num();
+  const int dim = bottom[0]->count() / num;
+  CHECK_EQ(infogain->num(), 1);
+  CHECK_EQ(infogain->channels(), 1);
+  CHECK_EQ(infogain->height(), dim);
+  CHECK_EQ(infogain->width(), dim);
 }
 
 template <typename Dtype>
 void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	const Dtype* bottom_label = bottom[1]->cpu_data();
-	const Dtype* infogain_mat = NULL;
-	if (bottom.size() < 3) {
-		infogain_mat = infogain_.cpu_data();
-	} else {
-		infogain_mat = bottom[2]->cpu_data();
-	}
-	int num = bottom[0]->num();
-	int dim = bottom[0]->count() / bottom[0]->num();
-	Dtype loss = 0;
-	for (int i = 0; i < num; ++i) {
-		int label = static_cast<int>(bottom_label[i]);
-		for (int j = 0; j < dim; ++j) {
-			Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-			loss -= infogain_mat[label * dim + j] * log(prob);
-		}
-	}
-	top[0]->mutable_cpu_data()[0] = loss / num;
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* bottom_label = bottom[1]->cpu_data();
+  const Dtype* infogain_mat = NULL;
+  if (bottom.size() < 3) {
+    infogain_mat = infogain_.cpu_data();
+  } else {
+    infogain_mat = bottom[2]->cpu_data();
+  }
+  int num = bottom[0]->num();
+  int dim = bottom[0]->count() / bottom[0]->num();
+  Dtype loss = 0;
+  for (int i = 0; i < num; ++i) {
+    int label = static_cast<int>(bottom_label[i]);
+    for (int j = 0; j < dim; ++j) {
+      Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
+      loss -= infogain_mat[label * dim + j] * log(prob);
+    }
+  }
+  top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
 template <typename Dtype>
 void InfogainLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[1]) {
-		LOG(FATAL) << this->type()
-				<< " Layer cannot backpropagate to label inputs.";
-	}
-	if (propagate_down.size() > 2 && propagate_down[2]) {
-		LOG(FATAL) << this->type()
-				<< " Layer cannot backpropagate to infogain inputs.";
-	}
-	if (propagate_down[0]) {
-		const Dtype* bottom_data = bottom[0]->cpu_data();
-		const Dtype* bottom_label = bottom[1]->cpu_data();
-		const Dtype* infogain_mat = NULL;
-		if (bottom.size() < 3) {
-			infogain_mat = infogain_.cpu_data();
-		} else {
-			infogain_mat = bottom[2]->cpu_data();
-		}
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		int num = bottom[0]->num();
-		int dim = bottom[0]->count() / bottom[0]->num();
-		const Dtype scale = -top[0]->cpu_diff()[0] / num;
-		for (int i = 0; i < num; ++i) {
-			const int label = static_cast<int>(bottom_label[i]);
-			for (int j = 0; j < dim; ++j) {
-				Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-				bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob;
-			}
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+        << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down.size() > 2 && propagate_down[2]) {
+    LOG(FATAL) << this->type()
+        << " Layer cannot backpropagate to infogain inputs.";
+  }
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* bottom_label = bottom[1]->cpu_data();
+    const Dtype* infogain_mat = NULL;
+    if (bottom.size() < 3) {
+      infogain_mat = infogain_.cpu_data();
+    } else {
+      infogain_mat = bottom[2]->cpu_data();
+    }
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    int num = bottom[0]->num();
+    int dim = bottom[0]->count() / bottom[0]->num();
+    const Dtype scale = -top[0]->cpu_diff()[0] / num;
+    for (int i = 0; i < num; ++i) {
+      const int label = static_cast<int>(bottom_label[i]);
+      for (int j = 0; j < dim; ++j) {
+        Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
+        bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob;
+      }
+    }
+  }
 }
 
 INSTANTIATE_CLASS (InfogainLossLayer);
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 3beca42f..b9ae3370 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -11,159 +11,150 @@ namespace caffe {
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int num_output = this->layer_param_.inner_product_param().num_output();
-	bias_term_ = this->layer_param_.inner_product_param().bias_term();
-	N_ = num_output;
-	const int axis = bottom[0]->CanonicalAxisIndex(
-			this->layer_param_.inner_product_param().axis());
-	// Dimensions starting from "axis" are "flattened" into a single
-	// length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
-	// and axis == 1, N inner products with dimension CHW are performed.
-	K_ = bottom[0]->count(axis);
-	// Check if we need to set up the weights
-	if (this->blobs_.size() > 0) {
-		LOG(INFO) << "Skipping parameter initialization";
-	} else {
-		if (bias_term_) {
-			this->blobs_.resize(2);
-		} else {
-			this->blobs_.resize(1);
-		}
-		// Intialize the weight
-		vector<int> weight_shape(2);
-		weight_shape[0] = N_;
-		weight_shape[1] = K_;
-		this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
-		// fill the weights
-		shared_ptr < Filler<Dtype> > weight_filler(GetFiller < Dtype > (
-				this->layer_param_.inner_product_param().weight_filler()));
-		weight_filler->Fill(this->blobs_[0].get());
-		// If necessary, intiialize and fill the bias term
-		if (bias_term_) {
-			vector<int> bias_shape(1, N_);
-			this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
-			shared_ptr < Filler<Dtype> > bias_filler(GetFiller < Dtype > (
-					this->layer_param_.inner_product_param().bias_filler()));
-			bias_filler->Fill(this->blobs_[1].get());
-		}
-	}  // parameter initialization
-	this->param_propagate_down_.resize(this->blobs_.size(), true);
+    const vector<Blob<Dtype>*>& top) {
+  const int num_output = this->layer_param_.inner_product_param().num_output();
+  bias_term_ = this->layer_param_.inner_product_param().bias_term();
+  N_ = num_output;
+  const int axis = bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.inner_product_param().axis());
+  // Dimensions starting from "axis" are "flattened" into a single
+  // length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
+  // and axis == 1, N inner products with dimension CHW are performed.
+  K_ = bottom[0]->count(axis);
+  // Check if we need to set up the weights
+  if (this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else {
+    if (bias_term_) {
+      this->blobs_.resize(2);
+    } else {
+      this->blobs_.resize(1);
+    }
+    // Intialize the weight
+    vector<int> weight_shape(2);
+    weight_shape[0] = N_;
+    weight_shape[1] = K_;
+    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
+    // fill the weights
+    shared_ptr < Filler<Dtype>
+        > weight_filler(
+            GetFiller < Dtype
+                > (this->layer_param_.inner_product_param().weight_filler()));
+    weight_filler->Fill(this->blobs_[0].get());
+    // If necessary, intiialize and fill the bias term
+    if (bias_term_) {
+      vector<int> bias_shape(1, N_);
+      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
+      shared_ptr < Filler<Dtype>
+          > bias_filler(
+              GetFiller < Dtype
+                  > (this->layer_param_.inner_product_param().bias_filler()));
+      bias_filler->Fill(this->blobs_[1].get());
+    }
+  }  // parameter initialization
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
 }
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	// Figure out the dimensions
-	const int axis = bottom[0]->CanonicalAxisIndex(
-			this->layer_param_.inner_product_param().axis());
-	const int new_K = bottom[0]->count(axis);
-	CHECK_EQ(K_, new_K)
-			<< "Input size incompatible with inner product parameters.";
-	// The first "axis" dimensions are independent inner products; the total
-	// number of these is M_, the product over these dimensions.
-	M_ = bottom[0]->count(0, axis);
-	// The top shape will be the bottom shape with the flattened axes dropped,
-	// and replaced by a single axis with dimension num_output (N_).
-	vector<int> top_shape = bottom[0]->shape();
-	top_shape.resize(axis + 1);
-	top_shape[axis] = N_;
-	top[0]->Reshape(top_shape);
-	// Set up the bias multiplier
-	if (bias_term_) {
-		vector<int> bias_shape(1, M_);
-		bias_multiplier_.Reshape(bias_shape);
-		caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data());
-	}
+    const vector<Blob<Dtype>*>& top) {
+  // Figure out the dimensions
+  const int axis = bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.inner_product_param().axis());
+  const int new_K = bottom[0]->count(axis);
+  CHECK_EQ(K_, new_K)
+      << "Input size incompatible with inner product parameters.";
+  // The first "axis" dimensions are independent inner products; the total
+  // number of these is M_, the product over these dimensions.
+  M_ = bottom[0]->count(0, axis);
+  // The top shape will be the bottom shape with the flattened axes dropped,
+  // and replaced by a single axis with dimension num_output (N_).
+  vector<int> top_shape = bottom[0]->shape();
+  top_shape.resize(axis + 1);
+  top_shape[axis] = N_;
+  top[0]->Reshape(top_shape);
+  // Set up the bias multiplier
+  if (bias_term_) {
+    vector<int> bias_shape(1, M_);
+    bias_multiplier_.Reshape(bias_shape);
+    caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data());
+  }
 }
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const Dtype* weight = this->blobs_[0]->cpu_data();
-	caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1.,
-			bottom_data, weight, (Dtype) 0., top_data);
-	if (bias_term_) {
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1.,
-				bias_multiplier_.cpu_data(),
-				this->blobs_[1]->cpu_data(), (Dtype) 1., top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  caffe_cpu_gemm < Dtype
+      > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., bottom_data, weight, (Dtype) 0., top_data);
+  if (bias_term_) {
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1., bias_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), (Dtype) 1., top_data);
+  }
 }
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (this->param_propagate_down_[0]) {
-		const Dtype* top_diff = top[0]->cpu_diff();
-		const Dtype* bottom_data = bottom[0]->cpu_data();
-		// Gradient with respect to weight
-		caffe_cpu_gemm < Dtype > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1.,
-				top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff());
-	}
-	if (bias_term_ && this->param_propagate_down_[1]) {
-		const Dtype* top_diff = top[0]->cpu_diff();
-		// Gradient with respect to bias
-		caffe_cpu_gemv < Dtype > (CblasTrans, M_, N_, (Dtype) 1., top_diff,
-				bias_multiplier_.cpu_data(), (Dtype) 1.,
-				this->blobs_[1]->mutable_cpu_diff());
-	}
-	if (propagate_down[0]) {
-		const Dtype* top_diff = top[0]->cpu_diff();
-		// Gradient with respect to bottom data
-		caffe_cpu_gemm < Dtype
-				> (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1.,
-						top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0.,
-						bottom[0]->mutable_cpu_diff());
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (this->param_propagate_down_[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    // Gradient with respect to weight
+    caffe_cpu_gemm < Dtype
+        > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff());
+  }
+  if (bias_term_ && this->param_propagate_down_[1]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    // Gradient with respect to bias
+    caffe_cpu_gemv < Dtype
+        > (CblasTrans, M_, N_, (Dtype) 1., top_diff, bias_multiplier_.cpu_data(), (Dtype) 1., this->blobs_[1]->mutable_cpu_diff());
+  }
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    // Gradient with respect to bottom data
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0., bottom[0]->mutable_cpu_diff());
+  }
 }
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const Dtype* weight = this->blobs_[0]->gpu_data();
-	caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1.,
-			bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0);
-	if (bias_term_) {
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1.,
-				bias_multiplier_.gpu_data(), 0,
-				this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  caffe_gpu_gemm < Dtype
+      > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0);
+  if (bias_term_) {
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1., bias_multiplier_.gpu_data(), 0, this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0);
+  }
 }
 
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (this->param_propagate_down_[0]) {
-		const Dtype* top_diff = top[0]->gpu_diff();
-		const Dtype* bottom_data = bottom[0]->gpu_data();
-		// Gradient with respect to weight
-		caffe_gpu_gemm < Dtype
-				> (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1.,
-						top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0);
-	}
-	if (bias_term_ && this->param_propagate_down_[1]) {
-		const Dtype* top_diff = top[0]->gpu_diff();
-		// Gradient with respect to bias
-		caffe_gpu_gemv < Dtype
-				> (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff,
-						(size_t) 0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()),
-						(size_t) 0, (Dtype) 0., 1,
-						this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1);
-	}
-	if (propagate_down[0]) {
-		const Dtype* top_diff = top[0]->gpu_diff();
-		// Gradient with respect to bottom data
-		caffe_gpu_gemm < Dtype
-				> (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1.,
-						top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0.,
-						bottom[0]->mutable_gpu_diff(), 0);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (this->param_propagate_down_[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    // Gradient with respect to weight
+    caffe_gpu_gemm < Dtype
+        > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0);
+  }
+  if (bias_term_ && this->param_propagate_down_[1]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    // Gradient with respect to bias
+    caffe_gpu_gemv < Dtype
+        > (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff, (size_t) 0, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 0., 1, this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1);
+  }
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    // Gradient with respect to bottom data
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., bottom[0]->mutable_gpu_diff(), 0);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index 60b08d99..f6ace662 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -9,121 +9,121 @@ namespace caffe {
 
 template <typename Dtype>
 void LogLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
-	const Dtype base = this->layer_param_.log_param().base();
-	if (base != Dtype(-1)) {
-		CHECK_GT(base, 0) << "base must be strictly positive.";
-	}
-	// If base == -1, interpret the base as e and set log_base = 1 exactly.
-	// Otherwise, calculate its log explicitly.
-	const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
-	CHECK(!isnan(log_base))
-			<< "NaN result: log(base) = log(" << base << ") = " << log_base;
-	CHECK(!isinf(log_base))
-			<< "Inf result: log(base) = log(" << base << ") = " << log_base;
-	base_scale_ = Dtype(1) / log_base;
-	CHECK(!isnan(base_scale_))
-			<< "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
-	CHECK(!isinf(base_scale_))
-			<< "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_;
-	input_scale_ = this->layer_param_.log_param().scale();
-	input_shift_ = this->layer_param_.log_param().shift();
-	backward_num_scale_ = input_scale_ / log_base;
+    const vector<Blob<Dtype>*>& top) {
+  NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+  const Dtype base = this->layer_param_.log_param().base();
+  if (base != Dtype(-1)) {
+    CHECK_GT(base, 0) << "base must be strictly positive.";
+  }
+  // If base == -1, interpret the base as e and set log_base = 1 exactly.
+  // Otherwise, calculate its log explicitly.
+  const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base);
+  CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = "
+      << log_base;
+  CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = "
+      << log_base;
+  base_scale_ = Dtype(1) / log_base;
+  CHECK(!isnan(base_scale_)) << "NaN result: 1/log(base) = 1/log(" << base
+      << ") = " << base_scale_;
+  CHECK(!isinf(base_scale_)) << "Inf result: 1/log(base) = 1/log(" << base
+      << ") = " << base_scale_;
+  input_scale_ = this->layer_param_.log_param().scale();
+  input_shift_ = this->layer_param_.log_param().shift();
+  backward_num_scale_ = input_scale_ / log_base;
 }
 
 template <typename Dtype>
 void LogLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int count = bottom[0]->count();
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
-		caffe_log(count, bottom_data, top_data);
-	} else {
-		caffe_copy(count, bottom_data, top_data);
-		if (input_scale_ != Dtype(1)) {
-			caffe_scal(count, input_scale_, top_data);
-		}
-		if (input_shift_ != Dtype(0)) {
-			caffe_add_scalar(count, input_shift_, top_data);
-		}
-		caffe_log(count, top_data, top_data);
-	}
-	if (base_scale_ != Dtype(1)) {
-		caffe_scal(count, base_scale_, top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const int count = bottom[0]->count();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
+    caffe_log(count, bottom_data, top_data);
+  } else {
+    caffe_copy(count, bottom_data, top_data);
+    if (input_scale_ != Dtype(1)) {
+      caffe_scal(count, input_scale_, top_data);
+    }
+    if (input_shift_ != Dtype(0)) {
+      caffe_add_scalar(count, input_shift_, top_data);
+    }
+    caffe_log(count, top_data, top_data);
+  }
+  if (base_scale_ != Dtype(1)) {
+    caffe_scal(count, base_scale_, top_data);
+  }
 }
 
 template <typename Dtype>
 void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	const int count = bottom[0]->count();
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	const Dtype* top_diff = top[0]->cpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	caffe_copy(count, bottom_data, bottom_diff);
-	if (input_scale_ != Dtype(1)) {
-		caffe_scal(count, input_scale_, bottom_diff);
-	}
-	if (input_shift_ != Dtype(0)) {
-		caffe_add_scalar(count, input_shift_, bottom_diff);
-	}
-	caffe_powx(count, bottom_diff, Dtype(-1), bottom_diff);
-	if (backward_num_scale_ != Dtype(1)) {
-		caffe_scal(count, backward_num_scale_, bottom_diff);
-	}
-	caffe_mul(count, top_diff, bottom_diff, bottom_diff);
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  const int count = bottom[0]->count();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  caffe_copy(count, bottom_data, bottom_diff);
+  if (input_scale_ != Dtype(1)) {
+    caffe_scal(count, input_scale_, bottom_diff);
+  }
+  if (input_shift_ != Dtype(0)) {
+    caffe_add_scalar(count, input_shift_, bottom_diff);
+  }
+  caffe_powx(count, bottom_diff, Dtype(-1), bottom_diff);
+  if (backward_num_scale_ != Dtype(1)) {
+    caffe_scal(count, backward_num_scale_, bottom_diff);
+  }
+  caffe_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
 template <typename Dtype>
 void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int count = bottom[0]->count();
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
-		caffe_gpu_log(count, bottom_data, top_data);
-	} else {
-		caffe_gpu_copy(count, bottom_data, top_data);
-		if (input_scale_ != Dtype(1)) {
-			caffe_gpu_scal(count, input_scale_, top_data);
-		}
-		if (input_shift_ != Dtype(0)) {
-			caffe_gpu_add_scalar(count, input_shift_, top_data);
-		}
-		caffe_gpu_log(count, top_data, top_data);
-	}
-	if (base_scale_ != Dtype(1)) {
-		caffe_gpu_scal(count, base_scale_, top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const int count = bottom[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) {
+    caffe_gpu_log(count, bottom_data, top_data);
+  } else {
+    caffe_gpu_copy(count, bottom_data, top_data);
+    if (input_scale_ != Dtype(1)) {
+      caffe_gpu_scal(count, input_scale_, top_data);
+    }
+    if (input_shift_ != Dtype(0)) {
+      caffe_gpu_add_scalar(count, input_shift_, top_data);
+    }
+    caffe_gpu_log(count, top_data, top_data);
+  }
+  if (base_scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, base_scale_, top_data);
+  }
 }
 
 template <typename Dtype>
 void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	const int count = bottom[0]->count();
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	const Dtype* top_diff = top[0]->gpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-	caffe_gpu_copy(count, bottom_data, bottom_diff);
-	if (input_scale_ != Dtype(1)) {
-		caffe_gpu_scal(count, input_scale_, bottom_diff);
-	}
-	if (input_shift_ != Dtype(0)) {
-		caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
-	}
-	caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
-	if (backward_num_scale_ != Dtype(1)) {
-		caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
-	}
-	caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  const int count = bottom[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  caffe_gpu_copy(count, bottom_data, bottom_diff);
+  if (input_scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, input_scale_, bottom_diff);
+  }
+  if (input_shift_ != Dtype(0)) {
+    caffe_gpu_add_scalar(count, input_shift_, bottom_diff);
+  }
+  caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff);
+  if (backward_num_scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, backward_num_scale_, bottom_diff);
+  }
+  caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp
index f5da913a..64abbaa0 100644
--- a/src/caffe/layers/loss_layer.cpp
+++ b/src/caffe/layers/loss_layer.cpp
@@ -11,21 +11,21 @@
 namespace caffe {
 
 template <typename Dtype>
-void LossLayer<Dtype>::LayerSetUp(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	// LossLayers have a non-zero (1) loss by default.
-	if (this->layer_param_.loss_weight_size() == 0) {
-		this->layer_param_.add_loss_weight(Dtype(1));
-	}
+void LossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  // LossLayers have a non-zero (1) loss by default.
+  if (this->layer_param_.loss_weight_size() == 0) {
+    this->layer_param_.add_loss_weight(Dtype(1));
+  }
 }
 
 template <typename Dtype>
-void LossLayer<Dtype>::Reshape(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(bottom[0]->num(), bottom[1]->num())
-			<< "The data and label should have the same number.";
-	vector<int> loss_shape(0);  // Loss layers output a scalar; 0 axes.
-	top[0]->Reshape(loss_shape);
+void LossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(bottom[0]->num(), bottom[1]->num())
+      << "The data and label should have the same number.";
+  vector<int> loss_shape(0);  // Loss layers output a scalar; 0 axes.
+  top[0]->Reshape(loss_shape);
 }
 
 INSTANTIATE_CLASS (LossLayer);
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 2dfcd645..00e554bd 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -10,303 +10,304 @@ namespace caffe {
 
 template <typename Dtype>
 void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	size_ = this->layer_param_.lrn_param().local_size();
-	CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size";
-	pre_pad_ = (size_ - 1) / 2;
-	alpha_ = this->layer_param_.lrn_param().alpha();
-	beta_ = this->layer_param_.lrn_param().beta();
-	k_ = this->layer_param_.lrn_param().k();
-	if (this->layer_param_.lrn_param().norm_region() ==
-			LRNParameter_NormRegion_WITHIN_CHANNEL) {
-		// Set up split_layer_ to use inputs in the numerator and denominator.
-		split_top_vec_.clear();
-		split_top_vec_.push_back(&product_input_);
-		split_top_vec_.push_back(&square_input_);
-		LayerParameter split_param;
-		split_layer_.reset(new SplitLayer<Dtype>(split_param));
-		split_layer_->SetUp(bottom, split_top_vec_);
-		// Set up square_layer_ to square the inputs.
-		square_bottom_vec_.clear();
-		square_top_vec_.clear();
-		square_bottom_vec_.push_back(&square_input_);
-		square_top_vec_.push_back(&square_output_);
-		LayerParameter square_param;
-		square_param.mutable_power_param()->set_power(Dtype(2));
-		square_layer_.reset(new PowerLayer<Dtype>(square_param));
-		square_layer_->SetUp(square_bottom_vec_, square_top_vec_);
-		// Set up pool_layer_ to sum over square neighborhoods of the input.
-		pool_top_vec_.clear();
-		pool_top_vec_.push_back(&pool_output_);
-		LayerParameter pool_param;
-		pool_param.mutable_pooling_param()->set_pool(
-				PoolingParameter_PoolMethod_AVE);
-		pool_param.mutable_pooling_param()->set_pad(pre_pad_);
-		pool_param.mutable_pooling_param()->set_kernel_size(size_);
-		pool_layer_.reset(new PoolingLayer<Dtype>(pool_param));
-		pool_layer_->SetUp(square_top_vec_, pool_top_vec_);
-		// Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is
-		// the sum of a squared neighborhood (the output of pool_layer_).
-		power_top_vec_.clear();
-		power_top_vec_.push_back(&power_output_);
-		LayerParameter power_param;
-		power_param.mutable_power_param()->set_power(-beta_);
-		power_param.mutable_power_param()->set_scale(alpha_);
-		power_param.mutable_power_param()->set_shift(Dtype(1));
-		power_layer_.reset(new PowerLayer<Dtype>(power_param));
-		power_layer_->SetUp(pool_top_vec_, power_top_vec_);
-		// Set up a product_layer_ to compute outputs by multiplying inputs by the
-		// inverse demoninator computed by the power layer.
-		product_bottom_vec_.clear();
-		product_bottom_vec_.push_back(&product_input_);
-		product_bottom_vec_.push_back(&power_output_);
-		LayerParameter product_param;
-		EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param();
-		eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD);
-		product_layer_.reset(new EltwiseLayer<Dtype>(product_param));
-		product_layer_->SetUp(product_bottom_vec_, top);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  size_ = this->layer_param_.lrn_param().local_size();
+  CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size";
+  pre_pad_ = (size_ - 1) / 2;
+  alpha_ = this->layer_param_.lrn_param().alpha();
+  beta_ = this->layer_param_.lrn_param().beta();
+  k_ = this->layer_param_.lrn_param().k();
+  if (this->layer_param_.lrn_param().norm_region()
+      == LRNParameter_NormRegion_WITHIN_CHANNEL) {
+    // Set up split_layer_ to use inputs in the numerator and denominator.
+    split_top_vec_.clear();
+    split_top_vec_.push_back(&product_input_);
+    split_top_vec_.push_back(&square_input_);
+    LayerParameter split_param;
+    split_layer_.reset(new SplitLayer<Dtype>(split_param));
+    split_layer_->SetUp(bottom, split_top_vec_);
+    // Set up square_layer_ to square the inputs.
+    square_bottom_vec_.clear();
+    square_top_vec_.clear();
+    square_bottom_vec_.push_back(&square_input_);
+    square_top_vec_.push_back(&square_output_);
+    LayerParameter square_param;
+    square_param.mutable_power_param()->set_power(Dtype(2));
+    square_layer_.reset(new PowerLayer<Dtype>(square_param));
+    square_layer_->SetUp(square_bottom_vec_, square_top_vec_);
+    // Set up pool_layer_ to sum over square neighborhoods of the input.
+    pool_top_vec_.clear();
+    pool_top_vec_.push_back(&pool_output_);
+    LayerParameter pool_param;
+    pool_param.mutable_pooling_param()->set_pool(
+        PoolingParameter_PoolMethod_AVE);
+    pool_param.mutable_pooling_param()->set_pad(pre_pad_);
+    pool_param.mutable_pooling_param()->set_kernel_size(size_);
+    pool_layer_.reset(new PoolingLayer<Dtype>(pool_param));
+    pool_layer_->SetUp(square_top_vec_, pool_top_vec_);
+    // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is
+    // the sum of a squared neighborhood (the output of pool_layer_).
+    power_top_vec_.clear();
+    power_top_vec_.push_back(&power_output_);
+    LayerParameter power_param;
+    power_param.mutable_power_param()->set_power(-beta_);
+    power_param.mutable_power_param()->set_scale(alpha_);
+    power_param.mutable_power_param()->set_shift(Dtype(1));
+    power_layer_.reset(new PowerLayer<Dtype>(power_param));
+    power_layer_->SetUp(pool_top_vec_, power_top_vec_);
+    // Set up a product_layer_ to compute outputs by multiplying inputs by the
+    // inverse demoninator computed by the power layer.
+    product_bottom_vec_.clear();
+    product_bottom_vec_.push_back(&product_input_);
+    product_bottom_vec_.push_back(&power_output_);
+    LayerParameter product_param;
+    EltwiseParameter* eltwise_param = product_param.mutable_eltwise_param();
+    eltwise_param->set_operation(EltwiseParameter_EltwiseOp_PROD);
+    product_layer_.reset(new EltwiseLayer<Dtype>(product_param));
+    product_layer_->SetUp(product_bottom_vec_, top);
+  }
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-			<< "corresponding to (num, channels, height, width)";
-	num_ = bottom[0]->num();
-	channels_ = bottom[0]->channels();
-	height_ = bottom[0]->height();
-	width_ = bottom[0]->width();
-	switch (this->layer_param_.lrn_param().norm_region()) {
-		case LRNParameter_NormRegion_ACROSS_CHANNELS:
-			top[0]->Reshape(num_, channels_, height_, width_);
-			scale_.Reshape(num_, channels_, height_, width_);
-			break;
-		case LRNParameter_NormRegion_WITHIN_CHANNEL:
-			split_layer_->Reshape(bottom, split_top_vec_);
-			square_layer_->Reshape(square_bottom_vec_, square_top_vec_);
-			pool_layer_->Reshape(square_top_vec_, pool_top_vec_);
-			power_layer_->Reshape(pool_top_vec_, power_top_vec_);
-			product_layer_->Reshape(product_bottom_vec_, top);
-			break;
-	}
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+      << "corresponding to (num, channels, height, width)";
+  num_ = bottom[0]->num();
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    top[0]->Reshape(num_, channels_, height_, width_);
+    scale_.Reshape(num_, channels_, height_, width_);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    split_layer_->Reshape(bottom, split_top_vec_);
+    square_layer_->Reshape(square_bottom_vec_, square_top_vec_);
+    pool_layer_->Reshape(square_top_vec_, pool_top_vec_);
+    power_layer_->Reshape(pool_top_vec_, power_top_vec_);
+    product_layer_->Reshape(product_bottom_vec_, top);
+    break;
+  }
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	switch (this->layer_param_.lrn_param().norm_region()) {
-		case LRNParameter_NormRegion_ACROSS_CHANNELS:
-			CrossChannelForward_cpu(bottom, top);
-			break;
-		case LRNParameter_NormRegion_WITHIN_CHANNEL:
-			WithinChannelForward(bottom, top);
-			break;
-		default:
-			LOG(FATAL) << "Unknown normalization region.";
-	}
+    const vector<Blob<Dtype>*>& top) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelForward_cpu(bottom, top);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelForward(bottom, top);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_cpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	Dtype* scale_data = scale_.mutable_cpu_data();
-	// start with the constant value
-	for (int i = 0; i < scale_.count(); ++i) {
-		scale_data[i] = k_;
-	}
-	Blob < Dtype > padded_square(1, channels_ + size_ - 1, height_, width_);
-	Dtype* padded_square_data = padded_square.mutable_cpu_data();
-	caffe_set(padded_square.count(), Dtype(0), padded_square_data);
-	Dtype alpha_over_size = alpha_ / size_;
-	// go through the images
-	for (int n = 0; n < num_; ++n) {
-		// compute the padded square
-		caffe_sqr(channels_ * height_ * width_,
-				bottom_data + bottom[0]->offset(n),
-				padded_square_data + padded_square.offset(0, pre_pad_));
-		// Create the first channel scale
-		for (int c = 0; c < size_; ++c) {
-			caffe_axpy < Dtype > (height_ * width_, alpha_over_size,
-					padded_square_data + padded_square.offset(0, c),
-					scale_data + scale_.offset(n, 0));
-		}
-		for (int c = 1; c < channels_; ++c) {
-			// copy previous scale
-			caffe_copy < Dtype > (height_ * width_,
-					scale_data + scale_.offset(n, c - 1),
-					scale_data + scale_.offset(n, c));
-			// add head
-			caffe_axpy < Dtype > (height_ * width_, alpha_over_size,
-					padded_square_data + padded_square.offset(0, c + size_ - 1),
-					scale_data + scale_.offset(n, c));
-			// subtract tail
-			caffe_axpy < Dtype > (height_ * width_, -alpha_over_size,
-					padded_square_data + padded_square.offset(0, c - 1),
-					scale_data + scale_.offset(n, c));
-		}
-	}
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  Dtype* scale_data = scale_.mutable_cpu_data();
+  // start with the constant value
+  for (int i = 0; i < scale_.count(); ++i) {
+    scale_data[i] = k_;
+  }
+  Blob < Dtype > padded_square(1, channels_ + size_ - 1, height_, width_);
+  Dtype* padded_square_data = padded_square.mutable_cpu_data();
+  caffe_set(padded_square.count(), Dtype(0), padded_square_data);
+  Dtype alpha_over_size = alpha_ / size_;
+  // go through the images
+  for (int n = 0; n < num_; ++n) {
+    // compute the padded square
+    caffe_sqr(channels_ * height_ * width_, bottom_data + bottom[0]->offset(n),
+        padded_square_data + padded_square.offset(0, pre_pad_));
+    // Create the first channel scale
+    for (int c = 0; c < size_; ++c) {
+      caffe_axpy < Dtype
+          > (height_ * width_, alpha_over_size, padded_square_data
+              + padded_square.offset(0, c), scale_data + scale_.offset(n, 0));
+    }
+    for (int c = 1; c < channels_; ++c) {
+      // copy previous scale
+      caffe_copy < Dtype
+          > (height_ * width_, scale_data + scale_.offset(n, c - 1), scale_data
+              + scale_.offset(n, c));
+      // add head
+      caffe_axpy < Dtype
+          > (height_ * width_, alpha_over_size, padded_square_data
+              + padded_square.offset(0, c + size_ - 1), scale_data
+              + scale_.offset(n, c));
+      // subtract tail
+      caffe_axpy < Dtype
+          > (height_ * width_, -alpha_over_size, padded_square_data
+              + padded_square.offset(0, c - 1), scale_data + scale_.offset(n, c));
+    }
+  }
 
-	// In the end, compute output
-	caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, top_data);
-	caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data);
+  // In the end, compute output
+  caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, top_data);
+  caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data);
 }
 
 template <typename Dtype>
-void LRNLayer<Dtype>::WithinChannelForward(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	split_layer_->Forward(bottom, split_top_vec_);
-	square_layer_->Forward(square_bottom_vec_, square_top_vec_);
-	pool_layer_->Forward(square_top_vec_, pool_top_vec_);
-	power_layer_->Forward(pool_top_vec_, power_top_vec_);
-	product_layer_->Forward(product_bottom_vec_, top);
+void LRNLayer<Dtype>::WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  split_layer_->Forward(bottom, split_top_vec_);
+  square_layer_->Forward(square_bottom_vec_, square_top_vec_);
+  pool_layer_->Forward(square_top_vec_, pool_top_vec_);
+  power_layer_->Forward(pool_top_vec_, power_top_vec_);
+  product_layer_->Forward(product_bottom_vec_, top);
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	switch (this->layer_param_.lrn_param().norm_region()) {
-		case LRNParameter_NormRegion_ACROSS_CHANNELS:
-			CrossChannelBackward_cpu(top, propagate_down, bottom);
-			break;
-		case LRNParameter_NormRegion_WITHIN_CHANNEL:
-			WithinChannelBackward(top, propagate_down, bottom);
-			break;
-		default:
-			LOG(FATAL) << "Unknown normalization region.";
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelBackward_cpu(top, propagate_down, bottom);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelBackward(top, propagate_down, bottom);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
 }
 
 template <typename Dtype>
-void LRNLayer<Dtype>::CrossChannelBackward_cpu(
-		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* top_diff = top[0]->cpu_diff();
-	const Dtype* top_data = top[0]->cpu_data();
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	const Dtype* scale_data = scale_.cpu_data();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	Blob < Dtype > padded_ratio(1, channels_ + size_ - 1, height_, width_);
-	Blob < Dtype > accum_ratio(1, 1, height_, width_);
-	Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data();
-	Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data();
-	// We hack a little bit by using the diff() to store an additional result
-	Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();
-	caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);
-	Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;
+void LRNLayer<Dtype>::CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->cpu_diff();
+  const Dtype* top_data = top[0]->cpu_data();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* scale_data = scale_.cpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  Blob < Dtype > padded_ratio(1, channels_ + size_ - 1, height_, width_);
+  Blob < Dtype > accum_ratio(1, 1, height_, width_);
+  Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data();
+  Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data();
+  // We hack a little bit by using the diff() to store an additional result
+  Dtype* accum_ratio_times_bottom = accum_ratio.mutable_cpu_diff();
+  caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data);
+  Dtype cache_ratio_value = 2. * alpha_ * beta_ / size_;
 
-	caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, bottom_diff);
-	caffe_mul < Dtype > (scale_.count(), top_diff, bottom_diff, bottom_diff);
+  caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, bottom_diff);
+  caffe_mul < Dtype > (scale_.count(), top_diff, bottom_diff, bottom_diff);
 
-	// go through individual data
-	int inverse_pre_pad = size_ - (size_ + 1) / 2;
-	for (int n = 0; n < num_; ++n) {
-		int block_offset = scale_.offset(n);
-		// first, compute diff_i * y_i / s_i
-		caffe_mul < Dtype > (channels_ * height_ * width_,
-				top_diff + block_offset, top_data + block_offset,
-				padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
-		caffe_div < Dtype > (channels_ * height_ * width_,
-				padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad),
-				scale_data + block_offset,
-				padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad));
-		// Now, compute the accumulated ratios and the bottom diff
-		caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);
-		for (int c = 0; c < size_ - 1; ++c) {
-			caffe_axpy < Dtype > (height_ * width_, 1.,
-					padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
-		}
-		for (int c = 0; c < channels_; ++c) {
-			caffe_axpy < Dtype > (height_ * width_, 1.,
-					padded_ratio_data + padded_ratio.offset(0, c + size_ - 1),
-					accum_ratio_data);
-			// compute bottom diff
-			caffe_mul < Dtype > (height_ * width_,
-					bottom_data + top[0]->offset(n, c),
-					accum_ratio_data, accum_ratio_times_bottom);
-			caffe_axpy < Dtype > (height_ * width_, -cache_ratio_value,
-					accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c));
-			caffe_axpy < Dtype > (height_ * width_, -1.,
-					padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data);
-		}
-	}
+  // go through individual data
+  int inverse_pre_pad = size_ - (size_ + 1) / 2;
+  for (int n = 0; n < num_; ++n) {
+    int block_offset = scale_.offset(n);
+    // first, compute diff_i * y_i / s_i
+    caffe_mul < Dtype
+        > (channels_ * height_ * width_, top_diff + block_offset, top_data
+            + block_offset, padded_ratio_data
+            + padded_ratio.offset(0, inverse_pre_pad));
+    caffe_div < Dtype
+        > (channels_ * height_ * width_, padded_ratio_data
+            + padded_ratio.offset(0, inverse_pre_pad), scale_data
+            + block_offset, padded_ratio_data
+            + padded_ratio.offset(0, inverse_pre_pad));
+    // Now, compute the accumulated ratios and the bottom diff
+    caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data);
+    for (int c = 0; c < size_ - 1; ++c) {
+      caffe_axpy < Dtype
+          > (height_ * width_, 1., padded_ratio_data
+              + padded_ratio.offset(0, c), accum_ratio_data);
+    }
+    for (int c = 0; c < channels_; ++c) {
+      caffe_axpy < Dtype
+          > (height_ * width_, 1., padded_ratio_data
+              + padded_ratio.offset(0, c + size_ - 1), accum_ratio_data);
+      // compute bottom diff
+      caffe_mul < Dtype
+          > (height_ * width_, bottom_data + top[0]->offset(n, c), accum_ratio_data, accum_ratio_times_bottom);
+      caffe_axpy < Dtype
+          > (height_ * width_, -cache_ratio_value, accum_ratio_times_bottom, bottom_diff
+              + top[0]->offset(n, c));
+      caffe_axpy < Dtype
+          > (height_ * width_, -1., padded_ratio_data
+              + padded_ratio.offset(0, c), accum_ratio_data);
+    }
+  }
 }
 
 template <typename Dtype>
-void LRNLayer<Dtype>::WithinChannelBackward(
-		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		vector<bool> product_propagate_down(2, true);
-		product_layer_->Backward(top, product_propagate_down, product_bottom_vec_);
-		power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_);
-		pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_);
-		square_layer_->Backward(square_top_vec_, propagate_down,
-				square_bottom_vec_);
-		split_layer_->Backward(split_top_vec_, propagate_down, bottom);
-	}
+void LRNLayer<Dtype>::WithinChannelBackward(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    vector<bool> product_propagate_down(2, true);
+    product_layer_->Backward(top, product_propagate_down, product_bottom_vec_);
+    power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_);
+    pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_);
+    square_layer_->Backward(square_top_vec_, propagate_down,
+        square_bottom_vec_);
+    split_layer_->Backward(split_top_vec_, propagate_down, bottom);
+  }
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_gpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	// First, compute scale
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	Dtype* scale_data = scale_.mutable_gpu_data();
-	// We will launch one kernel for each pixel location, and have the kernel
-	// go through all the channels.
-	int n_threads = num_ * height_ * width_;
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_,
-			alpha_ / size_, k_, scale_data);
-	n_threads = bottom[0]->count();
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data);
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  // First, compute scale
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  Dtype* scale_data = scale_.mutable_gpu_data();
+  // We will launch one kernel for each pixel location, and have the kernel
+  // go through all the channels.
+  int n_threads = num_ * height_ * width_;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_,
+      alpha_ / size_, k_, scale_data);
+  n_threads = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data);
 }
 
 template <typename Dtype>
-void LRNLayer<Dtype>::CrossChannelBackward_gpu(
-		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	int n_threads = num_ * height_ * width_;
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
-			scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
-			size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
-			bottom[0]->mutable_gpu_diff());
+void LRNLayer<Dtype>::CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  int n_threads = num_ * height_ * width_;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(),
+      scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_,
+      size_, -beta_, Dtype(2. * alpha_ * beta_ / size_),
+      bottom[0]->mutable_gpu_diff());
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	switch (this->layer_param_.lrn_param().norm_region()) {
-		case LRNParameter_NormRegion_ACROSS_CHANNELS:
-			CrossChannelForward_gpu(bottom, top);
-			break;
-		case LRNParameter_NormRegion_WITHIN_CHANNEL:
-			WithinChannelForward(bottom, top);
-			break;
-		default:
-			LOG(FATAL) << "Unknown normalization region.";
-	}
+    const vector<Blob<Dtype>*>& top) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelForward_gpu(bottom, top);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelForward(bottom, top);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
 }
 
 template <typename Dtype>
 void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	switch (this->layer_param_.lrn_param().norm_region()) {
-		case LRNParameter_NormRegion_ACROSS_CHANNELS:
-			CrossChannelBackward_gpu(top, propagate_down, bottom);
-			break;
-		case LRNParameter_NormRegion_WITHIN_CHANNEL:
-			WithinChannelBackward(top, propagate_down, bottom);
-			break;
-		default:
-			LOG(FATAL) << "Unknown normalization region.";
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  switch (this->layer_param_.lrn_param().norm_region()) {
+  case LRNParameter_NormRegion_ACROSS_CHANNELS:
+    CrossChannelBackward_gpu(top, propagate_down, bottom);
+    break;
+  case LRNParameter_NormRegion_WITHIN_CHANNEL:
+    WithinChannelBackward(top, propagate_down, bottom);
+    break;
+  default:
+    LOG(FATAL) << "Unknown normalization region.";
+  }
 }
 #ifdef CPU_ONLY
 STUB_GPU(LRNLayer);
diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp
index e3b12908..eff0129c 100644
--- a/src/caffe/layers/memory_data_layer.cpp
+++ b/src/caffe/layers/memory_data_layer.cpp
@@ -10,109 +10,109 @@ namespace caffe {
 
 template <typename Dtype>
 void MemoryDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	batch_size_ = this->layer_param_.memory_data_param().batch_size();
-	channels_ = this->layer_param_.memory_data_param().channels();
-	height_ = this->layer_param_.memory_data_param().height();
-	width_ = this->layer_param_.memory_data_param().width();
-	size_ = channels_ * height_ * width_;
-	CHECK_GT(batch_size_ * size_, 0) <<
-			"batch_size, channels, height, and width must be specified and"
-					" positive in memory_data_param";
-	vector<int> label_shape(1, batch_size_);
-	top[0]->Reshape(batch_size_, channels_, height_, width_);
-	top[1]->Reshape(label_shape);
-	added_data_.Reshape(batch_size_, channels_, height_, width_);
-	added_label_.Reshape(label_shape);
-	data_ = NULL;
-	labels_ = NULL;
-	added_data_.cpu_data();
-	added_label_.cpu_data();
+    const vector<Blob<Dtype>*>& top) {
+  batch_size_ = this->layer_param_.memory_data_param().batch_size();
+  channels_ = this->layer_param_.memory_data_param().channels();
+  height_ = this->layer_param_.memory_data_param().height();
+  width_ = this->layer_param_.memory_data_param().width();
+  size_ = channels_ * height_ * width_;
+  CHECK_GT(batch_size_ * size_, 0)
+      << "batch_size, channels, height, and width must be specified and"
+          " positive in memory_data_param";
+  vector<int> label_shape(1, batch_size_);
+  top[0]->Reshape(batch_size_, channels_, height_, width_);
+  top[1]->Reshape(label_shape);
+  added_data_.Reshape(batch_size_, channels_, height_, width_);
+  added_label_.Reshape(label_shape);
+  data_ = NULL;
+  labels_ = NULL;
+  added_data_.cpu_data();
+  added_label_.cpu_data();
 }
 
 template <typename Dtype>
 void MemoryDataLayer<Dtype>::AddDatumVector(const vector<Datum>& datum_vector) {
-	CHECK(!has_new_data_) <<
-			"Can't add data until current data has been consumed.";
-	size_t num = datum_vector.size();
-	CHECK_GT(num, 0) << "There is no datum to add.";
-	CHECK_EQ(num % batch_size_, 0) <<
-			"The added data must be a multiple of the batch size.";
-	added_data_.Reshape(num, channels_, height_, width_);
-	added_label_.Reshape(num, 1, 1, 1);
-	// Apply data transformations (mirror, scale, crop...)
-	this->data_transformer_->Transform(datum_vector, &added_data_);
-	// Copy Labels
-	Dtype* top_label = added_label_.mutable_cpu_data();
-	for (int item_id = 0; item_id < num; ++item_id) {
-		top_label[item_id] = datum_vector[item_id].label();
-	}
-	// num_images == batch_size_
-	Dtype* top_data = added_data_.mutable_cpu_data();
-	Reset(top_data, top_label, num);
-	has_new_data_ = true;
+  CHECK(!has_new_data_)
+      << "Can't add data until current data has been consumed.";
+  size_t num = datum_vector.size();
+  CHECK_GT(num, 0) << "There is no datum to add.";
+  CHECK_EQ(num % batch_size_, 0)
+      << "The added data must be a multiple of the batch size.";
+  added_data_.Reshape(num, channels_, height_, width_);
+  added_label_.Reshape(num, 1, 1, 1);
+  // Apply data transformations (mirror, scale, crop...)
+  this->data_transformer_->Transform(datum_vector, &added_data_);
+  // Copy Labels
+  Dtype* top_label = added_label_.mutable_cpu_data();
+  for (int item_id = 0; item_id < num; ++item_id) {
+    top_label[item_id] = datum_vector[item_id].label();
+  }
+  // num_images == batch_size_
+  Dtype* top_data = added_data_.mutable_cpu_data();
+  Reset(top_data, top_label, num);
+  has_new_data_ = true;
 }
 
 template <typename Dtype>
 void MemoryDataLayer<Dtype>::AddMatVector(const vector<cv::Mat>& mat_vector,
-		const vector<int>& labels) {
-	size_t num = mat_vector.size();
-	CHECK(!has_new_data_) <<
-			"Can't add mat until current data has been consumed.";
-	CHECK_GT(num, 0) << "There is no mat to add";
-	CHECK_EQ(num % batch_size_, 0) <<
-			"The added data must be a multiple of the batch size.";
-	added_data_.Reshape(num, channels_, height_, width_);
-	added_label_.Reshape(num, 1, 1, 1);
-	// Apply data transformations (mirror, scale, crop...)
-	this->data_transformer_->Transform(mat_vector, &added_data_);
-	// Copy Labels
-	Dtype* top_label = added_label_.mutable_cpu_data();
-	for (int item_id = 0; item_id < num; ++item_id) {
-		top_label[item_id] = labels[item_id];
-	}
-	// num_images == batch_size_
-	Dtype* top_data = added_data_.mutable_cpu_data();
-	Reset(top_data, top_label, num);
-	has_new_data_ = true;
+    const vector<int>& labels) {
+  size_t num = mat_vector.size();
+  CHECK(!has_new_data_)
+      << "Can't add mat until current data has been consumed.";
+  CHECK_GT(num, 0) << "There is no mat to add";
+  CHECK_EQ(num % batch_size_, 0)
+      << "The added data must be a multiple of the batch size.";
+  added_data_.Reshape(num, channels_, height_, width_);
+  added_label_.Reshape(num, 1, 1, 1);
+  // Apply data transformations (mirror, scale, crop...)
+  this->data_transformer_->Transform(mat_vector, &added_data_);
+  // Copy Labels
+  Dtype* top_label = added_label_.mutable_cpu_data();
+  for (int item_id = 0; item_id < num; ++item_id) {
+    top_label[item_id] = labels[item_id];
+  }
+  // num_images == batch_size_
+  Dtype* top_data = added_data_.mutable_cpu_data();
+  Reset(top_data, top_label, num);
+  has_new_data_ = true;
 }
 
 template <typename Dtype>
 void MemoryDataLayer<Dtype>::Reset(Dtype* data, Dtype* labels, int n) {
-	CHECK(data);
-	CHECK(labels);
-	CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size";
-	// Warn with transformation parameters since a memory array is meant to
-	// be generic and no transformations are done with Reset().
-	if (this->layer_param_.has_transform_param()) {
-		LOG(WARNING) << this->type() << " does not transform array data on Reset()";
-	}
-	data_ = data;
-	labels_ = labels;
-	n_ = n;
-	pos_ = 0;
+  CHECK(data);
+  CHECK(labels);
+  CHECK_EQ(n % batch_size_, 0) << "n must be a multiple of batch size";
+  // Warn with transformation parameters since a memory array is meant to
+  // be generic and no transformations are done with Reset().
+  if (this->layer_param_.has_transform_param()) {
+    LOG(WARNING) << this->type() << " does not transform array data on Reset()";
+  }
+  data_ = data;
+  labels_ = labels;
+  n_ = n;
+  pos_ = 0;
 }
 
 template <typename Dtype>
 void MemoryDataLayer<Dtype>::set_batch_size(int new_size) {
-	CHECK(!has_new_data_) <<
-			"Can't change batch_size until current data has been consumed.";
-	batch_size_ = new_size;
-	added_data_.Reshape(batch_size_, channels_, height_, width_);
-	added_label_.Reshape(batch_size_, 1, 1, 1);
+  CHECK(!has_new_data_)
+      << "Can't change batch_size until current data has been consumed.";
+  batch_size_ = new_size;
+  added_data_.Reshape(batch_size_, channels_, height_, width_);
+  added_label_.Reshape(batch_size_, 1, 1, 1);
 }
 
 template <typename Dtype>
 void MemoryDataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset";
-	top[0]->Reshape(batch_size_, channels_, height_, width_);
-	top[1]->Reshape(batch_size_, 1, 1, 1);
-	top[0]->set_cpu_data(data_ + pos_ * size_);
-	top[1]->set_cpu_data(labels_ + pos_);
-	pos_ = (pos_ + batch_size_) % n_;
-	if (pos_ == 0)
-		has_new_data_ = false;
+    const vector<Blob<Dtype>*>& top) {
+  CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset";
+  top[0]->Reshape(batch_size_, channels_, height_, width_);
+  top[1]->Reshape(batch_size_, 1, 1, 1);
+  top[0]->set_cpu_data(data_ + pos_ * size_);
+  top[1]->set_cpu_data(labels_ + pos_);
+  pos_ = (pos_ + batch_size_) % n_;
+  if (pos_ == 0)
+    has_new_data_ = false;
 }
 
 INSTANTIATE_CLASS (MemoryDataLayer);
diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp
index 358ed891..4d8b69bc 100644
--- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp
+++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp
@@ -12,53 +12,52 @@ namespace caffe {
 
 template <typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Reshape(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::Reshape(bottom, top);
-	CHECK_EQ(bottom[1]->channels(), 1);
-	CHECK_EQ(bottom[1]->height(), 1);
-	CHECK_EQ(bottom[1]->width(), 1);
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::Reshape(bottom, top);
+  CHECK_EQ(bottom[1]->channels(), 1);
+  CHECK_EQ(bottom[1]->height(), 1);
+  CHECK_EQ(bottom[1]->width(), 1);
 }
 
 template <typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Forward_cpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	const Dtype* bottom_label = bottom[1]->cpu_data();
-	int num = bottom[0]->num();
-	int dim = bottom[0]->count() / bottom[0]->num();
-	Dtype loss = 0;
-	for (int i = 0; i < num; ++i) {
-		int label = static_cast<int>(bottom_label[i]);
-		Dtype prob = std::max(
-				bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
-		loss -= log(prob);
-	}
-	top[0]->mutable_cpu_data()[0] = loss / num;
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* bottom_label = bottom[1]->cpu_data();
+  int num = bottom[0]->num();
+  int dim = bottom[0]->count() / bottom[0]->num();
+  Dtype loss = 0;
+  for (int i = 0; i < num; ++i) {
+    int label = static_cast<int>(bottom_label[i]);
+    Dtype prob = std::max(bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
+    loss -= log(prob);
+  }
+  top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
 template <typename Dtype>
 void MultinomialLogisticLossLayer<Dtype>::Backward_cpu(
-		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[1]) {
-		LOG(FATAL) << this->type()
-				<< " Layer cannot backpropagate to label inputs.";
-	}
-	if (propagate_down[0]) {
-		const Dtype* bottom_data = bottom[0]->cpu_data();
-		const Dtype* bottom_label = bottom[1]->cpu_data();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		int num = bottom[0]->num();
-		int dim = bottom[0]->count() / bottom[0]->num();
-		caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
-		const Dtype scale = -top[0]->cpu_diff()[0] / num;
-		for (int i = 0; i < num; ++i) {
-			int label = static_cast<int>(bottom_label[i]);
-			Dtype prob = std::max(
-					bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD));
-			bottom_diff[i * dim + label] = scale / prob;
-		}
-	}
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+        << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* bottom_label = bottom[1]->cpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    int num = bottom[0]->num();
+    int dim = bottom[0]->count() / bottom[0]->num();
+    caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
+    const Dtype scale = -top[0]->cpu_diff()[0] / num;
+    for (int i = 0; i < num; ++i) {
+      int label = static_cast<int>(bottom_label[i]);
+      Dtype prob = std::max(bottom_data[i * dim + label],
+          Dtype(kLOG_THRESHOLD));
+      bottom_diff[i * dim + label] = scale / prob;
+    }
+  }
 }
 
 INSTANTIATE_CLASS (MultinomialLogisticLossLayer);
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index 0a6613d7..64c3063f 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -9,245 +9,223 @@ namespace caffe {
 
 template <typename Dtype>
 void MVNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
-			bottom[0]->height(), bottom[0]->width());
-	mean_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-			1, 1);
-	variance_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-			1, 1);
-	temp_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-			bottom[0]->height(), bottom[0]->width());
-	sum_multiplier_.Reshape(1, 1,
-			bottom[0]->height(), bottom[0]->width());
-	Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
-	caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
-	eps_ = this->layer_param_.mvn_param().eps();
+    const vector<Blob<Dtype>*>& top) {
+  top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(),
+      bottom[0]->width());
+  mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
+  variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);
+  temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(),
+      bottom[0]->width());
+  sum_multiplier_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width());
+  Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
+  caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
+  eps_ = this->layer_param_.mvn_param().eps();
 }
 
 template <typename Dtype>
 void MVNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	int num;
-	if (this->layer_param_.mvn_param().across_channels())
-		num = bottom[0]->num();
-	else
-		num = bottom[0]->num() * bottom[0]->channels();
-
-	int dim = bottom[0]->count() / num;
-
-	if (this->layer_param_.mvn_param().normalize_variance()) {
-		// put the squares of bottom into temp_
-		caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
-				temp_.mutable_cpu_data());
-
-		// computes variance using var(X) = E(X^2) - (EX)^2
-		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
-				sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
-		caffe_cpu_gemv < Dtype
-				> (CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(),
-						sum_multiplier_.cpu_data(), 0.,
-						variance_.mutable_cpu_data());  // E(X^2)
-		caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
-				temp_.mutable_cpu_data());  // (EX)^2
-		caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
-				variance_.mutable_cpu_data());  // variance
-
-		// do mean and variance normalization
-		// subtract mean
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-				mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-				temp_.mutable_cpu_data());
-
-		caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
-
-		// normalize variance
-		caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
-				variance_.mutable_cpu_data());
-
-		caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
-
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-				variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-				temp_.mutable_cpu_data());
-
-		caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
-	} else {
-		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
-				sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());  // EX
-
-		// subtract mean
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-				mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-				temp_.mutable_cpu_data());
-
-		caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  int num;
+  if (this->layer_param_.mvn_param().across_channels())
+    num = bottom[0]->num();
+  else
+    num = bottom[0]->num() * bottom[0]->channels();
+
+  int dim = bottom[0]->count() / num;
+
+  if (this->layer_param_.mvn_param().normalize_variance()) {
+    // put the squares of bottom into temp_
+    caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
+        temp_.mutable_cpu_data());
+
+    // computes variance using var(X) = E(X^2) - (EX)^2
+    caffe_cpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX
+    caffe_cpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(), sum_multiplier_.cpu_data(), 0., variance_.mutable_cpu_data()); // E(X^2)
+    caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
+        temp_.mutable_cpu_data());  // (EX)^2
+    caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
+        variance_.mutable_cpu_data());  // variance
+
+    // do mean and variance normalization
+    // subtract mean
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
+
+    caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
+
+    // normalize variance
+    caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
+        variance_.mutable_cpu_data());
+
+    caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
+
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
+
+    caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
+  } else {
+    caffe_cpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX
+
+    // subtract mean
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
+
+    caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data);
+  }
 }
 
 template <typename Dtype>
 void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* top_diff = top[0]->cpu_diff();
-	const Dtype* top_data = top[0]->cpu_data();
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-
-	int num;
-	if (this->layer_param_.mvn_param().across_channels())
-		num = bottom[0]->num();
-	else
-		num = bottom[0]->num() * bottom[0]->channels();
-
-	int dim = bottom[0]->count() / num;
-
-	if (this->layer_param_.mvn_param().normalize_variance()) {
-		caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
-		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff,
-				sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-				mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-				bottom_diff);
-		caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
-
-		caffe_cpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff,
-				sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-				mean_.cpu_data(), sum_multiplier_.cpu_data(), 1.,
-				bottom_diff);
-
-		caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
-				bottom_diff);
-
-		// put the squares of bottom into temp_
-		caffe_powx(temp_.count(), bottom_data, Dtype(2),
-				temp_.mutable_cpu_data());
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-				variance_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
-				temp_.mutable_cpu_data());
-
-		caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
-	} else {
-		caffe_copy(temp_.count(), top_diff, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->cpu_diff();
+  const Dtype* top_data = top[0]->cpu_data();
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+
+  int num;
+  if (this->layer_param_.mvn_param().across_channels())
+    num = bottom[0]->num();
+  else
+    num = bottom[0]->num() * bottom[0]->channels();
+
+  int dim = bottom[0]->count() / num;
+
+  if (this->layer_param_.mvn_param().normalize_variance()) {
+    caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
+    caffe_cpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1., bottom_diff, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., bottom_diff);
+    caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
+
+    caffe_cpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1., top_diff, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., bottom_diff);
+
+    caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
+        bottom_diff);
+
+    // put the squares of bottom into temp_
+    caffe_powx(temp_.count(), bottom_data, Dtype(2), temp_.mutable_cpu_data());
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
+
+    caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
+  } else {
+    caffe_copy(temp_.count(), top_diff, bottom_diff);
+  }
 }
 
 template <typename Dtype>
 void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	int num;
-	if (this->layer_param_.mvn_param().across_channels())
-		num = bottom[0]->num();
-	else
-		num = bottom[0]->num() * bottom[0]->channels();
-
-	int dim = bottom[0]->count() / num;
-
-	if (this->layer_param_.mvn_param().normalize_variance()) {
-		// put the squares of bottom into temp_
-		caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
-				temp_.mutable_gpu_data());
-
-		// computes variance using var(X) = E(X^2) - (EX)^2
-		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
-				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-		caffe_gpu_gemv < Dtype
-				> (CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(),
-						sum_multiplier_.gpu_data(), 0.,
-						variance_.mutable_gpu_data());  // E(X^2)
-		caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
-				temp_.mutable_gpu_data());  // (EX)^2
-		caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
-				variance_.mutable_gpu_data());  // variance
-
-		// do mean and variance normalization
-		// subtract mean
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-				mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-				temp_.mutable_gpu_data());
-
-		caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
-
-		// normalize variance
-		caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
-				variance_.mutable_gpu_data());
-
-		caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
-
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-				variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-				temp_.mutable_gpu_data());
-
-		caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
-	} else {
-		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, bottom_data,
-				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());  // EX
-
-		// subtract mean
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-				mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-				temp_.mutable_gpu_data());
-
-		caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  int num;
+  if (this->layer_param_.mvn_param().across_channels())
+    num = bottom[0]->num();
+  else
+    num = bottom[0]->num() * bottom[0]->channels();
+
+  int dim = bottom[0]->count() / num;
+
+  if (this->layer_param_.mvn_param().normalize_variance()) {
+    // put the squares of bottom into temp_
+    caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
+        temp_.mutable_gpu_data());
+
+    // computes variance using var(X) = E(X^2) - (EX)^2
+    caffe_gpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX
+    caffe_gpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(), sum_multiplier_.gpu_data(), 0., variance_.mutable_gpu_data()); // E(X^2)
+    caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
+        temp_.mutable_gpu_data());  // (EX)^2
+    caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
+        variance_.mutable_gpu_data());  // variance
+
+    // do mean and variance normalization
+    // subtract mean
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data());
+
+    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
+
+    // normalize variance
+    caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
+        variance_.mutable_gpu_data());
+
+    caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
+
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data());
+
+    caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
+  } else {
+    caffe_gpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX
+
+    // subtract mean
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data());
+
+    caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data);
+  }
 }
 
 template <typename Dtype>
 void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* top_diff = top[0]->gpu_diff();
-	const Dtype* top_data = top[0]->gpu_data();
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-
-	int num;
-	if (this->layer_param_.mvn_param().across_channels())
-		num = bottom[0]->num();
-	else
-		num = bottom[0]->num() * bottom[0]->channels();
-
-	int dim = bottom[0]->count() / num;
-
-	if (this->layer_param_.mvn_param().normalize_variance()) {
-		caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
-		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., bottom_diff,
-				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-				mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-				bottom_diff);
-		caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
-
-		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1., top_diff,
-				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-				mean_.gpu_data(), sum_multiplier_.gpu_data(), 1.,
-				bottom_diff);
-
-		caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
-				bottom_diff);
-
-		// put the squares of bottom into temp_
-		caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2),
-				temp_.mutable_gpu_data());
-
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1.,
-				variance_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-				temp_.mutable_gpu_data());
-
-		caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
-	} else {
-		caffe_gpu_gemv < Dtype > (CblasNoTrans, num, dim, 1. / dim, top_diff,
-				sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
-		caffe_gpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
-				mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
-				temp_.mutable_gpu_data());
-		caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* top_data = top[0]->gpu_data();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+
+  int num;
+  if (this->layer_param_.mvn_param().across_channels())
+    num = bottom[0]->num();
+  else
+    num = bottom[0]->num() * bottom[0]->channels();
+
+  int dim = bottom[0]->count() / num;
+
+  if (this->layer_param_.mvn_param().normalize_variance()) {
+    caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
+    caffe_gpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1., bottom_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., bottom_diff);
+    caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
+
+    caffe_gpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1., top_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., bottom_diff);
+
+    caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim),
+        bottom_diff);
+
+    // put the squares of bottom into temp_
+    caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2),
+        temp_.mutable_gpu_data());
+
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data());
+
+    caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
+  } else {
+    caffe_gpu_gemv < Dtype
+        > (CblasNoTrans, num, dim, 1. / dim, top_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+    caffe_gpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data());
+    caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/neuron_layer.cpp b/src/caffe/layers/neuron_layer.cpp
index a9edeffd..4fa61aad 100644
--- a/src/caffe/layers/neuron_layer.cpp
+++ b/src/caffe/layers/neuron_layer.cpp
@@ -7,8 +7,8 @@ namespace caffe {
 
 template <typename Dtype>
 void NeuronLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	top[0]->ReshapeLike(*bottom[0]);
+    const vector<Blob<Dtype>*>& top) {
+  top[0]->ReshapeLike(*bottom[0]);
 }
 
 INSTANTIATE_CLASS (NeuronLayer);
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 47830228..85c57379 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -15,397 +15,397 @@ using std::max;
 
 template <typename Dtype>
 void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	PoolingParameter pool_param = this->layer_param_.pooling_param();
-	if (pool_param.global_pooling()) {
-		CHECK(!(pool_param.has_kernel_size() ||
-				pool_param.has_kernel_h() || pool_param.has_kernel_w()))
-				<< "With Global_pooling: true Filter size cannot specified";
-	} else {
-		CHECK(!pool_param.has_kernel_size() !=
-				!(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
-				<< "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-		CHECK(pool_param.has_kernel_size() ||
-				(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
-				<< "For non-square filters both kernel_h and kernel_w are required.";
-	}
-	CHECK((!pool_param.has_pad() && pool_param.has_pad_h()
-			&& pool_param.has_pad_w())
-			|| (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
-			<< "pad is pad OR pad_h and pad_w are required.";
-	CHECK((!pool_param.has_stride() && pool_param.has_stride_h()
-			&& pool_param.has_stride_w())
-			|| (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
-			<< "Stride is stride OR stride_h and stride_w are required.";
-	global_pooling_ = pool_param.global_pooling();
-	if (global_pooling_) {
-		kernel_h_ = bottom[0]->height();
-		kernel_w_ = bottom[0]->width();
-	} else {
-		if (pool_param.has_kernel_size()) {
-			kernel_h_ = kernel_w_ = pool_param.kernel_size();
-		} else {
-			kernel_h_ = pool_param.kernel_h();
-			kernel_w_ = pool_param.kernel_w();
-		}
-	}
-	CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-	CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-	if (!pool_param.has_pad_h()) {
-		pad_h_ = pad_w_ = pool_param.pad();
-	} else {
-		pad_h_ = pool_param.pad_h();
-		pad_w_ = pool_param.pad_w();
-	}
-	if (!pool_param.has_stride_h()) {
-		stride_h_ = stride_w_ = pool_param.stride();
-	} else {
-		stride_h_ = pool_param.stride_h();
-		stride_w_ = pool_param.stride_w();
-	}
-	if (global_pooling_) {
-		CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
-				<< "With Global_pooling: true; only pad = 0 and stride = 1";
-	}
-	if (pad_h_ != 0 || pad_w_ != 0) {
-		CHECK(this->layer_param_.pooling_param().pool()
-				== PoolingParameter_PoolMethod_AVE
-				|| this->layer_param_.pooling_param().pool()
-						== PoolingParameter_PoolMethod_MAX)
-				<< "Padding implemented only for average and max pooling.";
-		CHECK_LT(pad_h_, kernel_h_);
-		CHECK_LT(pad_w_, kernel_w_);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  PoolingParameter pool_param = this->layer_param_.pooling_param();
+  if (pool_param.global_pooling()) {
+    CHECK(
+        !(pool_param.has_kernel_size() || pool_param.has_kernel_h()
+            || pool_param.has_kernel_w()))
+        << "With Global_pooling: true Filter size cannot specified";
+  } else {
+    CHECK(
+        !pool_param.has_kernel_size()
+            != !(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+        << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+    CHECK(
+        pool_param.has_kernel_size()
+            || (pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+        << "For non-square filters both kernel_h and kernel_w are required.";
+  }
+  CHECK(
+      (!pool_param.has_pad() && pool_param.has_pad_h() && pool_param.has_pad_w())
+          || (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+  CHECK(
+      (!pool_param.has_stride() && pool_param.has_stride_h()
+          && pool_param.has_stride_w())
+          || (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+  global_pooling_ = pool_param.global_pooling();
+  if (global_pooling_) {
+    kernel_h_ = bottom[0]->height();
+    kernel_w_ = bottom[0]->width();
+  } else {
+    if (pool_param.has_kernel_size()) {
+      kernel_h_ = kernel_w_ = pool_param.kernel_size();
+    } else {
+      kernel_h_ = pool_param.kernel_h();
+      kernel_w_ = pool_param.kernel_w();
+    }
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+  if (!pool_param.has_pad_h()) {
+    pad_h_ = pad_w_ = pool_param.pad();
+  } else {
+    pad_h_ = pool_param.pad_h();
+    pad_w_ = pool_param.pad_w();
+  }
+  if (!pool_param.has_stride_h()) {
+    stride_h_ = stride_w_ = pool_param.stride();
+  } else {
+    stride_h_ = pool_param.stride_h();
+    stride_w_ = pool_param.stride_w();
+  }
+  if (global_pooling_) {
+    CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+        << "With Global_pooling: true; only pad = 0 and stride = 1";
+  }
+  if (pad_h_ != 0 || pad_w_ != 0) {
+    CHECK(
+        this->layer_param_.pooling_param().pool()
+            == PoolingParameter_PoolMethod_AVE
+            || this->layer_param_.pooling_param().pool()
+                == PoolingParameter_PoolMethod_MAX)
+        << "Padding implemented only for average and max pooling.";
+    CHECK_LT(pad_h_, kernel_h_);
+    CHECK_LT(pad_w_, kernel_w_);
+  }
 }
 
 template <typename Dtype>
 void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-			<< "corresponding to (num, channels, height, width)";
-	channels_ = bottom[0]->channels();
-	height_ = bottom[0]->height();
-	width_ = bottom[0]->width();
-	if (global_pooling_) {
-		kernel_h_ = bottom[0]->height();
-		kernel_w_ = bottom[0]->width();
-	}
-	pooled_height_ = static_cast<int>(ceil(static_cast<float>(
-			height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
-	pooled_width_ = static_cast<int>(ceil(static_cast<float>(
-			width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
-	if (pad_h_ || pad_w_) {
-		// If we have padding, ensure that the last pooling starts strictly
-		// inside the image (instead of at the padding); otherwise clip the last.
-		if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) {
-			--pooled_height_;
-		}
-		if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) {
-			--pooled_width_;
-		}
-		CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_);
-		CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_);
-	}
-	top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_,
-			pooled_width_);
-	if (top.size() > 1) {
-		top[1]->ReshapeLike(*top[0]);
-	}
-	// If max pooling, we will initialize the vector index part.
-	if (this->layer_param_.pooling_param().pool() ==
-			PoolingParameter_PoolMethod_MAX && top.size() == 1) {
-		max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-				pooled_width_);
-	}
-	// If stochastic pooling, we will initialize the random index part.
-	if (this->layer_param_.pooling_param().pool() ==
-			PoolingParameter_PoolMethod_STOCHASTIC) {
-		rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-				pooled_width_);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+      << "corresponding to (num, channels, height, width)";
+  channels_ = bottom[0]->channels();
+  height_ = bottom[0]->height();
+  width_ = bottom[0]->width();
+  if (global_pooling_) {
+    kernel_h_ = bottom[0]->height();
+    kernel_w_ = bottom[0]->width();
+  }
+  pooled_height_ = static_cast<int>(ceil(
+      static_cast<float>(height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
+  pooled_width_ = static_cast<int>(ceil(
+      static_cast<float>(width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
+  if (pad_h_ || pad_w_) {
+    // If we have padding, ensure that the last pooling starts strictly
+    // inside the image (instead of at the padding); otherwise clip the last.
+    if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) {
+      --pooled_height_;
+    }
+    if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) {
+      --pooled_width_;
+    }
+    CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_);
+    CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_);
+  }
+  top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, pooled_width_);
+  if (top.size() > 1) {
+    top[1]->ReshapeLike(*top[0]);
+  }
+  // If max pooling, we will initialize the vector index part.
+  if (this->layer_param_.pooling_param().pool()
+      == PoolingParameter_PoolMethod_MAX && top.size() == 1) {
+    max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
+        pooled_width_);
+  }
+  // If stochastic pooling, we will initialize the random index part.
+  if (this->layer_param_.pooling_param().pool()
+      == PoolingParameter_PoolMethod_STOCHASTIC) {
+    rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
+        pooled_width_);
+  }
 }
 
 // TODO(Yangqing): Is there a faster way to do pooling in the channel-first
 // case?
 template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const int top_count = top[0]->count();
-	// We'll output the mask to top[1] if it's of size >1.
-	const bool use_top_mask = top.size() > 1;
-	int* mask = NULL;  // suppress warnings about uninitalized variables
-	Dtype* top_mask = NULL;
-	// Different pooling methods. We explicitly do the switch outside the for
-	// loop to save time, although this results in more code.
-	switch (this->layer_param_.pooling_param().pool()) {
-		case PoolingParameter_PoolMethod_MAX:
-			// Initialize
-			if (use_top_mask) {
-				top_mask = top[1]->mutable_cpu_data();
-				caffe_set(top_count, Dtype(-1), top_mask);
-			} else {
-				mask = max_idx_.mutable_cpu_data();
-				caffe_set(top_count, -1, mask);
-			}
-			caffe_set(top_count, Dtype(-FLT_MAX), top_data);
-			// The main loop
-			for (int n = 0; n < bottom[0]->num(); ++n) {
-				for (int c = 0; c < channels_; ++c) {
-					for (int ph = 0; ph < pooled_height_; ++ph) {
-						for (int pw = 0; pw < pooled_width_; ++pw) {
-							int hstart = ph * stride_h_ - pad_h_;
-							int wstart = pw * stride_w_ - pad_w_;
-							int hend = min(hstart + kernel_h_, height_);
-							int wend = min(wstart + kernel_w_, width_);
-							hstart = max(hstart, 0);
-							wstart = max(wstart, 0);
-							const int pool_index = ph * pooled_width_ + pw;
-							for (int h = hstart; h < hend; ++h) {
-								for (int w = wstart; w < wend; ++w) {
-									const int index = h * width_ + w;
-									if (bottom_data[index] > top_data[pool_index]) {
-										top_data[pool_index] = bottom_data[index];
-										if (use_top_mask) {
-											top_mask[pool_index] = static_cast<Dtype>(index);
-										} else {
-											mask[pool_index] = index;
-										}
-									}
-								}
-							}
-						}
-					}
-					// compute offset
-					bottom_data += bottom[0]->offset(0, 1);
-					top_data += top[0]->offset(0, 1);
-					if (use_top_mask) {
-						top_mask += top[0]->offset(0, 1);
-					} else {
-						mask += top[0]->offset(0, 1);
-					}
-				}
-			}
-			break;
-		case PoolingParameter_PoolMethod_AVE:
-			for (int i = 0; i < top_count; ++i) {
-				top_data[i] = 0;
-			}
-			// The main loop
-			for (int n = 0; n < bottom[0]->num(); ++n) {
-				for (int c = 0; c < channels_; ++c) {
-					for (int ph = 0; ph < pooled_height_; ++ph) {
-						for (int pw = 0; pw < pooled_width_; ++pw) {
-							int hstart = ph * stride_h_ - pad_h_;
-							int wstart = pw * stride_w_ - pad_w_;
-							int hend = min(hstart + kernel_h_, height_ + pad_h_);
-							int wend = min(wstart + kernel_w_, width_ + pad_w_);
-							int pool_size = (hend - hstart) * (wend - wstart);
-							hstart = max(hstart, 0);
-							wstart = max(wstart, 0);
-							hend = min(hend, height_);
-							wend = min(wend, width_);
-							for (int h = hstart; h < hend; ++h) {
-								for (int w = wstart; w < wend; ++w) {
-									top_data[ph * pooled_width_ + pw] +=
-											bottom_data[h * width_ + w];
-								}
-							}
-							top_data[ph * pooled_width_ + pw] /= pool_size;
-						}
-					}
-					// compute offset
-					bottom_data += bottom[0]->offset(0, 1);
-					top_data += top[0]->offset(0, 1);
-				}
-			}
-			break;
-		case PoolingParameter_PoolMethod_STOCHASTIC:
-			NOT_IMPLEMENTED;
-			break;
-		default:
-			LOG(FATAL) << "Unknown pooling method.";
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int top_count = top[0]->count();
+  // We'll output the mask to top[1] if it's of size >1.
+  const bool use_top_mask = top.size() > 1;
+  int* mask = NULL;  // suppress warnings about uninitalized variables
+  Dtype* top_mask = NULL;
+  // Different pooling methods. We explicitly do the switch outside the for
+  // loop to save time, although this results in more code.
+  switch (this->layer_param_.pooling_param().pool()) {
+  case PoolingParameter_PoolMethod_MAX:
+    // Initialize
+    if (use_top_mask) {
+      top_mask = top[1]->mutable_cpu_data();
+      caffe_set(top_count, Dtype(-1), top_mask);
+    } else {
+      mask = max_idx_.mutable_cpu_data();
+      caffe_set(top_count, -1, mask);
+    }
+    caffe_set(top_count, Dtype(-FLT_MAX), top_data);
+    // The main loop
+    for (int n = 0; n < bottom[0]->num(); ++n) {
+      for (int c = 0; c < channels_; ++c) {
+        for (int ph = 0; ph < pooled_height_; ++ph) {
+          for (int pw = 0; pw < pooled_width_; ++pw) {
+            int hstart = ph * stride_h_ - pad_h_;
+            int wstart = pw * stride_w_ - pad_w_;
+            int hend = min(hstart + kernel_h_, height_);
+            int wend = min(wstart + kernel_w_, width_);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            const int pool_index = ph * pooled_width_ + pw;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                const int index = h * width_ + w;
+                if (bottom_data[index] > top_data[pool_index]) {
+                  top_data[pool_index] = bottom_data[index];
+                  if (use_top_mask) {
+                    top_mask[pool_index] = static_cast<Dtype>(index);
+                  } else {
+                    mask[pool_index] = index;
+                  }
+                }
+              }
+            }
+          }
+        }
+        // compute offset
+        bottom_data += bottom[0]->offset(0, 1);
+        top_data += top[0]->offset(0, 1);
+        if (use_top_mask) {
+          top_mask += top[0]->offset(0, 1);
+        } else {
+          mask += top[0]->offset(0, 1);
+        }
+      }
+    }
+    break;
+  case PoolingParameter_PoolMethod_AVE:
+    for (int i = 0; i < top_count; ++i) {
+      top_data[i] = 0;
+    }
+    // The main loop
+    for (int n = 0; n < bottom[0]->num(); ++n) {
+      for (int c = 0; c < channels_; ++c) {
+        for (int ph = 0; ph < pooled_height_; ++ph) {
+          for (int pw = 0; pw < pooled_width_; ++pw) {
+            int hstart = ph * stride_h_ - pad_h_;
+            int wstart = pw * stride_w_ - pad_w_;
+            int hend = min(hstart + kernel_h_, height_ + pad_h_);
+            int wend = min(wstart + kernel_w_, width_ + pad_w_);
+            int pool_size = (hend - hstart) * (wend - wstart);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            hend = min(hend, height_);
+            wend = min(wend, width_);
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                top_data[ph * pooled_width_ + pw] +=
+                    bottom_data[h * width_ + w];
+              }
+            }
+            top_data[ph * pooled_width_ + pw] /= pool_size;
+          }
+        }
+        // compute offset
+        bottom_data += bottom[0]->offset(0, 1);
+        top_data += top[0]->offset(0, 1);
+      }
+    }
+    break;
+  case PoolingParameter_PoolMethod_STOCHASTIC:
+    NOT_IMPLEMENTED;
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
 }
 
 template <typename Dtype>
 void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	const Dtype* top_diff = top[0]->cpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	// Different pooling methods. We explicitly do the switch outside the for
-	// loop to save time, although this results in more codes.
-	caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
-	// We'll output the mask to top[1] if it's of size >1.
-	const bool use_top_mask = top.size() > 1;
-	const int* mask = NULL;  // suppress warnings about uninitialized variables
-	const Dtype* top_mask = NULL;
-	switch (this->layer_param_.pooling_param().pool()) {
-		case PoolingParameter_PoolMethod_MAX:
-			// The main loop
-			if (use_top_mask) {
-				top_mask = top[1]->cpu_data();
-			} else {
-				mask = max_idx_.cpu_data();
-			}
-			for (int n = 0; n < top[0]->num(); ++n) {
-				for (int c = 0; c < channels_; ++c) {
-					for (int ph = 0; ph < pooled_height_; ++ph) {
-						for (int pw = 0; pw < pooled_width_; ++pw) {
-							const int index = ph * pooled_width_ + pw;
-							const int bottom_index =
-									use_top_mask ? top_mask[index] : mask[index];
-							bottom_diff[bottom_index] += top_diff[index];
-						}
-					}
-					bottom_diff += bottom[0]->offset(0, 1);
-					top_diff += top[0]->offset(0, 1);
-					if (use_top_mask) {
-						top_mask += top[0]->offset(0, 1);
-					} else {
-						mask += top[0]->offset(0, 1);
-					}
-				}
-			}
-			break;
-		case PoolingParameter_PoolMethod_AVE:
-			// The main loop
-			for (int n = 0; n < top[0]->num(); ++n) {
-				for (int c = 0; c < channels_; ++c) {
-					for (int ph = 0; ph < pooled_height_; ++ph) {
-						for (int pw = 0; pw < pooled_width_; ++pw) {
-							int hstart = ph * stride_h_ - pad_h_;
-							int wstart = pw * stride_w_ - pad_w_;
-							int hend = min(hstart + kernel_h_, height_ + pad_h_);
-							int wend = min(wstart + kernel_w_, width_ + pad_w_);
-							int pool_size = (hend - hstart) * (wend - wstart);
-							hstart = max(hstart, 0);
-							wstart = max(wstart, 0);
-							hend = min(hend, height_);
-							wend = min(wend, width_);
-							for (int h = hstart; h < hend; ++h) {
-								for (int w = wstart; w < wend; ++w) {
-									bottom_diff[h * width_ + w] +=
-											top_diff[ph * pooled_width_ + pw] / pool_size;
-								}
-							}
-						}
-					}
-					// offset
-					bottom_diff += bottom[0]->offset(0, 1);
-					top_diff += top[0]->offset(0, 1);
-				}
-			}
-			break;
-		case PoolingParameter_PoolMethod_STOCHASTIC:
-			NOT_IMPLEMENTED;
-			break;
-		default:
-			LOG(FATAL) << "Unknown pooling method.";
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  // Different pooling methods. We explicitly do the switch outside the for
+  // loop to save time, although this results in more codes.
+  caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
+  // We'll output the mask to top[1] if it's of size >1.
+  const bool use_top_mask = top.size() > 1;
+  const int* mask = NULL;  // suppress warnings about uninitialized variables
+  const Dtype* top_mask = NULL;
+  switch (this->layer_param_.pooling_param().pool()) {
+  case PoolingParameter_PoolMethod_MAX:
+    // The main loop
+    if (use_top_mask) {
+      top_mask = top[1]->cpu_data();
+    } else {
+      mask = max_idx_.cpu_data();
+    }
+    for (int n = 0; n < top[0]->num(); ++n) {
+      for (int c = 0; c < channels_; ++c) {
+        for (int ph = 0; ph < pooled_height_; ++ph) {
+          for (int pw = 0; pw < pooled_width_; ++pw) {
+            const int index = ph * pooled_width_ + pw;
+            const int bottom_index =
+                use_top_mask ? top_mask[index] : mask[index];
+            bottom_diff[bottom_index] += top_diff[index];
+          }
+        }
+        bottom_diff += bottom[0]->offset(0, 1);
+        top_diff += top[0]->offset(0, 1);
+        if (use_top_mask) {
+          top_mask += top[0]->offset(0, 1);
+        } else {
+          mask += top[0]->offset(0, 1);
+        }
+      }
+    }
+    break;
+  case PoolingParameter_PoolMethod_AVE:
+    // The main loop
+    for (int n = 0; n < top[0]->num(); ++n) {
+      for (int c = 0; c < channels_; ++c) {
+        for (int ph = 0; ph < pooled_height_; ++ph) {
+          for (int pw = 0; pw < pooled_width_; ++pw) {
+            int hstart = ph * stride_h_ - pad_h_;
+            int wstart = pw * stride_w_ - pad_w_;
+            int hend = min(hstart + kernel_h_, height_ + pad_h_);
+            int wend = min(wstart + kernel_w_, width_ + pad_w_);
+            int pool_size = (hend - hstart) * (wend - wstart);
+            hstart = max(hstart, 0);
+            wstart = max(wstart, 0);
+            hend = min(hend, height_);
+            wend = min(wend, width_);
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                bottom_diff[h * width_ + w] += top_diff[ph * pooled_width_ + pw]
+                    / pool_size;
+              }
+            }
+          }
+        }
+        // offset
+        bottom_diff += bottom[0]->offset(0, 1);
+        top_diff += top[0]->offset(0, 1);
+      }
+    }
+    break;
+  case PoolingParameter_PoolMethod_STOCHASTIC:
+    NOT_IMPLEMENTED;
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	//Forward_cpu(bottom, top);
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	int count = top[0]->count();
-	// We'll output the mask to top[1] if it's of size >1.
-	const bool use_top_mask = top.size() > 1;
-	int* mask = NULL;
-	Dtype* top_mask = NULL;
-	switch (this->layer_param_.pooling_param().pool()) {
-		case PoolingParameter_PoolMethod_MAX:
-			if (use_top_mask) {
-				top_mask = top[1]->mutable_gpu_data();
-			} else {
-				mask = max_idx_.mutable_gpu_data();
-			}
-			// NOLINT_NEXT_LINE(whitespace/operators)
-			MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_,
-					height_, width_, pooled_height_, pooled_width_, kernel_h_,
-					kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data,
-					mask, top_mask);
-			break;
-		case PoolingParameter_PoolMethod_AVE:
-			// NOLINT_NEXT_LINE(whitespace/operators)
-			AvePoolForward(count, bottom_data, bottom[0]->num(), channels_,
-					height_, width_, pooled_height_, pooled_width_, kernel_h_,
-					kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data);
-			break;
-		case PoolingParameter_PoolMethod_STOCHASTIC:
-			if (this->phase_ == TRAIN) {
-				// We need to create the random index as well.
-				caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
-						rand_idx_.mutable_gpu_data());
-				// NOLINT_NEXT_LINE(whitespace/operators)
-				StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_,
-						height_, width_, pooled_height_, pooled_width_, kernel_h_,
-						kernel_w_, stride_h_, stride_w_,
-						rand_idx_.mutable_gpu_data(), top_data);
-			} else {
-				// NOLINT_NEXT_LINE(whitespace/operators)
-				StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_,
-						height_, width_, pooled_height_, pooled_width_, kernel_h_,
-						kernel_w_, stride_h_, stride_w_, top_data);
-			}
-			break;
-		default:
-			LOG(FATAL) << "Unknown pooling method.";
-	}
+    const vector<Blob<Dtype>*>& top) {
+  //Forward_cpu(bottom, top);
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  int count = top[0]->count();
+  // We'll output the mask to top[1] if it's of size >1.
+  const bool use_top_mask = top.size() > 1;
+  int* mask = NULL;
+  Dtype* top_mask = NULL;
+  switch (this->layer_param_.pooling_param().pool()) {
+  case PoolingParameter_PoolMethod_MAX:
+    if (use_top_mask) {
+      top_mask = top[1]->mutable_gpu_data();
+    } else {
+      mask = max_idx_.mutable_gpu_data();
+    }
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, height_,
+        width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_,
+        stride_w_, pad_h_, pad_w_, top_data, mask, top_mask);
+    break;
+  case PoolingParameter_PoolMethod_AVE:
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, height_,
+        width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_,
+        stride_w_, pad_h_, pad_w_, top_data);
+    break;
+  case PoolingParameter_PoolMethod_STOCHASTIC:
+    if (this->phase_ == TRAIN) {
+      // We need to create the random index as well.
+      caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1),
+          rand_idx_.mutable_gpu_data());
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_,
+          height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_,
+          stride_h_, stride_w_, rand_idx_.mutable_gpu_data(), top_data);
+    } else {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_,
+          height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_,
+          stride_h_, stride_w_, top_data);
+    }
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
 }
 
 template <typename Dtype>
 void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	//Backward_cpu(top, propagate_down, bottom);
-	if (!propagate_down[0]) {
-		return;
-	}
-	const Dtype* top_diff = top[0]->gpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-	const int count = bottom[0]->count();
-	caffe_gpu_set(count, Dtype(0.), bottom_diff);
-	// We'll output the mask to top[1] if it's of size >1.
-	const bool use_top_mask = top.size() > 1;
-	const int* mask = NULL;
-	const Dtype* top_mask = NULL;
-	switch (this->layer_param_.pooling_param().pool()) {
-		case PoolingParameter_PoolMethod_MAX:
-			if (use_top_mask) {
-				top_mask = top[1]->gpu_data();
-			} else {
-				mask = max_idx_.gpu_data();
-			}
-			// NOLINT_NEXT_LINE(whitespace/operators)
-			MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_,
-					height_, width_, pooled_height_, pooled_width_,
-					kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_,
-					bottom_diff);
-			break;
-		case PoolingParameter_PoolMethod_AVE:
-			// NOLINT_NEXT_LINE(whitespace/operators)
-			AvePoolBackward(count, top_diff, top[0]->num(), channels_,
-					height_, width_, pooled_height_, pooled_width_, kernel_h_,
-					kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
-			break;
-		case PoolingParameter_PoolMethod_STOCHASTIC:
-			// NOLINT_NEXT_LINE(whitespace/operators)
-			StoPoolBackward(count, rand_idx_.gpu_data(), top_diff,
-					top[0]->num(), channels_, height_, width_, pooled_height_,
-					pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_,
-					bottom_diff);
-			break;
-		default:
-			LOG(FATAL) << "Unknown pooling method.";
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  //Backward_cpu(top, propagate_down, bottom);
+  if (!propagate_down[0]) {
+    return;
+  }
+  const Dtype* top_diff = top[0]->gpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  const int count = bottom[0]->count();
+  caffe_gpu_set(count, Dtype(0.), bottom_diff);
+  // We'll output the mask to top[1] if it's of size >1.
+  const bool use_top_mask = top.size() > 1;
+  const int* mask = NULL;
+  const Dtype* top_mask = NULL;
+  switch (this->layer_param_.pooling_param().pool()) {
+  case PoolingParameter_PoolMethod_MAX:
+    if (use_top_mask) {
+      top_mask = top[1]->gpu_data();
+    } else {
+      mask = max_idx_.gpu_data();
+    }
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_,
+        height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_,
+        stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff);
+    break;
+  case PoolingParameter_PoolMethod_AVE:
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    AvePoolBackward(count, top_diff, top[0]->num(), channels_, height_, width_,
+        pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_,
+        stride_w_, pad_h_, pad_w_, bottom_diff);
+    break;
+  case PoolingParameter_PoolMethod_STOCHASTIC:
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, top[0]->num(),
+        channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_,
+        kernel_w_, stride_h_, stride_w_, bottom_diff);
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
 }
 
 // end: code written/modified by AMD
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index 0cf82c35..6b2c5f1d 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -11,163 +11,162 @@ namespace caffe {
 
 template <typename Dtype>
 void PowerLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
-	power_ = this->layer_param_.power_param().power();
-	scale_ = this->layer_param_.power_param().scale();
-	shift_ = this->layer_param_.power_param().shift();
-	diff_scale_ = power_ * scale_;
+    const vector<Blob<Dtype>*>& top) {
+  NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+  power_ = this->layer_param_.power_param().power();
+  scale_ = this->layer_param_.power_param().scale();
+  shift_ = this->layer_param_.power_param().shift();
+  diff_scale_ = power_ * scale_;
 }
 
 // Compute y = (shift + scale * x)^power
 template <typename Dtype>
 void PowerLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const int count = bottom[0]->count();
-	// Special case where we can ignore the input: scale or power is 0.
-	if (diff_scale_ == Dtype(0)) {
-		Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
-		caffe_set(count, value, top_data);
-		return;
-	}
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	caffe_copy(count, bottom_data, top_data);
-	if (scale_ != Dtype(1)) {
-		caffe_scal(count, scale_, top_data);
-	}
-	if (shift_ != Dtype(0)) {
-		caffe_add_scalar(count, shift_, top_data);
-	}
-	if (power_ != Dtype(1)) {
-		caffe_powx(count, top_data, power_, top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  // Special case where we can ignore the input: scale or power is 0.
+  if (diff_scale_ == Dtype(0)) {
+    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
+    caffe_set(count, value, top_data);
+    return;
+  }
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  caffe_copy(count, bottom_data, top_data);
+  if (scale_ != Dtype(1)) {
+    caffe_scal(count, scale_, top_data);
+  }
+  if (shift_ != Dtype(0)) {
+    caffe_add_scalar(count, shift_, top_data);
+  }
+  if (power_ != Dtype(1)) {
+    caffe_powx(count, top_data, power_, top_data);
+  }
 }
 
 template <typename Dtype>
 void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		const int count = bottom[0]->count();
-		const Dtype* top_diff = top[0]->cpu_diff();
-		if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
-			caffe_set(count, diff_scale_, bottom_diff);
-		} else {
-			const Dtype* bottom_data = bottom[0]->cpu_data();
-			// Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
-			//               = diff_scale * y / (shift + scale * x)
-			if (power_ == Dtype(2)) {
-				// Special case for y = (shift + scale * x)^2
-				//     -> dy/dx = 2 * scale * (shift + scale * x)
-				//              = diff_scale * shift + diff_scale * scale * x
-				caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data,
-						Dtype(0), bottom_diff);
-				if (shift_ != Dtype(0)) {
-					caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff);
-				}
-			} else if (shift_ == Dtype(0)) {
-				// Special case for y = (scale * x)^power
-				//     -> dy/dx = scale * power * (scale * x)^(power - 1)
-				//              = scale * power * (scale * x)^power * (scale * x)^(-1)
-				//              = power * y / x
-				const Dtype* top_data = top[0]->cpu_data();
-				caffe_div(count, top_data, bottom_data, bottom_diff);
-				caffe_scal(count, power_, bottom_diff);
-			} else {
-				caffe_copy(count, bottom_data, bottom_diff);
-				if (scale_ != Dtype(1)) {
-					caffe_scal(count, scale_, bottom_diff);
-				}
-				if (shift_ != Dtype(0)) {
-					caffe_add_scalar(count, shift_, bottom_diff);
-				}
-				const Dtype* top_data = top[0]->cpu_data();
-				caffe_div < Dtype > (count, top_data, bottom_diff, bottom_diff);
-				if (diff_scale_ != Dtype(1)) {
-					caffe_scal(count, diff_scale_, bottom_diff);
-				}
-			}
-		}
-		if (diff_scale_ != Dtype(0)) {
-			caffe_mul(count, top_diff, bottom_diff, bottom_diff);
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const int count = bottom[0]->count();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
+      caffe_set(count, diff_scale_, bottom_diff);
+    } else {
+      const Dtype* bottom_data = bottom[0]->cpu_data();
+      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
+      //               = diff_scale * y / (shift + scale * x)
+      if (power_ == Dtype(2)) {
+        // Special case for y = (shift + scale * x)^2
+        //     -> dy/dx = 2 * scale * (shift + scale * x)
+        //              = diff_scale * shift + diff_scale * scale * x
+        caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0),
+            bottom_diff);
+        if (shift_ != Dtype(0)) {
+          caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff);
+        }
+      } else if (shift_ == Dtype(0)) {
+        // Special case for y = (scale * x)^power
+        //     -> dy/dx = scale * power * (scale * x)^(power - 1)
+        //              = scale * power * (scale * x)^power * (scale * x)^(-1)
+        //              = power * y / x
+        const Dtype* top_data = top[0]->cpu_data();
+        caffe_div(count, top_data, bottom_data, bottom_diff);
+        caffe_scal(count, power_, bottom_diff);
+      } else {
+        caffe_copy(count, bottom_data, bottom_diff);
+        if (scale_ != Dtype(1)) {
+          caffe_scal(count, scale_, bottom_diff);
+        }
+        if (shift_ != Dtype(0)) {
+          caffe_add_scalar(count, shift_, bottom_diff);
+        }
+        const Dtype* top_data = top[0]->cpu_data();
+        caffe_div < Dtype > (count, top_data, bottom_diff, bottom_diff);
+        if (diff_scale_ != Dtype(1)) {
+          caffe_scal(count, diff_scale_, bottom_diff);
+        }
+      }
+    }
+    if (diff_scale_ != Dtype(0)) {
+      caffe_mul(count, top_diff, bottom_diff, bottom_diff);
+    }
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	// Special case where we can ignore the input: scale or power is 0.
-	if (diff_scale_ == Dtype(0)) {
-		Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
-		ocl_memset(top_data, value, count);
-		return;
-	}
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	caffe_gpu_copy(count, bottom_data, top_data);
-	if (scale_ != Dtype(1)) {
-		caffe_gpu_scal(count, scale_, top_data);
-	}
-	if (shift_ != Dtype(0)) {
-		caffe_gpu_add_scalar(count, shift_, top_data);
-	}
-	if (power_ != Dtype(1)) {
-		caffe_gpu_powx(count, top_data, power_, top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // Special case where we can ignore the input: scale or power is 0.
+  if (diff_scale_ == Dtype(0)) {
+    Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_);
+    ocl_memset(top_data, value, count);
+    return;
+  }
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  caffe_gpu_copy(count, bottom_data, top_data);
+  if (scale_ != Dtype(1)) {
+    caffe_gpu_scal(count, scale_, top_data);
+  }
+  if (shift_ != Dtype(0)) {
+    caffe_gpu_add_scalar(count, shift_, top_data);
+  }
+  if (power_ != Dtype(1)) {
+    caffe_gpu_powx(count, top_data, power_, top_data);
+  }
 }
 
 template <typename Dtype>
 void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		const int count = bottom[0]->count();
-		const Dtype* top_diff = top[0]->gpu_diff();
-		if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
-			ocl_memset(bottom_diff, diff_scale_, count);
-		} else {
-			const Dtype* bottom_data = bottom[0]->gpu_data();
-			// Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
-			//               = diff_scale * y / (shift + scale * x)
-			if (power_ == Dtype(2)) {
-				// Special case for y = (shift + scale * x)^2
-				//     -> dy/dx = 2 * scale * (shift + scale * x)
-				//              = diff_scale * shift + diff_scale * scale * x
-				caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data,
-						Dtype(0), bottom_diff);
-				if (shift_ != Dtype(0)) {
-					caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
-				}
-			} else if (shift_ == Dtype(0)) {
-				// Special case for y = (scale * x)^power
-				//     -> dy/dx = scale * power * (scale * x)^(power - 1)
-				//              = scale * power * (scale * x)^power * (scale * x)^(-1)
-				//              = power * y / x
-				const Dtype* top_data = top[0]->gpu_data();
-				caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
-				caffe_gpu_scal(count, power_, bottom_diff);
-			} else {
-				caffe_gpu_copy(count, bottom_data, bottom_diff);
-				if (scale_ != Dtype(1)) {
-					caffe_gpu_scal(count, scale_, bottom_diff);
-				}
-				if (shift_ != Dtype(0)) {
-					caffe_gpu_add_scalar(count, shift_, bottom_diff);
-				}
-				const Dtype* top_data = top[0]->gpu_data();
-				caffe_gpu_div(count, top_data, bottom_diff, bottom_diff);
-				if (diff_scale_ != Dtype(1)) {
-					caffe_gpu_scal(count, diff_scale_, bottom_diff);
-				}
-			}
-		}
-		caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) {
+      ocl_memset(bottom_diff, diff_scale_, count);
+    } else {
+      const Dtype* bottom_data = bottom[0]->gpu_data();
+      // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1)
+      //               = diff_scale * y / (shift + scale * x)
+      if (power_ == Dtype(2)) {
+        // Special case for y = (shift + scale * x)^2
+        //     -> dy/dx = 2 * scale * (shift + scale * x)
+        //              = diff_scale * shift + diff_scale * scale * x
+        caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0),
+            bottom_diff);
+        if (shift_ != Dtype(0)) {
+          caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff);
+        }
+      } else if (shift_ == Dtype(0)) {
+        // Special case for y = (scale * x)^power
+        //     -> dy/dx = scale * power * (scale * x)^(power - 1)
+        //              = scale * power * (scale * x)^power * (scale * x)^(-1)
+        //              = power * y / x
+        const Dtype* top_data = top[0]->gpu_data();
+        caffe_gpu_div(count, top_data, bottom_data, bottom_diff);
+        caffe_gpu_scal(count, power_, bottom_diff);
+      } else {
+        caffe_gpu_copy(count, bottom_data, bottom_diff);
+        if (scale_ != Dtype(1)) {
+          caffe_gpu_scal(count, scale_, bottom_diff);
+        }
+        if (shift_ != Dtype(0)) {
+          caffe_gpu_add_scalar(count, shift_, bottom_diff);
+        }
+        const Dtype* top_data = top[0]->gpu_data();
+        caffe_gpu_div(count, top_data, bottom_diff, bottom_diff);
+        if (diff_scale_ != Dtype(1)) {
+          caffe_gpu_scal(count, diff_scale_, bottom_diff);
+        }
+      }
+    }
+    caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
+  }
 }
 // end: code written/modified by AMD
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp
index cbf7f064..8ec6664d 100644
--- a/src/caffe/layers/prelu_layer.cpp
+++ b/src/caffe/layers/prelu_layer.cpp
@@ -9,197 +9,192 @@ namespace caffe {
 
 template <typename Dtype>
 void PReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_GE(bottom[0]->num_axes(), 2)
-			<< "Number of axes of bottom blob must be >=2.";
-	PReLUParameter prelu_param = this->layer_param().prelu_param();
-	int channels = bottom[0]->channels();
-	channel_shared_ = prelu_param.channel_shared();
-	if (this->blobs_.size() > 0) {
-		LOG(INFO) << "Skipping parameter initialization";
-	} else {
-		this->blobs_.resize(1);
-		if (channel_shared_) {
-			this->blobs_[0].reset(new Blob<Dtype>(vector<int>(0)));
-		} else {
-			this->blobs_[0].reset(new Blob<Dtype>(vector<int>(1, channels)));
-		}
-		shared_ptr < Filler<Dtype> > filler;
-		if (prelu_param.has_filler()) {
-			filler.reset(GetFiller < Dtype > (prelu_param.filler()));
-		} else {
-			FillerParameter filler_param;
-			filler_param.set_type("constant");
-			filler_param.set_value(0.25);
-			filler.reset(GetFiller < Dtype > (filler_param));
-		}
-		filler->Fill(this->blobs_[0].get());
-	}
-	if (channel_shared_) {
-		CHECK_EQ(this->blobs_[0]->count(), 1)
-				<< "Negative slope size is inconsistent with prototxt config";
-	} else {
-		CHECK_EQ(this->blobs_[0]->count(), channels)
-				<< "Negative slope size is inconsistent with prototxt config";
-	}
-
-	// Propagate gradients to the parameters (as directed by backward pass).
-	this->param_propagate_down_.resize(this->blobs_.size(), true);
-	multiplier_.Reshape(vector<int>(1, bottom[0]->count(1)));
-	backward_buff_.Reshape(vector<int>(1, bottom[0]->count(1)));
-	caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data());
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_GE(bottom[0]->num_axes(), 2)
+      << "Number of axes of bottom blob must be >=2.";
+  PReLUParameter prelu_param = this->layer_param().prelu_param();
+  int channels = bottom[0]->channels();
+  channel_shared_ = prelu_param.channel_shared();
+  if (this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else {
+    this->blobs_.resize(1);
+    if (channel_shared_) {
+      this->blobs_[0].reset(new Blob<Dtype>(vector<int>(0)));
+    } else {
+      this->blobs_[0].reset(new Blob<Dtype>(vector<int>(1, channels)));
+    }
+    shared_ptr < Filler<Dtype> > filler;
+    if (prelu_param.has_filler()) {
+      filler.reset(GetFiller < Dtype > (prelu_param.filler()));
+    } else {
+      FillerParameter filler_param;
+      filler_param.set_type("constant");
+      filler_param.set_value(0.25);
+      filler.reset(GetFiller < Dtype > (filler_param));
+    }
+    filler->Fill(this->blobs_[0].get());
+  }
+  if (channel_shared_) {
+    CHECK_EQ(this->blobs_[0]->count(), 1)
+        << "Negative slope size is inconsistent with prototxt config";
+  } else {
+    CHECK_EQ(this->blobs_[0]->count(), channels)
+        << "Negative slope size is inconsistent with prototxt config";
+  }
+
+  // Propagate gradients to the parameters (as directed by backward pass).
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
+  multiplier_.Reshape(vector<int>(1, bottom[0]->count(1)));
+  backward_buff_.Reshape(vector<int>(1, bottom[0]->count(1)));
+  caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data());
 }
 
 template <typename Dtype>
 void PReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_GE(bottom[0]->num_axes(), 2)
-			<< "Number of axes of bottom blob must be >=2.";
-	top[0]->ReshapeLike(*bottom[0]);
-	if (bottom[0] == top[0]) {
-		// For in-place computation
-		bottom_memory_.ReshapeLike(*bottom[0]);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_GE(bottom[0]->num_axes(), 2)
+      << "Number of axes of bottom blob must be >=2.";
+  top[0]->ReshapeLike(*bottom[0]);
+  if (bottom[0] == top[0]) {
+    // For in-place computation
+    bottom_memory_.ReshapeLike(*bottom[0]);
+  }
 }
 
 template <typename Dtype>
 void PReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const int count = bottom[0]->count();
-	const int dim = bottom[0]->count(2);
-	const int channels = bottom[0]->channels();
-	const Dtype* slope_data = this->blobs_[0]->cpu_data();
-
-	// For in-place computation
-	if (bottom[0] == top[0]) {
-		caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data());
-	}
-
-	// if channel_shared, channel index in the following computation becomes
-	// always zero.
-	const int div_factor = channel_shared_ ? channels : 1;
-	for (int i = 0; i < count; ++i) {
-		int c = (i / dim) % channels / div_factor;
-		top_data[i] = std::max(bottom_data[i], Dtype(0))
-				+ slope_data[c] * std::min(bottom_data[i], Dtype(0));
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  const int dim = bottom[0]->count(2);
+  const int channels = bottom[0]->channels();
+  const Dtype* slope_data = this->blobs_[0]->cpu_data();
+
+  // For in-place computation
+  if (bottom[0] == top[0]) {
+    caffe_copy(count, bottom_data, bottom_memory_.mutable_cpu_data());
+  }
+
+  // if channel_shared, channel index in the following computation becomes
+  // always zero.
+  const int div_factor = channel_shared_ ? channels : 1;
+  for (int i = 0; i < count; ++i) {
+    int c = (i / dim) % channels / div_factor;
+    top_data[i] = std::max(bottom_data[i], Dtype(0))
+        + slope_data[c] * std::min(bottom_data[i], Dtype(0));
+  }
 }
 
 template <typename Dtype>
 void PReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	const Dtype* slope_data = this->blobs_[0]->cpu_data();
-	const Dtype* top_diff = top[0]->cpu_diff();
-	const int count = bottom[0]->count();
-	const int dim = bottom[0]->count(2);
-	const int channels = bottom[0]->channels();
-
-	// For in-place computation
-	if (top[0] == bottom[0]) {
-		bottom_data = bottom_memory_.cpu_data();
-	}
-
-	// if channel_shared, channel index in the following computation becomes
-	// always zero.
-	const int div_factor = channel_shared_ ? channels : 1;
-
-	// Propagte to param
-	// Since to write bottom diff will affect top diff if top and bottom blobs
-	// are identical (in-place computaion), we first compute param backward to
-	// keep top_diff unchanged.
-	if (this->param_propagate_down_[0]) {
-		Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff();
-		for (int i = 0; i < count; ++i) {
-			int c = (i / dim) % channels / div_factor;
-			slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0);
-		}
-	}
-	// Propagate to bottom
-	if (propagate_down[0]) {
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		for (int i = 0; i < count; ++i) {
-			int c = (i / dim) % channels / div_factor;
-			bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
-					+ slope_data[c] * (bottom_data[i] <= 0));
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* slope_data = this->blobs_[0]->cpu_data();
+  const Dtype* top_diff = top[0]->cpu_diff();
+  const int count = bottom[0]->count();
+  const int dim = bottom[0]->count(2);
+  const int channels = bottom[0]->channels();
+
+  // For in-place computation
+  if (top[0] == bottom[0]) {
+    bottom_data = bottom_memory_.cpu_data();
+  }
+
+  // if channel_shared, channel index in the following computation becomes
+  // always zero.
+  const int div_factor = channel_shared_ ? channels : 1;
+
+  // Propagte to param
+  // Since to write bottom diff will affect top diff if top and bottom blobs
+  // are identical (in-place computaion), we first compute param backward to
+  // keep top_diff unchanged.
+  if (this->param_propagate_down_[0]) {
+    Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff();
+    for (int i = 0; i < count; ++i) {
+      int c = (i / dim) % channels / div_factor;
+      slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0);
+    }
+  }
+  // Propagate to bottom
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    for (int i = 0; i < count; ++i) {
+      int c = (i / dim) % channels / div_factor;
+      bottom_diff[i] = top_diff[i]
+          * ((bottom_data[i] > 0) + slope_data[c] * (bottom_data[i] <= 0));
+    }
+  }
 }
 
 template <typename Dtype>
 void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	const int dim = bottom[0]->count(2);
-	const int channels = bottom[0]->channels();
-	const Dtype* slope_data = this->blobs_[0]->gpu_data();
-	const int div_factor = channel_shared_ ? channels : 1;
-
-	if (top[0] == bottom[0]) {
-		caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
-	}
-	PReLUForward(count, channels, dim, bottom_data, top_data, slope_data,
-			div_factor);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  const int dim = bottom[0]->count(2);
+  const int channels = bottom[0]->channels();
+  const Dtype* slope_data = this->blobs_[0]->gpu_data();
+  const int div_factor = channel_shared_ ? channels : 1;
+
+  if (top[0] == bottom[0]) {
+    caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data());
+  }
+  PReLUForward(count, channels, dim, bottom_data, top_data, slope_data,
+      div_factor);
 }
 
 template <typename Dtype>
 void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	const Dtype* top_diff = top[0]->gpu_diff();
-	const int count = bottom[0]->count();
-	const int dim = bottom[0]->count(2);
-	const int channels = bottom[0]->channels();
-
-	if (top[0] == bottom[0]) {
-		bottom_data = bottom_memory_.gpu_data();
-	}
-
-	// Propagate to param
-	// Since to write bottom diff will affect top diff if top and bottom blobs
-	// are identical (in-place computaion), we first compute param backward to
-	// keep top_diff unchanged.
-	if (this->param_propagate_down_[0]) {
-		Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
-		int cdim = channels * dim;
-		Dtype dsum = 0.;
-		for (int n = 0; n < bottom[0]->num(); ++n) {
-			// compute element-wise diff
-			// NOLINT_NEXT_LINE(whitespace/operators)
-			PReLUParamBackward(
-					cdim, top_diff, top[0]->offset(n),
-					bottom_data, bottom[0]->offset(n),
-					backward_buff_.mutable_gpu_diff());
-			if (channel_shared_) {
-				Dtype d;
-				caffe_gpu_dot < Dtype > (channels * dim, backward_buff_.gpu_diff(),
-						multiplier_.gpu_data(), &d);
-				dsum += d;
-			} else {
-				caffe_gpu_gemv < Dtype > (CblasNoTrans, channels, dim, 1.,
-						backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1.,
-						slope_diff);
-			}
-		}
-		if (channel_shared_) {
-			caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
-		}
-	}
-	// Propagate to bottom
-	if (propagate_down[0]) {
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		const Dtype* slope_data = this->blobs_[0]->gpu_data();
-		int div_factor = channel_shared_ ? channels : 1;
-		// NOLINT_NEXT_LINE(whitespace/operators)
-		PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff,
-				slope_data,
-				div_factor);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const int count = bottom[0]->count();
+  const int dim = bottom[0]->count(2);
+  const int channels = bottom[0]->channels();
+
+  if (top[0] == bottom[0]) {
+    bottom_data = bottom_memory_.gpu_data();
+  }
+
+  // Propagate to param
+  // Since to write bottom diff will affect top diff if top and bottom blobs
+  // are identical (in-place computaion), we first compute param backward to
+  // keep top_diff unchanged.
+  if (this->param_propagate_down_[0]) {
+    Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff();
+    int cdim = channels * dim;
+    Dtype dsum = 0.;
+    for (int n = 0; n < bottom[0]->num(); ++n) {
+      // compute element-wise diff
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      PReLUParamBackward(cdim, top_diff, top[0]->offset(n), bottom_data,
+          bottom[0]->offset(n), backward_buff_.mutable_gpu_diff());
+      if (channel_shared_) {
+        Dtype d;
+        caffe_gpu_dot < Dtype
+            > (channels * dim, backward_buff_.gpu_diff(), multiplier_.gpu_data(), &d);
+        dsum += d;
+      } else {
+        caffe_gpu_gemv < Dtype
+            > (CblasNoTrans, channels, dim, 1., backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., slope_diff);
+      }
+    }
+    if (channel_shared_) {
+      caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff);
+    }
+  }
+  // Propagate to bottom
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const Dtype* slope_data = this->blobs_[0]->gpu_data();
+    int div_factor = channel_shared_ ? channels : 1;
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff,
+        slope_data, div_factor);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp
index ddf70e46..89df6589 100644
--- a/src/caffe/layers/reduction_layer.cpp
+++ b/src/caffe/layers/reduction_layer.cpp
@@ -10,201 +10,201 @@ namespace caffe {
 
 template <typename Dtype>
 void ReductionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	op_ = this->layer_param_.reduction_param().operation();
+    const vector<Blob<Dtype>*>& top) {
+  op_ = this->layer_param_.reduction_param().operation();
 }
 
 template <typename Dtype>
 void ReductionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	axis_ = bottom[0]->CanonicalAxisIndex(
-			this->layer_param_.reduction_param().axis());
-	// In the output, we'll keep all axes up to the reduction axis, but
-	// throw away any after that.
-	// Note: currently reducing along non-tail axes is not supported; otherwise,
-	// we'd need to also copy any axes following an "end_axis".
-	vector<int> top_shape(bottom[0]->shape().begin(),
-			bottom[0]->shape().begin() + axis_);
-	top[0]->Reshape(top_shape);
-	num_ = bottom[0]->count(0, axis_);
-	dim_ = bottom[0]->count(axis_);
-	CHECK_EQ(num_, top[0]->count());
-	if (op_ == ReductionParameter_ReductionOp_SUM ||
-			op_ == ReductionParameter_ReductionOp_MEAN) {
-		vector<int> sum_mult_shape(1, dim_);
-		sum_multiplier_.Reshape(sum_mult_shape);
-		caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data());
-	}
-	coeff_ = this->layer_param().reduction_param().coeff();
-	if (op_ == ReductionParameter_ReductionOp_MEAN) {
-		coeff_ /= dim_;
-	}
+    const vector<Blob<Dtype>*>& top) {
+  axis_ = bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.reduction_param().axis());
+  // In the output, we'll keep all axes up to the reduction axis, but
+  // throw away any after that.
+  // Note: currently reducing along non-tail axes is not supported; otherwise,
+  // we'd need to also copy any axes following an "end_axis".
+  vector<int> top_shape(bottom[0]->shape().begin(),
+      bottom[0]->shape().begin() + axis_);
+  top[0]->Reshape(top_shape);
+  num_ = bottom[0]->count(0, axis_);
+  dim_ = bottom[0]->count(axis_);
+  CHECK_EQ(num_, top[0]->count());
+  if (op_ == ReductionParameter_ReductionOp_SUM
+      || op_ == ReductionParameter_ReductionOp_MEAN) {
+    vector<int> sum_mult_shape(1, dim_);
+    sum_multiplier_.Reshape(sum_mult_shape);
+    caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data());
+  }
+  coeff_ = this->layer_param().reduction_param().coeff();
+  if (op_ == ReductionParameter_ReductionOp_MEAN) {
+    coeff_ /= dim_;
+  }
 }
 
 template <typename Dtype>
-void ReductionLayer<Dtype>::Forward_cpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	const Dtype* mult_data = NULL;
-	if (sum_multiplier_.count() > 0) {
-		mult_data = sum_multiplier_.cpu_data();
-	}
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	for (int i = 0; i < num_; ++i) {
-		switch (op_) {
-			case ReductionParameter_ReductionOp_SUM:
-				case ReductionParameter_ReductionOp_MEAN:
-				*top_data = caffe_cpu_dot(dim_, mult_data, bottom_data);
-				break;
-			case ReductionParameter_ReductionOp_ASUM:
-				*top_data = caffe_cpu_asum(dim_, bottom_data);
-				break;
-			case ReductionParameter_ReductionOp_SUMSQ:
-				*top_data = caffe_cpu_dot(dim_, bottom_data, bottom_data);
-				break;
-			default:
-				LOG(FATAL) << "Unknown reduction op: "
-						<< ReductionParameter_ReductionOp_Name(op_);
-		}
-		bottom_data += dim_;
-		++top_data;
-	}
-	if (coeff_ != Dtype(1)) {
-		// Reset the top_data pointer.
-		top_data = top[0]->mutable_cpu_data();
-		caffe_scal(num_, coeff_, top_data);
-	}
+void ReductionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* mult_data = NULL;
+  if (sum_multiplier_.count() > 0) {
+    mult_data = sum_multiplier_.cpu_data();
+  }
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int i = 0; i < num_; ++i) {
+    switch (op_) {
+    case ReductionParameter_ReductionOp_SUM:
+    case ReductionParameter_ReductionOp_MEAN:
+      *top_data = caffe_cpu_dot(dim_, mult_data, bottom_data);
+      break;
+    case ReductionParameter_ReductionOp_ASUM:
+      *top_data = caffe_cpu_asum(dim_, bottom_data);
+      break;
+    case ReductionParameter_ReductionOp_SUMSQ:
+      *top_data = caffe_cpu_dot(dim_, bottom_data, bottom_data);
+      break;
+    default:
+      LOG(FATAL) << "Unknown reduction op: "
+          << ReductionParameter_ReductionOp_Name(op_);
+    }
+    bottom_data += dim_;
+    ++top_data;
+  }
+  if (coeff_ != Dtype(1)) {
+    // Reset the top_data pointer.
+    top_data = top[0]->mutable_cpu_data();
+    caffe_scal(num_, coeff_, top_data);
+  }
 }
 
 template <typename Dtype>
 void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	// Get bottom_data, if needed.
-	const Dtype* bottom_data = NULL;
-	switch (op_) {
-		// Operations that don't need bottom_data
-		case ReductionParameter_ReductionOp_SUM:
-			case ReductionParameter_ReductionOp_MEAN:
-			break;
-			// Operations that need bottom_data
-		case ReductionParameter_ReductionOp_ASUM:
-			case ReductionParameter_ReductionOp_SUMSQ:
-			bottom_data = bottom[0]->cpu_data();
-			break;
-		default:
-			LOG(FATAL) << "Unknown reduction op: "
-					<< ReductionParameter_ReductionOp_Name(op_);
-	}
-	const Dtype* top_diff = top[0]->cpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	for (int i = 0; i < num_; ++i) {
-		const Dtype bottom_coeff = (*top_diff) * coeff_;
-		switch (op_) {
-			case ReductionParameter_ReductionOp_SUM:
-				case ReductionParameter_ReductionOp_MEAN:
-				caffe_set(dim_, bottom_coeff, bottom_diff);
-				break;
-			case ReductionParameter_ReductionOp_ASUM:
-				caffe_cpu_sign(dim_, bottom_data, bottom_diff);
-				caffe_scal(dim_, bottom_coeff, bottom_diff);
-				break;
-			case ReductionParameter_ReductionOp_SUMSQ:
-				caffe_cpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
-				break;
-			default:
-				LOG(FATAL) << "Unknown reduction op: "
-						<< ReductionParameter_ReductionOp_Name(op_);
-		}
-		bottom_data += dim_;
-		bottom_diff += dim_;
-		++top_diff;
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  // Get bottom_data, if needed.
+  const Dtype* bottom_data = NULL;
+  switch (op_) {
+  // Operations that don't need bottom_data
+  case ReductionParameter_ReductionOp_SUM:
+  case ReductionParameter_ReductionOp_MEAN:
+    break;
+    // Operations that need bottom_data
+  case ReductionParameter_ReductionOp_ASUM:
+  case ReductionParameter_ReductionOp_SUMSQ:
+    bottom_data = bottom[0]->cpu_data();
+    break;
+  default:
+    LOG(FATAL) << "Unknown reduction op: "
+        << ReductionParameter_ReductionOp_Name(op_);
+  }
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  for (int i = 0; i < num_; ++i) {
+    const Dtype bottom_coeff = (*top_diff) * coeff_;
+    switch (op_) {
+    case ReductionParameter_ReductionOp_SUM:
+    case ReductionParameter_ReductionOp_MEAN:
+      caffe_set(dim_, bottom_coeff, bottom_diff);
+      break;
+    case ReductionParameter_ReductionOp_ASUM:
+      caffe_cpu_sign(dim_, bottom_data, bottom_diff);
+      caffe_scal(dim_, bottom_coeff, bottom_diff);
+      break;
+    case ReductionParameter_ReductionOp_SUMSQ:
+      caffe_cpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
+      break;
+    default:
+      LOG(FATAL) << "Unknown reduction op: "
+          << ReductionParameter_ReductionOp_Name(op_);
+    }
+    bottom_data += dim_;
+    bottom_diff += dim_;
+    ++top_diff;
+  }
 }
 
 template <typename Dtype>
 void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	const Dtype* mult_data = NULL;
-	if (sum_multiplier_.count() > 0) {
-		mult_data = sum_multiplier_.gpu_data();
-	}
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	for (int i = 0; i < num_; ++i) {
-		switch (op_) {
-			case ReductionParameter_ReductionOp_SUM:
-				case ReductionParameter_ReductionOp_MEAN:
-				caffe_gpu_dot(dim_, mult_data, bottom_data, top_data);
-				break;
-			case ReductionParameter_ReductionOp_ASUM:
-				caffe_gpu_asum(dim_, bottom_data, top_data);
-				break;
-			case ReductionParameter_ReductionOp_SUMSQ:
-				caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data);
-				break;
-			default:
-				LOG(FATAL) << "Unknown reduction op: "
-						<< ReductionParameter_ReductionOp_Name(op_);
-		}
-		bottom_data += dim_;
-		++top_data;
-	}
-	if (coeff_ != Dtype(1)) {
-		// Reset the top_data pointer.
-		top_data = top[0]->mutable_gpu_data();
-		caffe_gpu_scal(num_, coeff_, top_data);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* mult_data = NULL;
+  if (sum_multiplier_.count() > 0) {
+    mult_data = sum_multiplier_.gpu_data();
+  }
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int i = 0; i < num_; ++i) {
+    switch (op_) {
+    case ReductionParameter_ReductionOp_SUM:
+    case ReductionParameter_ReductionOp_MEAN:
+      caffe_gpu_dot(dim_, mult_data, bottom_data, top_data);
+      break;
+    case ReductionParameter_ReductionOp_ASUM:
+      caffe_gpu_asum(dim_, bottom_data, top_data);
+      break;
+    case ReductionParameter_ReductionOp_SUMSQ:
+      caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data);
+      break;
+    default:
+      LOG(FATAL) << "Unknown reduction op: "
+          << ReductionParameter_ReductionOp_Name(op_);
+    }
+    bottom_data += dim_;
+    ++top_data;
+  }
+  if (coeff_ != Dtype(1)) {
+    // Reset the top_data pointer.
+    top_data = top[0]->mutable_gpu_data();
+    caffe_gpu_scal(num_, coeff_, top_data);
+  }
 }
 
 template <typename Dtype>
 void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	// Get bottom_data, if needed.
-	const Dtype* bottom_data = NULL;
-	switch (op_) {
-		// Operations that don't need bottom_data
-		case ReductionParameter_ReductionOp_SUM:
-			case ReductionParameter_ReductionOp_MEAN:
-			break;
-			// Operations that need bottom_data
-		case ReductionParameter_ReductionOp_ASUM:
-			case ReductionParameter_ReductionOp_SUMSQ:
-			bottom_data = bottom[0]->gpu_data();
-			break;
-		default:
-			LOG(FATAL) << "Unknown reduction op: "
-					<< ReductionParameter_ReductionOp_Name(op_);
-	}
-	const Dtype* top_diff = top[0]->cpu_diff();
-	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-	for (int i = 0; i < num_; ++i) {
-		const Dtype bottom_coeff = (*top_diff) * coeff_;
-		switch (op_) {
-			case ReductionParameter_ReductionOp_SUM:
-				case ReductionParameter_ReductionOp_MEAN:
-				caffe_gpu_set(dim_, bottom_coeff, bottom_diff);
-				break;
-			case ReductionParameter_ReductionOp_ASUM:
-				caffe_gpu_sign(dim_, bottom_data, bottom_diff);
-				caffe_gpu_scal(dim_, bottom_coeff, bottom_diff);
-				break;
-			case ReductionParameter_ReductionOp_SUMSQ:
-				caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
-				break;
-			default:
-				LOG(FATAL) << "Unknown reduction op: "
-						<< ReductionParameter_ReductionOp_Name(op_);
-		}
-		bottom_data += dim_;
-		bottom_diff += dim_;
-		++top_diff;
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  // Get bottom_data, if needed.
+  const Dtype* bottom_data = NULL;
+  switch (op_) {
+  // Operations that don't need bottom_data
+  case ReductionParameter_ReductionOp_SUM:
+  case ReductionParameter_ReductionOp_MEAN:
+    break;
+    // Operations that need bottom_data
+  case ReductionParameter_ReductionOp_ASUM:
+  case ReductionParameter_ReductionOp_SUMSQ:
+    bottom_data = bottom[0]->gpu_data();
+    break;
+  default:
+    LOG(FATAL) << "Unknown reduction op: "
+        << ReductionParameter_ReductionOp_Name(op_);
+  }
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  for (int i = 0; i < num_; ++i) {
+    const Dtype bottom_coeff = (*top_diff) * coeff_;
+    switch (op_) {
+    case ReductionParameter_ReductionOp_SUM:
+    case ReductionParameter_ReductionOp_MEAN:
+      caffe_gpu_set(dim_, bottom_coeff, bottom_diff);
+      break;
+    case ReductionParameter_ReductionOp_ASUM:
+      caffe_gpu_sign(dim_, bottom_data, bottom_diff);
+      caffe_gpu_scal(dim_, bottom_coeff, bottom_diff);
+      break;
+    case ReductionParameter_ReductionOp_SUMSQ:
+      caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
+      break;
+    default:
+      LOG(FATAL) << "Unknown reduction op: "
+          << ReductionParameter_ReductionOp_Name(op_);
+    }
+    bottom_data += dim_;
+    bottom_diff += dim_;
+    ++top_diff;
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index 334dc244..b07e6447 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -7,56 +7,54 @@
 namespace caffe {
 template <typename Dtype>
 void ReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const int count = bottom[0]->count();
-	Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-	for (int i = 0; i < count; ++i) {
-		top_data[i] = std::max(bottom_data[i], Dtype(0))
-				+ negative_slope * std::min(bottom_data[i], Dtype(0));
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+  for (int i = 0; i < count; ++i) {
+    top_data[i] = std::max(bottom_data[i], Dtype(0))
+        + negative_slope * std::min(bottom_data[i], Dtype(0));
+  }
 }
 
 template <typename Dtype>
 void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* bottom_data = bottom[0]->cpu_data();
-		const Dtype* top_diff = top[0]->cpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		const int count = bottom[0]->count();
-		Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-		for (int i = 0; i < count; ++i) {
-			bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
-					+ negative_slope * (bottom_data[i] <= 0));
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const int count = bottom[0]->count();
+    Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+    for (int i = 0; i < count; ++i) {
+      bottom_diff[i] = top_diff[i]
+          * ((bottom_data[i] > 0) + negative_slope * (bottom_data[i] <= 0));
+    }
+  }
 }
 
 template <typename Dtype>
 void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-	ReLUForward(count, bottom_data, top_data, negative_slope);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+  ReLUForward(count, bottom_data, top_data, negative_slope);
 }
 
 template <typename Dtype>
 void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* bottom_data = bottom[0]->gpu_data();
-		const Dtype* top_diff = top[0]->gpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		const int count = bottom[0]->count();
-		Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
-		ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+    ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp
index 094e61ef..a2377d87 100644
--- a/src/caffe/layers/reshape_layer.cpp
+++ b/src/caffe/layers/reshape_layer.cpp
@@ -7,87 +7,87 @@ namespace caffe {
 
 template <typename Dtype>
 void ReshapeLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	inferred_axis_ = -1;
-	copy_axes_.clear();
-	const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
-	const int top_num_axes = top_blob_shape.dim_size();
-	constant_count_ = 1;
-	for (int i = 0; i < top_num_axes; ++i) {
-		const int top_dim = top_blob_shape.dim(i);
-		if (top_dim == 0) {
-			copy_axes_.push_back(i);
-		} else if (top_dim == -1) {
-			CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple "
-					<< "-1 dims; at most a single (1) value of -1 may be specified";
-			inferred_axis_ = i;
-		} else {
-			constant_count_ *= top_dim;
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  inferred_axis_ = -1;
+  copy_axes_.clear();
+  const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
+  const int top_num_axes = top_blob_shape.dim_size();
+  constant_count_ = 1;
+  for (int i = 0; i < top_num_axes; ++i) {
+    const int top_dim = top_blob_shape.dim(i);
+    if (top_dim == 0) {
+      copy_axes_.push_back(i);
+    } else if (top_dim == -1) {
+      CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple "
+          << "-1 dims; at most a single (1) value of -1 may be specified";
+      inferred_axis_ = i;
+    } else {
+      constant_count_ *= top_dim;
+    }
+  }
 }
 
 template <typename Dtype>
 void ReshapeLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int input_start_axis = this->layer_param_.reshape_param().axis();
-	const int start_axis =
-			(input_start_axis >= 0) ? input_start_axis :
-																bottom[0]->num_axes() + input_start_axis + 1;
-	CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range";
-	CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis
-			<< " out of range for " << bottom[0]->num_axes() << "-D input blob";
-	const int num_axes = this->layer_param_.reshape_param().num_axes();
-	CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all";
-	const int end_axis =
-			(num_axes == -1) ? bottom[0]->num_axes() : (start_axis + num_axes);
-	CHECK_LE(end_axis, bottom[0]->num_axes())
-			<< "end_axis = axis + num_axes is out of range";
-	const int num_axes_replaced = end_axis - start_axis;
-	const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced;
-	const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
-	const int num_new_axes = top_blob_shape.dim_size();
-	vector<int> top_shape(num_axes_retained + num_new_axes);
-	int top_shape_index = 0;
-	for (int i = 0; i < start_axis; ++i) {
-		top_shape[top_shape_index++] = bottom[0]->shape(i);
-	}
-	for (int i = 0; i < num_new_axes; ++i) {
-		top_shape[top_shape_index++] = top_blob_shape.dim(i);
-	}
-	for (int i = end_axis; i < bottom[0]->num_axes(); ++i) {
-		top_shape[top_shape_index++] = bottom[0]->shape(i);
-	}
-	CHECK_EQ(top_shape_index, top_shape.size());
-	for (int i = 0; i < copy_axes_.size(); ++i) {
-		const int copy_axis_index = copy_axes_[i];
-		CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index)
-				<< "new shape contains a 0, but there was no corresponding bottom axis "
-				<< "to copy";
-		top_shape[start_axis + copy_axis_index] =
-				bottom[0]->shape(start_axis + copy_axis_index);
-	}
-	if (inferred_axis_ >= 0) {
-		// A -1 dim was specified; infer the correct dimension by computing the
-		// product of the other dimensions.
-		int explicit_count = constant_count_;
-		explicit_count *= bottom[0]->count(0, start_axis);
-		explicit_count *= bottom[0]->count(end_axis);
-		for (int i = 0; i < copy_axes_.size(); ++i) {
-			const int copy_axis_index = copy_axes_[i];
-			explicit_count *= top_shape[start_axis + copy_axis_index];
-		}
-		CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count ("
-				<< bottom[0]->count() << ") must be divisible by the product of "
-				<< "the specified dimensions (" << explicit_count << ")";
-		const int inferred_dim = bottom[0]->count() / explicit_count;
-		top_shape[start_axis + inferred_axis_] = inferred_dim;
-	}
-	top[0]->Reshape(top_shape);
-	CHECK_EQ(top[0]->count(), bottom[0]->count())
-			<< "output count must match input count";
-	top[0]->ShareData(*bottom[0]);
-	top[0]->ShareDiff(*bottom[0]);
+    const vector<Blob<Dtype>*>& top) {
+  const int input_start_axis = this->layer_param_.reshape_param().axis();
+  const int start_axis =
+      (input_start_axis >= 0) ?
+          input_start_axis : bottom[0]->num_axes() + input_start_axis + 1;
+  CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range";
+  CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis
+      << " out of range for " << bottom[0]->num_axes() << "-D input blob";
+  const int num_axes = this->layer_param_.reshape_param().num_axes();
+  CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all";
+  const int end_axis =
+      (num_axes == -1) ? bottom[0]->num_axes() : (start_axis + num_axes);
+  CHECK_LE(end_axis, bottom[0]->num_axes())
+      << "end_axis = axis + num_axes is out of range";
+  const int num_axes_replaced = end_axis - start_axis;
+  const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced;
+  const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
+  const int num_new_axes = top_blob_shape.dim_size();
+  vector<int> top_shape(num_axes_retained + num_new_axes);
+  int top_shape_index = 0;
+  for (int i = 0; i < start_axis; ++i) {
+    top_shape[top_shape_index++] = bottom[0]->shape(i);
+  }
+  for (int i = 0; i < num_new_axes; ++i) {
+    top_shape[top_shape_index++] = top_blob_shape.dim(i);
+  }
+  for (int i = end_axis; i < bottom[0]->num_axes(); ++i) {
+    top_shape[top_shape_index++] = bottom[0]->shape(i);
+  }
+  CHECK_EQ(top_shape_index, top_shape.size());
+  for (int i = 0; i < copy_axes_.size(); ++i) {
+    const int copy_axis_index = copy_axes_[i];
+    CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index)
+        << "new shape contains a 0, but there was no corresponding bottom axis "
+        << "to copy";
+    top_shape[start_axis + copy_axis_index] = bottom[0]->shape(
+        start_axis + copy_axis_index);
+  }
+  if (inferred_axis_ >= 0) {
+    // A -1 dim was specified; infer the correct dimension by computing the
+    // product of the other dimensions.
+    int explicit_count = constant_count_;
+    explicit_count *= bottom[0]->count(0, start_axis);
+    explicit_count *= bottom[0]->count(end_axis);
+    for (int i = 0; i < copy_axes_.size(); ++i) {
+      const int copy_axis_index = copy_axes_[i];
+      explicit_count *= top_shape[start_axis + copy_axis_index];
+    }
+    CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count ("
+        << bottom[0]->count() << ") must be divisible by the product of "
+        << "the specified dimensions (" << explicit_count << ")";
+    const int inferred_dim = bottom[0]->count() / explicit_count;
+    top_shape[start_axis + inferred_axis_] = inferred_dim;
+  }
+  top[0]->Reshape(top_shape);
+  CHECK_EQ(top[0]->count(), bottom[0]->count())
+      << "output count must match input count";
+  top[0]->ShareData(*bottom[0]);
+  top[0]->ShareDiff(*bottom[0]);
 }
 
 INSTANTIATE_CLASS (ReshapeLayer);
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
index 2a6d99e2..4048a8e8 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -10,87 +10,88 @@ namespace caffe {
 
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::LayerSetUp(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::LayerSetUp(bottom, top);
-	sigmoid_bottom_vec_.clear();
-	sigmoid_bottom_vec_.push_back(bottom[0]);
-	sigmoid_top_vec_.clear();
-	sigmoid_top_vec_.push_back(sigmoid_output_.get());
-	sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_);
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::LayerSetUp(bottom, top);
+  sigmoid_bottom_vec_.clear();
+  sigmoid_bottom_vec_.push_back(bottom[0]);
+  sigmoid_top_vec_.clear();
+  sigmoid_top_vec_.push_back(sigmoid_output_.get());
+  sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_);
 }
 
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Reshape(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::Reshape(bottom, top);
-	CHECK_EQ(bottom[0]->count(), bottom[1]->count()) <<
-			"SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count.";
-	sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::Reshape(bottom, top);
+  CHECK_EQ(bottom[0]->count(), bottom[1]->count())
+      << "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count.";
+  sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
 }
 
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	// The forward pass computes the sigmoid outputs.
-	sigmoid_bottom_vec_[0] = bottom[0];
-	sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
-	// Compute the loss (negative log likelihood)
-	const int count = bottom[0]->count();
-	const int num = bottom[0]->num();
-	// Stable version of loss computation from input data
-	const Dtype* input_data = bottom[0]->cpu_data();
-	const Dtype* target = bottom[1]->cpu_data();
-	Dtype loss = 0;
-	for (int i = 0; i < count; ++i) {
-		loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
-				log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
-	}
-	top[0]->mutable_cpu_data()[0] = loss / num;
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  // The forward pass computes the sigmoid outputs.
+  sigmoid_bottom_vec_[0] = bottom[0];
+  sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
+  // Compute the loss (negative log likelihood)
+  const int count = bottom[0]->count();
+  const int num = bottom[0]->num();
+  // Stable version of loss computation from input data
+  const Dtype* input_data = bottom[0]->cpu_data();
+  const Dtype* target = bottom[1]->cpu_data();
+  Dtype loss = 0;
+  for (int i = 0; i < count; ++i) {
+    loss -= input_data[i] * (target[i] - (input_data[i] >= 0))
+        - log(
+            1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
+  }
+  top[0]->mutable_cpu_data()[0] = loss / num;
 }
 
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
-		const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[1]) {
-		LOG(FATAL) << this->type()
-				<< " Layer cannot backpropagate to label inputs.";
-	}
-	if (propagate_down[0]) {
-		// First, compute the diff
-		const int count = bottom[0]->count();
-		const int num = bottom[0]->num();
-		const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();
-		const Dtype* target = bottom[1]->cpu_data();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		caffe_sub(count, sigmoid_output_data, target, bottom_diff);
-		// Scale down gradient
-		const Dtype loss_weight = top[0]->cpu_diff()[0];
-		caffe_scal(count, loss_weight / num, bottom_diff);
-	}
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+        << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    // First, compute the diff
+    const int count = bottom[0]->count();
+    const int num = bottom[0]->num();
+    const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();
+    const Dtype* target = bottom[1]->cpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    caffe_sub(count, sigmoid_output_data, target, bottom_diff);
+    // Scale down gradient
+    const Dtype loss_weight = top[0]->cpu_diff()[0];
+    caffe_scal(count, loss_weight / num, bottom_diff);
+  }
 }
 
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
-		const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[1]) {
-		LOG(FATAL) << this->type()
-				<< " Layer cannot backpropagate to label inputs.";
-	}
-	if (propagate_down[0]) {
-		// First, compute the diff
-		const int count = bottom[0]->count();
-		const int num = bottom[0]->num();
-		const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
-		const Dtype* target = bottom[1]->gpu_data();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		caffe_gpu_copy(count, sigmoid_output_data, bottom_diff);
-		caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
-		// Scale down gradient
-		const Dtype loss_weight = top[0]->cpu_diff()[0];
-		caffe_gpu_scal(count, loss_weight / num, bottom_diff);
-	}
+    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+        << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    // First, compute the diff
+    const int count = bottom[0]->count();
+    const int num = bottom[0]->num();
+    const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
+    const Dtype* target = bottom[1]->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    caffe_gpu_copy(count, sigmoid_output_data, bottom_diff);
+    caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
+    // Scale down gradient
+    const Dtype loss_weight = top[0]->cpu_diff()[0];
+    caffe_gpu_scal(count, loss_weight / num, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index 833e1ced..a4359920 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -10,57 +10,56 @@ namespace caffe {
 
 template <typename Dtype>
 inline Dtype sigmoid(Dtype x) {
-	return 1. / (1. + exp(-x));
+  return 1. / (1. + exp(-x));
 }
 
 template <typename Dtype>
 void SigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const int count = bottom[0]->count();
-	for (int i = 0; i < count; ++i) {
-		top_data[i] = sigmoid(bottom_data[i]);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  for (int i = 0; i < count; ++i) {
+    top_data[i] = sigmoid(bottom_data[i]);
+  }
 }
 
 template <typename Dtype>
 void SigmoidLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* top_data = top[0]->cpu_data();
-		const Dtype* top_diff = top[0]->cpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		const int count = bottom[0]->count();
-		for (int i = 0; i < count; ++i) {
-			const Dtype sigmoid_x = top_data[i];
-			bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x);
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_data = top[0]->cpu_data();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const int count = bottom[0]->count();
+    for (int i = 0; i < count; ++i) {
+      const Dtype sigmoid_x = top_data[i];
+      bottom_diff[i] = top_diff[i] * sigmoid_x * (1. - sigmoid_x);
+    }
+  }
 }
 
 template <typename Dtype>
 void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	SigmoidForward(count, bottom_data, top_data);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  SigmoidForward(count, bottom_data, top_data);
 }
 
 template <typename Dtype>
 void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* top_data = top[0]->gpu_data();
-		const Dtype* top_diff = top[0]->gpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		const int count = bottom[0]->count();
-		// NOLINT_NEXT_LINE(whitespace/operators)
-		SigmoidBackward(count, top_diff, top_data, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_data = top[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    SigmoidBackward(count, top_diff, top_data, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp
index 502d0aab..1c463499 100644
--- a/src/caffe/layers/silence_layer.cpp
+++ b/src/caffe/layers/silence_layer.cpp
@@ -8,30 +8,29 @@ namespace caffe {
 
 template <typename Dtype>
 void SilenceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	for (int i = 0; i < bottom.size(); ++i) {
-		if (propagate_down[i]) {
-			caffe_set(bottom[i]->count(), Dtype(0),
-					bottom[i]->mutable_cpu_data());
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < bottom.size(); ++i) {
+    if (propagate_down[i]) {
+      caffe_set(bottom[i]->count(), Dtype(0), bottom[i]->mutable_cpu_data());
+    }
+  }
 }
 
 template <typename Dtype>
 void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	// Do nothing.
+    const vector<Blob<Dtype>*>& top) {
+  // Do nothing.
 }
 
 template <typename Dtype>
 void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	for (int i = 0; i < bottom.size(); ++i) {
-		if (propagate_down[i]) {
-			caffe_gpu_set(bottom[i]->count(), Dtype(0),
-					bottom[i]->mutable_gpu_data());
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  for (int i = 0; i < bottom.size(); ++i) {
+    if (propagate_down[i]) {
+      caffe_gpu_set(bottom[i]->count(), Dtype(0),
+          bottom[i]->mutable_gpu_data());
+    }
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index a005ceba..da4059a0 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -9,117 +9,116 @@ namespace caffe {
 
 template <typename Dtype>
 void SliceLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const SliceParameter& slice_param = this->layer_param_.slice_param();
-	CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim()))
-			<< "Either axis or slice_dim should be specified; not both.";
-	slice_point_.clear();
-	std::copy(slice_param.slice_point().begin(),
-			slice_param.slice_point().end(),
-			std::back_inserter(slice_point_));
+    const vector<Blob<Dtype>*>& top) {
+  const SliceParameter& slice_param = this->layer_param_.slice_param();
+  CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim()))
+      << "Either axis or slice_dim should be specified; not both.";
+  slice_point_.clear();
+  std::copy(slice_param.slice_point().begin(), slice_param.slice_point().end(),
+      std::back_inserter(slice_point_));
 }
 
 template <typename Dtype>
 void SliceLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const int num_axes = bottom[0]->num_axes();
-	const SliceParameter& slice_param = this->layer_param_.slice_param();
-	if (slice_param.has_slice_dim()) {
-		slice_axis_ = static_cast<int>(slice_param.slice_dim());
-		// Don't allow negative indexing for slice_dim, a uint32 -- almost
-		// certainly unintended.
-		CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 "
-				<< "produced negative result; slice_dim must satisfy "
-				<< "0 <= slice_dim < " << kMaxBlobAxes;
-		CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range.";
-	} else {
-		slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis());
-	}
-	vector<int> top_shape = bottom[0]->shape();
-	const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-	num_slices_ = bottom[0]->count(0, slice_axis_);
-	slice_size_ = bottom[0]->count(slice_axis_ + 1);
-	int count = 0;
-	if (slice_point_.size() != 0) {
-		CHECK_EQ(slice_point_.size(), top.size() - 1);
-		CHECK_LE(top.size(), bottom_slice_axis);
-		int prev = 0;
-		vector<int> slices;
-		for (int i = 0; i < slice_point_.size(); ++i) {
-			CHECK_GT(slice_point_[i], prev);
-			slices.push_back(slice_point_[i] - prev);
-			prev = slice_point_[i];
-		}
-		slices.push_back(bottom_slice_axis - prev);
-		for (int i = 0; i < top.size(); ++i) {
-			top_shape[slice_axis_] = slices[i];
-			top[i]->Reshape(top_shape);
-			count += top[i]->count();
-		}
-	} else {
-		CHECK_EQ(bottom_slice_axis % top.size(), 0)
-				<< "Number of top blobs (" << top.size() << ") should evenly "
-				<< "divide input slice axis (" << bottom_slice_axis << ")";
-		top_shape[slice_axis_] = bottom_slice_axis / top.size();
-		for (int i = 0; i < top.size(); ++i) {
-			top[i]->Reshape(top_shape);
-			count += top[i]->count();
-		}
-	}
-	CHECK_EQ(count, bottom[0]->count());
+    const vector<Blob<Dtype>*>& top) {
+  const int num_axes = bottom[0]->num_axes();
+  const SliceParameter& slice_param = this->layer_param_.slice_param();
+  if (slice_param.has_slice_dim()) {
+    slice_axis_ = static_cast<int>(slice_param.slice_dim());
+    // Don't allow negative indexing for slice_dim, a uint32 -- almost
+    // certainly unintended.
+    CHECK_GE(slice_axis_, 0) << "casting slice_dim from uint32 to int32 "
+        << "produced negative result; slice_dim must satisfy "
+        << "0 <= slice_dim < " << kMaxBlobAxes;
+    CHECK_LT(slice_axis_, num_axes) << "slice_dim out of range.";
+  } else {
+    slice_axis_ = bottom[0]->CanonicalAxisIndex(slice_param.axis());
+  }
+  vector<int> top_shape = bottom[0]->shape();
+  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+  num_slices_ = bottom[0]->count(0, slice_axis_);
+  slice_size_ = bottom[0]->count(slice_axis_ + 1);
+  int count = 0;
+  if (slice_point_.size() != 0) {
+    CHECK_EQ(slice_point_.size(), top.size() - 1);
+    CHECK_LE(top.size(), bottom_slice_axis);
+    int prev = 0;
+    vector<int> slices;
+    for (int i = 0; i < slice_point_.size(); ++i) {
+      CHECK_GT(slice_point_[i], prev);
+      slices.push_back(slice_point_[i] - prev);
+      prev = slice_point_[i];
+    }
+    slices.push_back(bottom_slice_axis - prev);
+    for (int i = 0; i < top.size(); ++i) {
+      top_shape[slice_axis_] = slices[i];
+      top[i]->Reshape(top_shape);
+      count += top[i]->count();
+    }
+  } else {
+    CHECK_EQ(bottom_slice_axis % top.size(), 0) << "Number of top blobs ("
+        << top.size() << ") should evenly " << "divide input slice axis ("
+        << bottom_slice_axis << ")";
+    top_shape[slice_axis_] = bottom_slice_axis / top.size();
+    for (int i = 0; i < top.size(); ++i) {
+      top[i]->Reshape(top_shape);
+      count += top[i]->count();
+    }
+  }
+  CHECK_EQ(count, bottom[0]->count());
 }
 
 template <typename Dtype>
 void SliceLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	int offset_slice_axis = 0;
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-	for (int i = 0; i < top.size(); ++i) {
-		Dtype* top_data = top[i]->mutable_cpu_data();
-		const int top_slice_axis = top[i]->shape(slice_axis_);
-		for (int n = 0; n < num_slices_; ++n) {
-			const int top_offset = n * top_slice_axis * slice_size_;
-			const int bottom_offset =
-					(n * bottom_slice_axis + offset_slice_axis) * slice_size_;
-			caffe_copy(top_slice_axis * slice_size_,
-					bottom_data + bottom_offset, top_data + top_offset);
-		}
-		offset_slice_axis += top_slice_axis;
-	}
+    const vector<Blob<Dtype>*>& top) {
+  int offset_slice_axis = 0;
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+  for (int i = 0; i < top.size(); ++i) {
+    Dtype* top_data = top[i]->mutable_cpu_data();
+    const int top_slice_axis = top[i]->shape(slice_axis_);
+    for (int n = 0; n < num_slices_; ++n) {
+      const int top_offset = n * top_slice_axis * slice_size_;
+      const int bottom_offset = (n * bottom_slice_axis + offset_slice_axis)
+          * slice_size_;
+      caffe_copy(top_slice_axis * slice_size_, bottom_data + bottom_offset,
+          top_data + top_offset);
+    }
+    offset_slice_axis += top_slice_axis;
+  }
 }
 
 template <typename Dtype>
 void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	int offset_slice_axis = 0;
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
-	for (int i = 0; i < top.size(); ++i) {
-		const Dtype* top_diff = top[i]->cpu_diff();
-		const int top_slice_axis = top[i]->shape(slice_axis_);
-		for (int n = 0; n < num_slices_; ++n) {
-			const int top_offset = n * top_slice_axis * slice_size_;
-			const int bottom_offset =
-					(n * bottom_slice_axis + offset_slice_axis) * slice_size_;
-			caffe_copy(top_slice_axis * slice_size_,
-					top_diff + top_offset, bottom_diff + bottom_offset);
-		}
-		offset_slice_axis += top_slice_axis;
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  int offset_slice_axis = 0;
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->cpu_diff();
+    const int top_slice_axis = top[i]->shape(slice_axis_);
+    for (int n = 0; n < num_slices_; ++n) {
+      const int top_offset = n * top_slice_axis * slice_size_;
+      const int bottom_offset = (n * bottom_slice_axis + offset_slice_axis)
+          * slice_size_;
+      caffe_copy(top_slice_axis * slice_size_, top_diff + top_offset,
+          bottom_diff + bottom_offset);
+    }
+    offset_slice_axis += top_slice_axis;
+  }
 }
 
 template <typename Dtype>
 void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
+    const vector<Blob<Dtype>*>& top) {
 }
 
 template <typename Dtype>
 void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index feb15321..92162821 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -9,19 +9,19 @@ namespace caffe {
 
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	softmax_axis_ =
-			bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
-	top[0]->ReshapeLike(*bottom[0]);
-	vector<int> mult_dims(1, bottom[0]->shape(softmax_axis_));
-	sum_multiplier_.Reshape(mult_dims);
-	Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
-	caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
-	outer_num_ = bottom[0]->count(0, softmax_axis_);
-	inner_num_ = bottom[0]->count(softmax_axis_ + 1);
-	vector<int> scale_dims = bottom[0]->shape();
-	scale_dims[softmax_axis_] = 1;
-	scale_.Reshape(scale_dims);
+    const vector<Blob<Dtype>*>& top) {
+  softmax_axis_ = bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.softmax_param().axis());
+  top[0]->ReshapeLike(*bottom[0]);
+  vector<int> mult_dims(1, bottom[0]->shape(softmax_axis_));
+  sum_multiplier_.Reshape(mult_dims);
+  Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
+  caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
+  outer_num_ = bottom[0]->count(0, softmax_axis_);
+  inner_num_ = bottom[0]->count(softmax_axis_ + 1);
+  vector<int> scale_dims = bottom[0]->shape();
+  scale_dims[softmax_axis_] = 1;
+  scale_.Reshape(scale_dims);
 }
 
 template <typename Dtype>
@@ -30,122 +30,120 @@ SoftmaxLayer<Dtype>::~SoftmaxLayer() {
 
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	Dtype* scale_data = scale_.mutable_cpu_data();
-	int channels = bottom[0]->shape(softmax_axis_);
-	int dim = bottom[0]->count() / outer_num_;
-	caffe_copy(bottom[0]->count(), bottom_data, top_data);
-	// We need to subtract the max to avoid numerical issues, compute the exp,
-	// and then normalize.
-	for (int i = 0; i < outer_num_; ++i) {
-		// initialize scale_data to the first plane
-		caffe_copy(inner_num_, bottom_data + i * dim, scale_data);
-		for (int j = 0; j < channels; j++) {
-			for (int k = 0; k < inner_num_; k++) {
-				scale_data[k] = std::max(scale_data[k],
-						bottom_data[i * dim + j * inner_num_ + k]);
-			}
-		}
-		// subtraction
-		caffe_cpu_gemm < Dtype > (CblasNoTrans, CblasNoTrans, channels, inner_num_,
-				1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
-		// exponentiation
-		caffe_exp < Dtype > (dim, top_data, top_data);
-		// sum after exp
-		caffe_cpu_gemv < Dtype > (CblasTrans, channels, inner_num_, 1.,
-				top_data, sum_multiplier_.cpu_data(), 0., scale_data);
-		// division
-		for (int j = 0; j < channels; j++) {
-			caffe_div(inner_num_, top_data, scale_data, top_data);
-			top_data += inner_num_;
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  Dtype* scale_data = scale_.mutable_cpu_data();
+  int channels = bottom[0]->shape(softmax_axis_);
+  int dim = bottom[0]->count() / outer_num_;
+  caffe_copy(bottom[0]->count(), bottom_data, top_data);
+  // We need to subtract the max to avoid numerical issues, compute the exp,
+  // and then normalize.
+  for (int i = 0; i < outer_num_; ++i) {
+    // initialize scale_data to the first plane
+    caffe_copy(inner_num_, bottom_data + i * dim, scale_data);
+    for (int j = 0; j < channels; j++) {
+      for (int k = 0; k < inner_num_; k++) {
+        scale_data[k] = std::max(scale_data[k],
+            bottom_data[i * dim + j * inner_num_ + k]);
+      }
+    }
+    // subtraction
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data);
+    // exponentiation
+    caffe_exp < Dtype > (dim, top_data, top_data);
+    // sum after exp
+    caffe_cpu_gemv < Dtype
+        > (CblasTrans, channels, inner_num_, 1., top_data, sum_multiplier_.cpu_data(), 0., scale_data);
+    // division
+    for (int j = 0; j < channels; j++) {
+      caffe_div(inner_num_, top_data, scale_data, top_data);
+      top_data += inner_num_;
+    }
+  }
 }
 
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* top_diff = top[0]->cpu_diff();
-	const Dtype* top_data = top[0]->cpu_data();
-	Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-	Dtype* scale_data = scale_.mutable_cpu_data();
-	int channels = top[0]->shape(softmax_axis_);
-	int dim = top[0]->count() / outer_num_;
-	caffe_copy(top[0]->count(), top_diff, bottom_diff);
-	for (int i = 0; i < outer_num_; ++i) {
-		// compute dot(top_diff, top_data) and subtract them from the bottom diff
-		for (int k = 0; k < inner_num_; ++k) {
-			scale_data[k] = caffe_cpu_strided_dot < Dtype > (channels,
-					bottom_diff + i * dim + k, inner_num_,
-					top_data + i * dim + k, inner_num_);
-		}
-		// subtraction
-		caffe_cpu_gemm < Dtype
-				> (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1,
-						-1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff
-								+ i * dim);
-	}
-	// elementwise multiplication
-	caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->cpu_diff();
+  const Dtype* top_data = top[0]->cpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  Dtype* scale_data = scale_.mutable_cpu_data();
+  int channels = top[0]->shape(softmax_axis_);
+  int dim = top[0]->count() / outer_num_;
+  caffe_copy(top[0]->count(), top_diff, bottom_diff);
+  for (int i = 0; i < outer_num_; ++i) {
+    // compute dot(top_diff, top_data) and subtract them from the bottom diff
+    for (int k = 0; k < inner_num_; ++k) {
+      scale_data[k] = caffe_cpu_strided_dot < Dtype
+          > (channels, bottom_diff + i * dim + k, inner_num_, top_data + i * dim
+              + k, inner_num_);
+    }
+    // subtraction
+    caffe_cpu_gemm < Dtype
+        > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff
+            + i * dim);
+  }
+  // elementwise multiplication
+  caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
 }
 // begin: code written/modified by AMD
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	Dtype* scale_data = scale_.mutable_gpu_data();
-	int count = bottom[0]->count();
-	int channels = top[0]->shape(softmax_axis_);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  Dtype* scale_data = scale_.mutable_gpu_data();
+  int count = bottom[0]->count();
+  int channels = top[0]->shape(softmax_axis_);
 
-	caffe_gpu_copy(count, bottom_data, top_data);
-	// We need to subtract the max to avoid numerical issues, compute the exp,
-	// and then normalize.
-	// compute max
-	// NOLINT_NEXT_LINE(whitespace/operators)
+  caffe_gpu_copy(count, bottom_data, top_data);
+  // We need to subtract the max to avoid numerical issues, compute the exp,
+  // and then normalize.
+  // compute max
+  // NOLINT_NEXT_LINE(whitespace/operators)
 
-	kernel_channel_max < Dtype > (outer_num_, channels, inner_num_, top_data,
-			scale_data);
-	// subtract
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_,
-			scale_data, top_data);
-	// exponentiate
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_exp < Dtype > (count, top_data, top_data);
-	// sum after exp
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_channel_sum < Dtype > (outer_num_, channels, inner_num_, top_data,
-			scale_data);
-	// divide
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_channel_div < Dtype > (count, outer_num_, channels, inner_num_,
-			scale_data, top_data);
+  kernel_channel_max < Dtype
+      > (outer_num_, channels, inner_num_, top_data, scale_data);
+  // subtract
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_channel_subtract < Dtype
+      > (count, outer_num_, channels, inner_num_, scale_data, top_data);
+  // exponentiate
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_exp < Dtype > (count, top_data, top_data);
+  // sum after exp
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_channel_sum < Dtype
+      > (outer_num_, channels, inner_num_, top_data, scale_data);
+  // divide
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_channel_div < Dtype
+      > (count, outer_num_, channels, inner_num_, scale_data, top_data);
 }
 
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	const Dtype* top_diff = top[0]->gpu_diff();
-	const Dtype* top_data = top[0]->gpu_data();
-	Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-	Dtype* scale_data = scale_.mutable_gpu_data();
-	int count = top[0]->count();
-	int channels = top[0]->shape(softmax_axis_);
-	caffe_gpu_copy(count, top_diff, bottom_diff);
-	// Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
-	// NOLINT_NEXT_LINE(whitespace/operators)
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* top_diff = top[0]->gpu_diff();
+  const Dtype* top_data = top[0]->gpu_data();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  Dtype* scale_data = scale_.mutable_gpu_data();
+  int count = top[0]->count();
+  int channels = top[0]->shape(softmax_axis_);
+  caffe_gpu_copy(count, top_diff, bottom_diff);
+  // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
+  // NOLINT_NEXT_LINE(whitespace/operators)
 
-	kernel_channel_dot < Dtype > (outer_num_, channels, inner_num_,
-			top_diff, top_data, scale_data);
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_channel_subtract < Dtype > (count, outer_num_, channels, inner_num_,
-			scale_data, bottom_diff);
-	// elementwise multiplication
-	caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff);
+  kernel_channel_dot < Dtype
+      > (outer_num_, channels, inner_num_, top_diff, top_data, scale_data);
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_channel_subtract < Dtype
+      > (count, outer_num_, channels, inner_num_, scale_data, bottom_diff);
+  // elementwise multiplication
+  caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff);
 
 }
 // end: code written/modified by AMD
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 6b9e9e67..62c10e30 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -10,32 +10,31 @@
 namespace caffe {
 
 template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::LayerSetUp(bottom, top);
-	LayerParameter softmax_param(this->layer_param_);
-	softmax_param.set_type("Softmax");
-	softmax_layer_ = LayerRegistry < Dtype > ::CreateLayer(softmax_param);
-	softmax_bottom_vec_.clear();
-	softmax_bottom_vec_.push_back(bottom[0]);
-	softmax_top_vec_.clear();
-	softmax_top_vec_.push_back(&prob_);
-	softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);
+void SoftmaxWithLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::LayerSetUp(bottom, top);
+  LayerParameter softmax_param(this->layer_param_);
+  softmax_param.set_type("Softmax");
+  softmax_layer_ = LayerRegistry < Dtype > ::CreateLayer(softmax_param);
+  softmax_bottom_vec_.clear();
+  softmax_bottom_vec_.push_back(bottom[0]);
+  softmax_top_vec_.clear();
+  softmax_top_vec_.push_back(&prob_);
+  softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);
 
-	has_ignore_label_ =
-			this->layer_param_.loss_param().has_ignore_label();
-	if (has_ignore_label_) {
-		ignore_label_ = this->layer_param_.loss_param().ignore_label();
-	}
-	normalize_ = this->layer_param_.loss_param().normalize();
+  has_ignore_label_ = this->layer_param_.loss_param().has_ignore_label();
+  if (has_ignore_label_) {
+    ignore_label_ = this->layer_param_.loss_param().ignore_label();
+  }
+  normalize_ = this->layer_param_.loss_param().normalize();
 
-	ocl_setup();
+  ocl_setup();
 }
 
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::ocl_setup() {
-	d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
-			sizeof(Dtype), NULL, NULL);
+  d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+      sizeof(Dtype), NULL, NULL);
 
 }
 
@@ -44,160 +43,161 @@ SoftmaxWithLossLayer<Dtype>::~SoftmaxWithLossLayer() {
 }
 
 template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::Reshape(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	LossLayer < Dtype > ::Reshape(bottom, top);
-	softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);
-	softmax_axis_ =
-			bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis());
-	outer_num_ = bottom[0]->count(0, softmax_axis_);
-	inner_num_ = bottom[0]->count(softmax_axis_ + 1);
-	CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
-			<< "Number of labels must match number of predictions; "
-			<< "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), "
-			<< "label count (number of labels) must be N*H*W, "
-			<< "with integer values in {0, 1, ..., C-1}.";
-	if (top.size() >= 2) {
-		// softmax output
-		top[1]->ReshapeLike(*bottom[0]);
-	}
+void SoftmaxWithLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  LossLayer < Dtype > ::Reshape(bottom, top);
+  softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);
+  softmax_axis_ = bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.softmax_param().axis());
+  outer_num_ = bottom[0]->count(0, softmax_axis_);
+  inner_num_ = bottom[0]->count(softmax_axis_ + 1);
+  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
+      << "Number of labels must match number of predictions; "
+      << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), "
+      << "label count (number of labels) must be N*H*W, "
+      << "with integer values in {0, 1, ..., C-1}.";
+  if (top.size() >= 2) {
+    // softmax output
+    top[1]->ReshapeLike(*bottom[0]);
+  }
 }
 
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	// The forward pass computes the softmax prob values.
-	softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
-	const Dtype* prob_data = prob_.cpu_data();
-	const Dtype* label = bottom[1]->cpu_data();
-	int dim = prob_.count() / outer_num_;
-	int count = 0;
-	Dtype loss = 0;
-	for (int i = 0; i < outer_num_; ++i) {
-		for (int j = 0; j < inner_num_; j++) {
-			const int label_value = static_cast<int>(label[i * inner_num_ + j]);
-			if (has_ignore_label_ && label_value == ignore_label_) {
-				continue;
-			}
-			DCHECK_GE(label_value, 0);
-			DCHECK_LT(label_value, prob_.shape(softmax_axis_));
-			loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j],
-					Dtype(FLT_MIN)));
-			++count;
-		}
-	}
-	if (normalize_) {
-		top[0]->mutable_cpu_data()[0] = loss / count;
-	} else {
-		top[0]->mutable_cpu_data()[0] = loss / outer_num_;
-	}
-	if (top.size() == 2) {
-		top[1]->ShareData(prob_);
-	}
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  // The forward pass computes the softmax prob values.
+  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+  const Dtype* prob_data = prob_.cpu_data();
+  const Dtype* label = bottom[1]->cpu_data();
+  int dim = prob_.count() / outer_num_;
+  int count = 0;
+  Dtype loss = 0;
+  for (int i = 0; i < outer_num_; ++i) {
+    for (int j = 0; j < inner_num_; j++) {
+      const int label_value = static_cast<int>(label[i * inner_num_ + j]);
+      if (has_ignore_label_ && label_value == ignore_label_) {
+        continue;
+      }
+      DCHECK_GE(label_value, 0);
+      DCHECK_LT(label_value, prob_.shape(softmax_axis_));
+      loss -= log(
+          std::max(prob_data[i * dim + label_value * inner_num_ + j],
+              Dtype(FLT_MIN)));
+      ++count;
+    }
+  }
+  if (normalize_) {
+    top[0]->mutable_cpu_data()[0] = loss / count;
+  } else {
+    top[0]->mutable_cpu_data()[0] = loss / outer_num_;
+  }
+  if (top.size() == 2) {
+    top[1]->ShareData(prob_);
+  }
 }
 
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[1]) {
-		LOG(FATAL) << this->type()
-				<< " Layer cannot backpropagate to label inputs.";
-	}
-	if (propagate_down[0]) {
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		const Dtype* prob_data = prob_.cpu_data();
-		caffe_copy(prob_.count(), prob_data, bottom_diff);
-		const Dtype* label = bottom[1]->cpu_data();
-		int dim = prob_.count() / outer_num_;
-		int count = 0;
-		for (int i = 0; i < outer_num_; ++i) {
-			for (int j = 0; j < inner_num_; ++j) {
-				const int label_value = static_cast<int>(label[i * inner_num_ + j]);
-				if (has_ignore_label_ && label_value == ignore_label_) {
-					for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
-						bottom_diff[i * dim + c * inner_num_ + j] = 0;
-					}
-				} else {
-					bottom_diff[i * dim + label_value * inner_num_ + j] -= 1;
-					++count;
-				}
-			}
-		}
-		// Scale gradient
-		const Dtype loss_weight = top[0]->cpu_diff()[0];
-		if (normalize_) {
-			caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
-		} else {
-			caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+        << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const Dtype* prob_data = prob_.cpu_data();
+    caffe_copy(prob_.count(), prob_data, bottom_diff);
+    const Dtype* label = bottom[1]->cpu_data();
+    int dim = prob_.count() / outer_num_;
+    int count = 0;
+    for (int i = 0; i < outer_num_; ++i) {
+      for (int j = 0; j < inner_num_; ++j) {
+        const int label_value = static_cast<int>(label[i * inner_num_ + j]);
+        if (has_ignore_label_ && label_value == ignore_label_) {
+          for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
+            bottom_diff[i * dim + c * inner_num_ + j] = 0;
+          }
+        } else {
+          bottom_diff[i * dim + label_value * inner_num_ + j] -= 1;
+          ++count;
+        }
+      }
+    }
+    // Scale gradient
+    const Dtype loss_weight = top[0]->cpu_diff()[0];
+    if (normalize_) {
+      caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
+    } else {
+      caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
+    }
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
-		const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-	softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
-	const Dtype* prob_data = prob_.gpu_data();
-	const Dtype* label = bottom[1]->gpu_data();
-	const int dim = prob_.count() / outer_num_;
-	const int nthreads = outer_num_ * inner_num_;
-	// Since this memory is not used for anything until it is overwritten
-	// on the backward pass, we use it here to avoid having to allocate new GPU
-	// memory to accumulate intermediate results in the kernel.
-	Dtype* loss_data = bottom[0]->mutable_gpu_diff();
-	// Similarly, this memory is never used elsewhere, and thus we can use it
-	// to avoid having to allocate additional GPU memory.
-	Dtype* counts = prob_.mutable_gpu_diff();
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	SoftmaxLossForwardGPU < Dtype > (nthreads, prob_data, label, loss_data,
-			outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-	Dtype loss;
-	caffe_gpu_asum(nthreads, loss_data, &loss);
-	if (normalize_) {
-		Dtype count;
-		caffe_gpu_asum(nthreads, counts, &count);
-		loss /= count;
-	} else {
-		loss /= outer_num_;
-	}
-	top[0]->mutable_cpu_data()[0] = loss;
-	if (top.size() == 2) {
-		top[1]->ShareData(prob_);
-	}
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+  const Dtype* prob_data = prob_.gpu_data();
+  const Dtype* label = bottom[1]->gpu_data();
+  const int dim = prob_.count() / outer_num_;
+  const int nthreads = outer_num_ * inner_num_;
+  // Since this memory is not used for anything until it is overwritten
+  // on the backward pass, we use it here to avoid having to allocate new GPU
+  // memory to accumulate intermediate results in the kernel.
+  Dtype* loss_data = bottom[0]->mutable_gpu_diff();
+  // Similarly, this memory is never used elsewhere, and thus we can use it
+  // to avoid having to allocate additional GPU memory.
+  Dtype* counts = prob_.mutable_gpu_diff();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  SoftmaxLossForwardGPU < Dtype
+      > (nthreads, prob_data, label, loss_data, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
+  Dtype loss;
+  caffe_gpu_asum(nthreads, loss_data, &loss);
+  if (normalize_) {
+    Dtype count;
+    caffe_gpu_asum(nthreads, counts, &count);
+    loss /= count;
+  } else {
+    loss /= outer_num_;
+  }
+  top[0]->mutable_cpu_data()[0] = loss;
+  if (top.size() == 2) {
+    top[1]->ShareData(prob_);
+  }
 }
 
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[1]) {
-		LOG(FATAL) << this->type()
-				<< " Layer cannot backpropagate to label inputs.";
-	}
-	if (propagate_down[0]) {
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		const Dtype* prob_data = prob_.gpu_data();
-		const Dtype* top_data = top[0]->gpu_data();
-		caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
-		//caffe_gpu_copy(prob_.count(), prob_data, bottom_diff);
-		const Dtype* label = bottom[1]->gpu_data();
-		const int dim = prob_.count() / outer_num_;
-		const int nthreads = outer_num_ * inner_num_;
-		// Since this memory is never used for anything else,
-		// we use to to avoid allocating new GPU memory.
-		Dtype* counts = prob_.mutable_gpu_diff();
-		// NOLINT_NEXT_LINE(whitespace/operators)
-		SoftmaxLossBackwardGPU < Dtype > (nthreads, top_data, label, bottom_diff,
-				outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
-		const Dtype loss_weight = top[0]->cpu_diff()[0];
-		if (normalize_) {
-			Dtype count;
-			caffe_gpu_asum(nthreads, counts, &count);
-			caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
-		} else {
-			caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {
+    LOG(FATAL) << this->type()
+        << " Layer cannot backpropagate to label inputs.";
+  }
+  if (propagate_down[0]) {
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const Dtype* prob_data = prob_.gpu_data();
+    const Dtype* top_data = top[0]->gpu_data();
+    caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
+    //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff);
+    const Dtype* label = bottom[1]->gpu_data();
+    const int dim = prob_.count() / outer_num_;
+    const int nthreads = outer_num_ * inner_num_;
+    // Since this memory is never used for anything else,
+    // we use to to avoid allocating new GPU memory.
+    Dtype* counts = prob_.mutable_gpu_diff();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    SoftmaxLossBackwardGPU < Dtype
+        > (nthreads, top_data, label, bottom_diff, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
+    const Dtype loss_weight = top[0]->cpu_diff()[0];
+    if (normalize_) {
+      Dtype count;
+      caffe_gpu_asum(nthreads, counts, &count);
+      caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
+    } else {
+      caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
+    }
+  }
 }
 // end: code written/modified by AMD
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 54bea0d6..7a40bf8a 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -8,78 +8,78 @@ namespace caffe {
 
 template <typename Dtype>
 void SplitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	count_ = bottom[0]->count();
-	for (int i = 0; i < top.size(); ++i) {
-		// Do not allow in-place computation in the SplitLayer.  Instead, share data
-		// by reference in the forward pass, and keep separate diff allocations in
-		// the backward pass.  (Technically, it should be possible to share the diff
-		// blob of the first split output with the input, but this seems to cause
-		// some strange effects in practice...)
-		CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not "
-				"allow in-place computation.";
-		top[i]->ReshapeLike(*bottom[0]);
-		CHECK_EQ(count_, top[i]->count());
-	}
-	gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float",
-			NULL);
+    const vector<Blob<Dtype>*>& top) {
+  count_ = bottom[0]->count();
+  for (int i = 0; i < top.size(); ++i) {
+    // Do not allow in-place computation in the SplitLayer.  Instead, share data
+    // by reference in the forward pass, and keep separate diff allocations in
+    // the backward pass.  (Technically, it should be possible to share the diff
+    // blob of the first split output with the input, but this seems to cause
+    // some strange effects in practice...)
+    CHECK_NE(top[i], bottom[0]) << this->type() << " Layer does not "
+        "allow in-place computation.";
+    top[i]->ReshapeLike(*bottom[0]);
+    CHECK_EQ(count_, top[i]->count());
+  }
+  gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float",
+      NULL);
 }
 
 template <typename Dtype>
 void SplitLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	for (int i = 0; i < top.size(); ++i) {
-		top[i]->ShareData(*bottom[0]);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  for (int i = 0; i < top.size(); ++i) {
+    top[i]->ShareData(*bottom[0]);
+  }
 }
 
 template <typename Dtype>
 void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	if (top.size() == 1) {
-		caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff());
-		return;
-	}
-	caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(),
-			bottom[0]->mutable_cpu_diff());
-	// Add remaining top blob diffs.
-	for (int i = 2; i < top.size(); ++i) {
-		const Dtype* top_diff = top[i]->cpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  if (top.size() == 1) {
+    caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff());
+    return;
+  }
+  caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(),
+      bottom[0]->mutable_cpu_diff());
+  // Add remaining top blob diffs.
+  for (int i = 2; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    caffe_axpy(count_, Dtype(1.), top_diff, bottom_diff);
+  }
 }
 
 template <typename Dtype>
 void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	for (int i = 0; i < top.size(); ++i) {
-		top[i]->ShareData(*bottom[0]);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  for (int i = 0; i < top.size(); ++i) {
+    top[i]->ShareData(*bottom[0]);
+  }
 }
 
 // begin: code written/modified by AMD
 template <typename Dtype>
 void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	if (top.size() == 1) {
-		caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
-		return;
-	}
-	caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
-			bottom[0]->mutable_gpu_diff());
-	// Add remaining top blob diffs.
-	for (int i = 2; i < top.size(); ++i) {
-		const Dtype* top_diff = top[i]->gpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  if (top.size() == 1) {
+    caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff());
+    return;
+  }
+  caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(),
+      bottom[0]->mutable_gpu_diff());
+  // Add remaining top blob diffs.
+  for (int i = 2; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
+  }
 }
 // end: code written/modified by AMD
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp
index 4c630fb7..d552af61 100644
--- a/src/caffe/layers/spp_layer.cpp
+++ b/src/caffe/layers/spp_layer.cpp
@@ -15,175 +15,172 @@ using std::max;
 
 template <typename Dtype>
 LayerParameter SPPLayer<Dtype>::GetPoolingParam(const int pyramid_level,
-		const int bottom_h, const int bottom_w, const SPPParameter spp_param) {
-	LayerParameter pooling_param;
-	int num_bins = pow(2, pyramid_level);
-
-	// find padding and kernel size so that the pooling is
-	// performed across the entire image
-	int kernel_h = ceil(bottom_h / static_cast<double>(num_bins));
-	// remainder_h is the min number of pixels that need to be padded before
-	// entire image height is pooled over with the chosen kernel dimension
-	int remainder_h = kernel_h * num_bins - bottom_h;
-	// pooling layer pads (2 * pad_h) pixels on the top and bottom of the
-	// image.
-	int pad_h = (remainder_h + 1) / 2;
-
-	// similar logic for width
-	int kernel_w = ceil(bottom_w / static_cast<double>(num_bins));
-	int remainder_w = kernel_w * num_bins - bottom_w;
-	int pad_w = (remainder_w + 1) / 2;
-
-	pooling_param.mutable_pooling_param()->set_pad_h(pad_h);
-	pooling_param.mutable_pooling_param()->set_pad_w(pad_w);
-	pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h);
-	pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w);
-	pooling_param.mutable_pooling_param()->set_stride_h(kernel_h);
-	pooling_param.mutable_pooling_param()->set_stride_w(kernel_w);
-
-	switch (spp_param.pool()) {
-		case SPPParameter_PoolMethod_MAX:
-			pooling_param.mutable_pooling_param()->set_pool(
-					PoolingParameter_PoolMethod_MAX);
-			break;
-		case SPPParameter_PoolMethod_AVE:
-			pooling_param.mutable_pooling_param()->set_pool(
-					PoolingParameter_PoolMethod_AVE);
-			break;
-		case SPPParameter_PoolMethod_STOCHASTIC:
-			pooling_param.mutable_pooling_param()->set_pool(
-					PoolingParameter_PoolMethod_STOCHASTIC);
-			break;
-		default:
-			LOG(FATAL) << "Unknown pooling method.";
-	}
-
-	return pooling_param;
+    const int bottom_h, const int bottom_w, const SPPParameter spp_param) {
+  LayerParameter pooling_param;
+  int num_bins = pow(2, pyramid_level);
+
+  // find padding and kernel size so that the pooling is
+  // performed across the entire image
+  int kernel_h = ceil(bottom_h / static_cast<double>(num_bins));
+  // remainder_h is the min number of pixels that need to be padded before
+  // entire image height is pooled over with the chosen kernel dimension
+  int remainder_h = kernel_h * num_bins - bottom_h;
+  // pooling layer pads (2 * pad_h) pixels on the top and bottom of the
+  // image.
+  int pad_h = (remainder_h + 1) / 2;
+
+  // similar logic for width
+  int kernel_w = ceil(bottom_w / static_cast<double>(num_bins));
+  int remainder_w = kernel_w * num_bins - bottom_w;
+  int pad_w = (remainder_w + 1) / 2;
+
+  pooling_param.mutable_pooling_param()->set_pad_h(pad_h);
+  pooling_param.mutable_pooling_param()->set_pad_w(pad_w);
+  pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h);
+  pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w);
+  pooling_param.mutable_pooling_param()->set_stride_h(kernel_h);
+  pooling_param.mutable_pooling_param()->set_stride_w(kernel_w);
+
+  switch (spp_param.pool()) {
+  case SPPParameter_PoolMethod_MAX:
+    pooling_param.mutable_pooling_param()->set_pool(
+        PoolingParameter_PoolMethod_MAX);
+    break;
+  case SPPParameter_PoolMethod_AVE:
+    pooling_param.mutable_pooling_param()->set_pool(
+        PoolingParameter_PoolMethod_AVE);
+    break;
+  case SPPParameter_PoolMethod_STOCHASTIC:
+    pooling_param.mutable_pooling_param()->set_pool(
+        PoolingParameter_PoolMethod_STOCHASTIC);
+    break;
+  default:
+    LOG(FATAL) << "Unknown pooling method.";
+  }
+
+  return pooling_param;
 }
 
 template <typename Dtype>
 void SPPLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	SPPParameter spp_param = this->layer_param_.spp_param();
-
-	bottom_h_ = bottom[0]->height();
-	bottom_w_ = bottom[0]->width();
-	CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero.";
-	CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero.";
-
-	pyramid_height_ = spp_param.pyramid_height();
-	split_top_vec_.clear();
-	pooling_bottom_vecs_.clear();
-	pooling_layers_.clear();
-	pooling_top_vecs_.clear();
-	pooling_outputs_.clear();
-	flatten_layers_.clear();
-	flatten_top_vecs_.clear();
-	flatten_outputs_.clear();
-	concat_bottom_vec_.clear();
-
-	// split layer output holders setup
-	for (int i = 0; i < pyramid_height_; i++) {
-		split_top_vec_.push_back(new Blob<Dtype>());
-	}
-
-	// split layer setup
-	LayerParameter split_param;
-	split_layer_.reset(new SplitLayer<Dtype>(split_param));
-	split_layer_->SetUp(bottom, split_top_vec_);
-
-	for (int i = 0; i < pyramid_height_; i++) {
-		// pooling layer input holders setup
-		pooling_bottom_vecs_.push_back(new vector<Blob<Dtype>*>);
-		pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]);
-
-		// pooling layer output holders setup
-		pooling_outputs_.push_back(new Blob<Dtype>());
-		pooling_top_vecs_.push_back(new vector<Blob<Dtype>*>);
-		pooling_top_vecs_[i]->push_back(pooling_outputs_[i]);
-
-		// pooling layer setup
-		LayerParameter pooling_param = GetPoolingParam(
-				i, bottom_h_, bottom_w_, spp_param);
-
-		pooling_layers_.push_back(shared_ptr < PoolingLayer<Dtype> > (
-				new PoolingLayer<Dtype>(pooling_param)));
-		pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
-
-		// flatten layer output holders setup
-		flatten_outputs_.push_back(new Blob<Dtype>());
-		flatten_top_vecs_.push_back(new vector<Blob<Dtype>*>);
-		flatten_top_vecs_[i]->push_back(flatten_outputs_[i]);
-
-		// flatten layer setup
-		LayerParameter flatten_param;
-		flatten_layers_.push_back(new FlattenLayer<Dtype>(flatten_param));
-		flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
-
-		// concat layer input holders setup
-		concat_bottom_vec_.push_back(flatten_outputs_[i]);
-	}
-
-	// concat layer setup
-	LayerParameter concat_param;
-	concat_layer_.reset(new ConcatLayer<Dtype>(concat_param));
-	concat_layer_->SetUp(concat_bottom_vec_, top);
+    const vector<Blob<Dtype>*>& top) {
+  SPPParameter spp_param = this->layer_param_.spp_param();
+
+  bottom_h_ = bottom[0]->height();
+  bottom_w_ = bottom[0]->width();
+  CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero.";
+  CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero.";
+
+  pyramid_height_ = spp_param.pyramid_height();
+  split_top_vec_.clear();
+  pooling_bottom_vecs_.clear();
+  pooling_layers_.clear();
+  pooling_top_vecs_.clear();
+  pooling_outputs_.clear();
+  flatten_layers_.clear();
+  flatten_top_vecs_.clear();
+  flatten_outputs_.clear();
+  concat_bottom_vec_.clear();
+
+  // split layer output holders setup
+  for (int i = 0; i < pyramid_height_; i++) {
+    split_top_vec_.push_back(new Blob<Dtype>());
+  }
+
+  // split layer setup
+  LayerParameter split_param;
+  split_layer_.reset(new SplitLayer<Dtype>(split_param));
+  split_layer_->SetUp(bottom, split_top_vec_);
+
+  for (int i = 0; i < pyramid_height_; i++) {
+    // pooling layer input holders setup
+    pooling_bottom_vecs_.push_back(new vector<Blob<Dtype>*>);
+    pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]);
+
+    // pooling layer output holders setup
+    pooling_outputs_.push_back(new Blob<Dtype>());
+    pooling_top_vecs_.push_back(new vector<Blob<Dtype>*>);
+    pooling_top_vecs_[i]->push_back(pooling_outputs_[i]);
+
+    // pooling layer setup
+    LayerParameter pooling_param = GetPoolingParam(i, bottom_h_, bottom_w_,
+        spp_param);
+
+    pooling_layers_.push_back(
+        shared_ptr < PoolingLayer<Dtype>
+            > (new PoolingLayer<Dtype>(pooling_param)));
+    pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+
+    // flatten layer output holders setup
+    flatten_outputs_.push_back(new Blob<Dtype>());
+    flatten_top_vecs_.push_back(new vector<Blob<Dtype>*>);
+    flatten_top_vecs_[i]->push_back(flatten_outputs_[i]);
+
+    // flatten layer setup
+    LayerParameter flatten_param;
+    flatten_layers_.push_back(new FlattenLayer<Dtype>(flatten_param));
+    flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
+
+    // concat layer input holders setup
+    concat_bottom_vec_.push_back(flatten_outputs_[i]);
+  }
+
+  // concat layer setup
+  LayerParameter concat_param;
+  concat_layer_.reset(new ConcatLayer<Dtype>(concat_param));
+  concat_layer_->SetUp(concat_bottom_vec_, top);
 }
 
 template <typename Dtype>
 void SPPLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-			<< "corresponding to (num, channels, height, width)";
-	channels_ = bottom[0]->channels();
-	bottom_h_ = bottom[0]->height();
-	bottom_w_ = bottom[0]->width();
-	SPPParameter spp_param = this->layer_param_.spp_param();
-	split_layer_->Reshape(bottom, split_top_vec_);
-	for (int i = 0; i < pyramid_height_; i++) {
-		LayerParameter pooling_param = GetPoolingParam(
-				i, bottom_h_, bottom_w_, spp_param);
-
-		pooling_layers_[i].reset(
-				new PoolingLayer<Dtype>(pooling_param));
-		pooling_layers_[i]->SetUp(
-				*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
-		pooling_layers_[i]->Reshape(
-				*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
-		flatten_layers_[i]->Reshape(
-				*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
-	}
-	concat_layer_->Reshape(concat_bottom_vec_, top);
+    const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+      << "corresponding to (num, channels, height, width)";
+  channels_ = bottom[0]->channels();
+  bottom_h_ = bottom[0]->height();
+  bottom_w_ = bottom[0]->width();
+  SPPParameter spp_param = this->layer_param_.spp_param();
+  split_layer_->Reshape(bottom, split_top_vec_);
+  for (int i = 0; i < pyramid_height_; i++) {
+    LayerParameter pooling_param = GetPoolingParam(i, bottom_h_, bottom_w_,
+        spp_param);
+
+    pooling_layers_[i].reset(new PoolingLayer<Dtype>(pooling_param));
+    pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
+    pooling_layers_[i]->Reshape(*pooling_bottom_vecs_[i],
+        *pooling_top_vecs_[i]);
+    flatten_layers_[i]->Reshape(*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
+  }
+  concat_layer_->Reshape(concat_bottom_vec_, top);
 }
 
 template <typename Dtype>
 void SPPLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	split_layer_->Forward(bottom, split_top_vec_);
-	for (int i = 0; i < pyramid_height_; i++) {
-		pooling_layers_[i]->Forward(
-				*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
-		flatten_layers_[i]->Forward(
-				*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
-	}
-	concat_layer_->Forward(concat_bottom_vec_, top);
+    const vector<Blob<Dtype>*>& top) {
+  split_layer_->Forward(bottom, split_top_vec_);
+  for (int i = 0; i < pyramid_height_; i++) {
+    pooling_layers_[i]->Forward(*pooling_bottom_vecs_[i],
+        *pooling_top_vecs_[i]);
+    flatten_layers_[i]->Forward(*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
+  }
+  concat_layer_->Forward(concat_bottom_vec_, top);
 }
 
 template <typename Dtype>
 void SPPLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (!propagate_down[0]) {
-		return;
-	}
-	vector<bool> concat_propagate_down(pyramid_height_, true);
-	concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_);
-	for (int i = 0; i < pyramid_height_; i++) {
-		flatten_layers_[i]->Backward(
-				*flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]);
-		pooling_layers_[i]->Backward(
-				*pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]);
-	}
-	split_layer_->Backward(split_top_vec_, propagate_down, bottom);
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) {
+    return;
+  }
+  vector<bool> concat_propagate_down(pyramid_height_, true);
+  concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_);
+  for (int i = 0; i < pyramid_height_; i++) {
+    flatten_layers_[i]->Backward(*flatten_top_vecs_[i], propagate_down,
+        *pooling_top_vecs_[i]);
+    pooling_layers_[i]->Backward(*pooling_top_vecs_[i], propagate_down,
+        *pooling_bottom_vecs_[i]);
+  }
+  split_layer_->Backward(split_top_vec_, propagate_down, bottom);
 }
 
 INSTANTIATE_CLASS (SPPLayer);
diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp
index 52a8a8c7..3e85330c 100644
--- a/src/caffe/layers/tanh_layer.cpp
+++ b/src/caffe/layers/tanh_layer.cpp
@@ -12,53 +12,52 @@ namespace caffe {
 
 template <typename Dtype>
 void TanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const int count = bottom[0]->count();
-	for (int i = 0; i < count; ++i) {
-		top_data[i] = tanh(bottom_data[i]);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  for (int i = 0; i < count; ++i) {
+    top_data[i] = tanh(bottom_data[i]);
+  }
 }
 
 template <typename Dtype>
 void TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* top_data = top[0]->cpu_data();
-		const Dtype* top_diff = top[0]->cpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-		const int count = bottom[0]->count();
-		Dtype tanhx;
-		for (int i = 0; i < count; ++i) {
-			tanhx = top_data[i];
-			bottom_diff[i] = top_diff[i] * (1 - tanhx * tanhx);
-		}
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_data = top[0]->cpu_data();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const int count = bottom[0]->count();
+    Dtype tanhx;
+    for (int i = 0; i < count; ++i) {
+      tanhx = top_data[i];
+      bottom_diff[i] = top_diff[i] * (1 - tanhx * tanhx);
+    }
+  }
 }
 
 template <typename Dtype>
 void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	TanHForward(count, bottom_data, top_data);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  TanHForward(count, bottom_data, top_data);
 }
 
 template <typename Dtype>
 void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* top_data = top[0]->gpu_data();
-		const Dtype* top_diff = top[0]->gpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		const int count = bottom[0]->count();
-		// NOLINT_NEXT_LINE(whitespace/operators)
-		TanHBackward(count, top_diff, top_data, bottom_diff);
-	}
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_data = top[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    TanHBackward(count, top_diff, top_data, bottom_diff);
+  }
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp
index 7d99226f..16ca8944 100644
--- a/src/caffe/layers/threshold_layer.cpp
+++ b/src/caffe/layers/threshold_layer.cpp
@@ -8,30 +8,30 @@ namespace caffe {
 
 template <typename Dtype>
 void ThresholdLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
-	threshold_ = this->layer_param_.threshold_param().threshold();
+    const vector<Blob<Dtype>*>& top) {
+  NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
+  threshold_ = this->layer_param_.threshold_param().threshold();
 }
 
 template <typename Dtype>
 void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->cpu_data();
-	Dtype* top_data = top[0]->mutable_cpu_data();
-	const int count = bottom[0]->count();
-	for (int i = 0; i < count; ++i) {
-		top_data[i] = (bottom_data[i] > threshold_) ? Dtype(1) : Dtype(0);
-	}
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  for (int i = 0; i < count; ++i) {
+    top_data[i] = (bottom_data[i] > threshold_) ? Dtype(1) : Dtype(0);
+  }
 }
 
 template <typename Dtype>
 void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	ThresholdForward(count, threshold_, bottom_data, top_data);
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  ThresholdForward(count, threshold_, bottom_data, top_data);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index 68b1b1e5..7085ac63 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -27,406 +27,400 @@ namespace caffe {
 
 template <typename Dtype>
 WindowDataLayer<Dtype>::~WindowDataLayer<Dtype>() {
-	this->JoinPrefetchThread();
+  this->JoinPrefetchThread();
 }
 
 template <typename Dtype>
 void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	// LayerSetUp runs through the window_file and creates two structures
-	// that hold windows: one for foreground (object) windows and one
-	// for background (non-object) windows. We use an overlap threshold
-	// to decide which is which.
-
-	// window_file format
-	// repeated:
-	//    # image_index
-	//    img_path (abs path)
-	//    channels
-	//    height
-	//    width
-	//    num_windows
-	//    class_index overlap x1 y1 x2 y2
-
-	LOG(INFO) << "Window data layer:" << std::endl
-			<< "  foreground (object) overlap threshold: "
-			<< this->layer_param_.window_data_param().fg_threshold() << std::endl
-			<< "  background (non-object) overlap threshold: "
-			<< this->layer_param_.window_data_param().bg_threshold() << std::endl
-			<< "  foreground sampling fraction: "
-			<< this->layer_param_.window_data_param().fg_fraction() << std::endl
-			<< "  cache_images: "
-			<< this->layer_param_.window_data_param().cache_images() << std::endl
-			<< "  root_folder: "
-			<< this->layer_param_.window_data_param().root_folder();
-
-	cache_images_ = this->layer_param_.window_data_param().cache_images();
-	string root_folder = this->layer_param_.window_data_param().root_folder();
-
-	const bool prefetch_needs_rand =
-			this->transform_param_.mirror() ||
-					this->transform_param_.crop_size();
-	if (prefetch_needs_rand) {
-		const unsigned int prefetch_rng_seed = caffe_rng_rand();
-		prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
-	} else {
-		prefetch_rng_.reset();
-	}
-
-	std::ifstream infile(this->layer_param_.window_data_param().source().c_str());
-	CHECK(infile.good()) << "Failed to open window file "
-			<< this->layer_param_.window_data_param().source() << std::endl;
-
-	map<int, int> label_hist;
-	label_hist.insert(std::make_pair(0, 0));
-
-	string hashtag;
-	int image_index, channels;
-	if (!(infile >> hashtag >> image_index)) {
-		LOG(FATAL) << "Window file is empty";
-	}
-	do {
-		CHECK_EQ(hashtag, "#");
-		// read image path
-		string image_path;
-		infile >> image_path;
-		image_path = root_folder + image_path;
-		// read image dimensions
-		vector<int> image_size(3);
-		infile >> image_size[0] >> image_size[1] >> image_size[2];
-		channels = image_size[0];
-		image_database_.push_back(std::make_pair(image_path, image_size));
-
-		if (cache_images_) {
-			Datum datum;
-			if (!ReadFileToDatum(image_path, &datum)) {
-				LOG(ERROR) << "Could not open or find file " << image_path;
-				return;
-			}
-			image_database_cache_.push_back(std::make_pair(image_path, datum));
-		}
-		// read each box
-		int num_windows;
-		infile >> num_windows;
-		const float fg_threshold =
-				this->layer_param_.window_data_param().fg_threshold();
-		const float bg_threshold =
-				this->layer_param_.window_data_param().bg_threshold();
-		for (int i = 0; i < num_windows; ++i) {
-			int label, x1, y1, x2, y2;
-			float overlap;
-			infile >> label >> overlap >> x1 >> y1 >> x2 >> y2;
-
-			vector<float> window(WindowDataLayer::NUM);
-			window[WindowDataLayer::IMAGE_INDEX] = image_index;
-			window[WindowDataLayer::LABEL] = label;
-			window[WindowDataLayer::OVERLAP] = overlap;
-			window[WindowDataLayer::X1] = x1;
-			window[WindowDataLayer::Y1] = y1;
-			window[WindowDataLayer::X2] = x2;
-			window[WindowDataLayer::Y2] = y2;
-
-			// add window to foreground list or background list
-			if (overlap >= fg_threshold) {
-				int label = window[WindowDataLayer::LABEL];
-				CHECK_GT(label, 0);
-				fg_windows_.push_back(window);
-				label_hist.insert(std::make_pair(label, 0));
-				label_hist[label]++;
-			} else if (overlap < bg_threshold) {
-				// background window, force label and overlap to 0
-				window[WindowDataLayer::LABEL] = 0;
-				window[WindowDataLayer::OVERLAP] = 0;
-				bg_windows_.push_back(window);
-				label_hist[0]++;
-			}
-		}
-
-		if (image_index % 100 == 0) {
-			LOG(INFO) << "num: " << image_index << " "
-					<< image_path << " "
-					<< image_size[0] << " "
-					<< image_size[1] << " "
-					<< image_size[2] << " "
-					<< "windows to process: " << num_windows;
-		}
-	} while (infile >> hashtag >> image_index);
-
-	LOG(INFO) << "Number of images: " << image_index + 1;
-
-	for (map<int, int>::iterator it = label_hist.begin();
-			it != label_hist.end(); ++it) {
-		LOG(INFO) << "class " << it->first << " has " << label_hist[it->first]
-				<< " samples";
-	}
-
-	LOG(INFO) << "Amount of context padding: "
-			<< this->layer_param_.window_data_param().context_pad();
-
-	LOG(INFO) << "Crop mode: "
-			<< this->layer_param_.window_data_param().crop_mode();
-
-	// image
-	const int crop_size = this->transform_param_.crop_size();
-	CHECK_GT(crop_size, 0);
-	const int batch_size = this->layer_param_.window_data_param().batch_size();
-	top[0]->Reshape(batch_size, channels, crop_size, crop_size);
-	this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size);
-
-	LOG(INFO) << "output data size: " << top[0]->num() << ","
-			<< top[0]->channels() << "," << top[0]->height() << ","
-			<< top[0]->width();
-	// label
-	vector<int> label_shape(1, batch_size);
-	top[1]->Reshape(label_shape);
-	this->prefetch_label_.Reshape(label_shape);
-
-	// data mean
-	has_mean_file_ = this->transform_param_.has_mean_file();
-	has_mean_values_ = this->transform_param_.mean_value_size() > 0;
-	if (has_mean_file_) {
-		const string& mean_file =
-				this->transform_param_.mean_file();
-		LOG(INFO) << "Loading mean file from: " << mean_file;
-		BlobProto blob_proto;
-		ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
-		data_mean_.FromProto(blob_proto);
-	}
-	if (has_mean_values_) {
-		CHECK(has_mean_file_ == false) <<
-				"Cannot specify mean_file and mean_value at the same time";
-		for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) {
-			mean_values_.push_back(this->transform_param_.mean_value(c));
-		}
-		CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) <<
-				"Specify either 1 mean_value or as many as channels: " << channels;
-		if (channels > 1 && mean_values_.size() == 1) {
-			// Replicate the mean_value for simplicity
-			for (int c = 1; c < channels; ++c) {
-				mean_values_.push_back(mean_values_[0]);
-			}
-		}
-	}
+    const vector<Blob<Dtype>*>& top) {
+  // LayerSetUp runs through the window_file and creates two structures
+  // that hold windows: one for foreground (object) windows and one
+  // for background (non-object) windows. We use an overlap threshold
+  // to decide which is which.
+
+  // window_file format
+  // repeated:
+  //    # image_index
+  //    img_path (abs path)
+  //    channels
+  //    height
+  //    width
+  //    num_windows
+  //    class_index overlap x1 y1 x2 y2
+
+  LOG(INFO) << "Window data layer:" << std::endl
+      << "  foreground (object) overlap threshold: "
+      << this->layer_param_.window_data_param().fg_threshold() << std::endl
+      << "  background (non-object) overlap threshold: "
+      << this->layer_param_.window_data_param().bg_threshold() << std::endl
+      << "  foreground sampling fraction: "
+      << this->layer_param_.window_data_param().fg_fraction() << std::endl
+      << "  cache_images: "
+      << this->layer_param_.window_data_param().cache_images() << std::endl
+      << "  root_folder: "
+      << this->layer_param_.window_data_param().root_folder();
+
+  cache_images_ = this->layer_param_.window_data_param().cache_images();
+  string root_folder = this->layer_param_.window_data_param().root_folder();
+
+  const bool prefetch_needs_rand = this->transform_param_.mirror()
+      || this->transform_param_.crop_size();
+  if (prefetch_needs_rand) {
+    const unsigned int prefetch_rng_seed = caffe_rng_rand();
+    prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
+  } else {
+    prefetch_rng_.reset();
+  }
+
+  std::ifstream infile(this->layer_param_.window_data_param().source().c_str());
+  CHECK(infile.good()) << "Failed to open window file "
+      << this->layer_param_.window_data_param().source() << std::endl;
+
+  map<int, int> label_hist;
+  label_hist.insert(std::make_pair(0, 0));
+
+  string hashtag;
+  int image_index, channels;
+  if (!(infile >> hashtag >> image_index)) {
+    LOG(FATAL) << "Window file is empty";
+  }
+  do {
+    CHECK_EQ(hashtag, "#");
+    // read image path
+    string image_path;
+    infile >> image_path;
+    image_path = root_folder + image_path;
+    // read image dimensions
+    vector<int> image_size(3);
+    infile >> image_size[0] >> image_size[1] >> image_size[2];
+    channels = image_size[0];
+    image_database_.push_back(std::make_pair(image_path, image_size));
+
+    if (cache_images_) {
+      Datum datum;
+      if (!ReadFileToDatum(image_path, &datum)) {
+        LOG(ERROR) << "Could not open or find file " << image_path;
+        return;
+      }
+      image_database_cache_.push_back(std::make_pair(image_path, datum));
+    }
+    // read each box
+    int num_windows;
+    infile >> num_windows;
+    const float fg_threshold =
+        this->layer_param_.window_data_param().fg_threshold();
+    const float bg_threshold =
+        this->layer_param_.window_data_param().bg_threshold();
+    for (int i = 0; i < num_windows; ++i) {
+      int label, x1, y1, x2, y2;
+      float overlap;
+      infile >> label >> overlap >> x1 >> y1 >> x2 >> y2;
+
+      vector<float> window(WindowDataLayer::NUM);
+      window[WindowDataLayer::IMAGE_INDEX] = image_index;
+      window[WindowDataLayer::LABEL] = label;
+      window[WindowDataLayer::OVERLAP] = overlap;
+      window[WindowDataLayer::X1] = x1;
+      window[WindowDataLayer::Y1] = y1;
+      window[WindowDataLayer::X2] = x2;
+      window[WindowDataLayer::Y2] = y2;
+
+      // add window to foreground list or background list
+      if (overlap >= fg_threshold) {
+        int label = window[WindowDataLayer::LABEL];
+        CHECK_GT(label, 0);
+        fg_windows_.push_back(window);
+        label_hist.insert(std::make_pair(label, 0));
+        label_hist[label]++;
+      } else if (overlap < bg_threshold) {
+        // background window, force label and overlap to 0
+        window[WindowDataLayer::LABEL] = 0;
+        window[WindowDataLayer::OVERLAP] = 0;
+        bg_windows_.push_back(window);
+        label_hist[0]++;
+      }
+    }
+
+    if (image_index % 100 == 0) {
+      LOG(INFO) << "num: " << image_index << " " << image_path << " "
+          << image_size[0] << " " << image_size[1] << " " << image_size[2]
+          << " " << "windows to process: " << num_windows;
+    }
+  } while (infile >> hashtag >> image_index);
+
+  LOG(INFO) << "Number of images: " << image_index + 1;
+
+  for (map<int, int>::iterator it = label_hist.begin(); it != label_hist.end();
+      ++it) {
+    LOG(INFO) << "class " << it->first << " has " << label_hist[it->first]
+        << " samples";
+  }
+
+  LOG(INFO) << "Amount of context padding: "
+      << this->layer_param_.window_data_param().context_pad();
+
+  LOG(INFO) << "Crop mode: "
+      << this->layer_param_.window_data_param().crop_mode();
+
+  // image
+  const int crop_size = this->transform_param_.crop_size();
+  CHECK_GT(crop_size, 0);
+  const int batch_size = this->layer_param_.window_data_param().batch_size();
+  top[0]->Reshape(batch_size, channels, crop_size, crop_size);
+  this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size);
+
+  LOG(INFO) << "output data size: " << top[0]->num() << ","
+      << top[0]->channels() << "," << top[0]->height() << ","
+      << top[0]->width();
+  // label
+  vector<int> label_shape(1, batch_size);
+  top[1]->Reshape(label_shape);
+  this->prefetch_label_.Reshape(label_shape);
+
+  // data mean
+  has_mean_file_ = this->transform_param_.has_mean_file();
+  has_mean_values_ = this->transform_param_.mean_value_size() > 0;
+  if (has_mean_file_) {
+    const string& mean_file = this->transform_param_.mean_file();
+    LOG(INFO) << "Loading mean file from: " << mean_file;
+    BlobProto blob_proto;
+    ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
+    data_mean_.FromProto(blob_proto);
+  }
+  if (has_mean_values_) {
+    CHECK(has_mean_file_ == false)
+        << "Cannot specify mean_file and mean_value at the same time";
+    for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) {
+      mean_values_.push_back(this->transform_param_.mean_value(c));
+    }
+    CHECK(mean_values_.size() == 1 || mean_values_.size() == channels)
+        << "Specify either 1 mean_value or as many as channels: " << channels;
+    if (channels > 1 && mean_values_.size() == 1) {
+      // Replicate the mean_value for simplicity
+      for (int c = 1; c < channels; ++c) {
+        mean_values_.push_back(mean_values_[0]);
+      }
+    }
+  }
 }
 
 template <typename Dtype>
 unsigned int WindowDataLayer<Dtype>::PrefetchRand() {
-	CHECK (prefetch_rng_);
-	caffe::rng_t* prefetch_rng =
-			static_cast<caffe::rng_t*>(prefetch_rng_->generator());
-	return (*prefetch_rng)();
+  CHECK (prefetch_rng_);
+  caffe::rng_t* prefetch_rng =
+      static_cast<caffe::rng_t*>(prefetch_rng_->generator());
+  return (*prefetch_rng)();
 }
 
 // Thread fetching the data
 template <typename Dtype>
 void WindowDataLayer<Dtype>::InternalThreadEntry() {
-	// At each iteration, sample N windows where N*p are foreground (object)
-	// windows and N*(1-p) are background (non-object) windows
-	CPUTimer batch_timer;
-	batch_timer.Start();
-	double read_time = 0;
-	double trans_time = 0;
-	CPUTimer timer;
-	Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
-	Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
-	const Dtype scale = this->layer_param_.window_data_param().scale();
-	const int batch_size = this->layer_param_.window_data_param().batch_size();
-	const int context_pad = this->layer_param_.window_data_param().context_pad();
-	const int crop_size = this->transform_param_.crop_size();
-	const bool mirror = this->transform_param_.mirror();
-	const float fg_fraction =
-			this->layer_param_.window_data_param().fg_fraction();
-	Dtype* mean = NULL;
-	int mean_off = 0;
-	int mean_width = 0;
-	int mean_height = 0;
-	if (this->has_mean_file_) {
-		mean = this->data_mean_.mutable_cpu_data();
-		mean_off = (this->data_mean_.width() - crop_size) / 2;
-		mean_width = this->data_mean_.width();
-		mean_height = this->data_mean_.height();
-	}
-	cv::Size cv_crop_size(crop_size, crop_size);
-	const string& crop_mode = this->layer_param_.window_data_param().crop_mode();
-
-	bool use_square = (crop_mode == "square") ? true : false;
-
-	// zero out batch
-	caffe_set(this->prefetch_data_.count(), Dtype(0), top_data);
-
-	const int num_fg = static_cast<int>(static_cast<float>(batch_size)
-			* fg_fraction);
-	const int num_samples[2] = { batch_size - num_fg, num_fg };
-
-	int item_id = 0;
-	// sample from bg set then fg set
-	for (int is_fg = 0; is_fg < 2; ++is_fg) {
-		for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) {
-			// sample a window
-			timer.Start();
-			const unsigned int rand_index = PrefetchRand();
-			vector<float> window =
-					(is_fg) ?
-										fg_windows_[rand_index % fg_windows_.size()] :
-										bg_windows_[rand_index % bg_windows_.size()];
-
-			bool do_mirror = mirror && PrefetchRand() % 2;
-
-			// load the image containing the window
-			pair<std::string, vector<int> > image =
-					image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
-
-			cv::Mat cv_img;
-			if (this->cache_images_) {
-				pair < std::string, Datum > image_cached =
-						image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
-				cv_img = DecodeDatumToCVMat(image_cached.second, true);
-			} else {
-				cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR);
-				if (!cv_img.data) {
-					LOG(ERROR) << "Could not open or find file " << image.first;
-					return;
-				}
-			}
-			read_time += timer.MicroSeconds();
-			timer.Start();
-			const int channels = cv_img.channels();
-
-			// crop window out of image and warp it
-			int x1 = window[WindowDataLayer < Dtype > ::X1];
-			int y1 = window[WindowDataLayer < Dtype > ::Y1];
-			int x2 = window[WindowDataLayer < Dtype > ::X2];
-			int y2 = window[WindowDataLayer < Dtype > ::Y2];
-
-			int pad_w = 0;
-			int pad_h = 0;
-			if (context_pad > 0 || use_square) {
-				// scale factor by which to expand the original region
-				// such that after warping the expanded region to crop_size x crop_size
-				// there's exactly context_pad amount of padding on each side
-				Dtype context_scale = static_cast<Dtype>(crop_size) /
-						static_cast<Dtype>(crop_size - 2 * context_pad);
-
-				// compute the expanded region
-				Dtype half_height = static_cast<Dtype>(y2 - y1 + 1) / 2.0;
-				Dtype half_width = static_cast<Dtype>(x2 - x1 + 1) / 2.0;
-				Dtype center_x = static_cast<Dtype>(x1) + half_width;
-				Dtype center_y = static_cast<Dtype>(y1) + half_height;
-				if (use_square) {
-					if (half_height > half_width) {
-						half_width = half_height;
-					} else {
-						half_height = half_width;
-					}
-				}
-				x1 = static_cast<int>(round(center_x - half_width * context_scale));
-				x2 = static_cast<int>(round(center_x + half_width * context_scale));
-				y1 = static_cast<int>(round(center_y - half_height * context_scale));
-				y2 = static_cast<int>(round(center_y + half_height * context_scale));
-
-				// the expanded region may go outside of the image
-				// so we compute the clipped (expanded) region and keep track of
-				// the extent beyond the image
-				int unclipped_height = y2 - y1 + 1;
-				int unclipped_width = x2 - x1 + 1;
-				int pad_x1 = std::max(0, -x1);
-				int pad_y1 = std::max(0, -y1);
-				int pad_x2 = std::max(0, x2 - cv_img.cols + 1);
-				int pad_y2 = std::max(0, y2 - cv_img.rows + 1);
-				// clip bounds
-				x1 = x1 + pad_x1;
-				x2 = x2 - pad_x2;
-				y1 = y1 + pad_y1;
-				y2 = y2 - pad_y2;
-				CHECK_GT(x1, -1);
-				CHECK_GT(y1, -1);
-				CHECK_LT(x2, cv_img.cols);
-				CHECK_LT(y2, cv_img.rows);
-
-				int clipped_height = y2 - y1 + 1;
-				int clipped_width = x2 - x1 + 1;
-
-				// scale factors that would be used to warp the unclipped
-				// expanded region
-				Dtype scale_x =
-						static_cast<Dtype>(crop_size) / static_cast<Dtype>(unclipped_width);
-				Dtype scale_y =
-						static_cast<Dtype>(crop_size)
-								/ static_cast<Dtype>(unclipped_height);
-
-				// size to warp the clipped expanded region to
-				cv_crop_size.width =
-						static_cast<int>(round(static_cast<Dtype>(clipped_width) * scale_x));
-				cv_crop_size.height =
-						static_cast<int>(round(static_cast<Dtype>(clipped_height) * scale_y));
-				pad_x1 = static_cast<int>(round(static_cast<Dtype>(pad_x1) * scale_x));
-				pad_x2 = static_cast<int>(round(static_cast<Dtype>(pad_x2) * scale_x));
-				pad_y1 = static_cast<int>(round(static_cast<Dtype>(pad_y1) * scale_y));
-				pad_y2 = static_cast<int>(round(static_cast<Dtype>(pad_y2) * scale_y));
-
-				pad_h = pad_y1;
-				// if we're mirroring, we mirror the padding too (to be pedantic)
-				if (do_mirror) {
-					pad_w = pad_x2;
-				} else {
-					pad_w = pad_x1;
-				}
-
-				// ensure that the warped, clipped region plus the padding fits in the
-				// crop_size x crop_size image (it might not due to rounding)
-				if (pad_h + cv_crop_size.height > crop_size) {
-					cv_crop_size.height = crop_size - pad_h;
-				}
-				if (pad_w + cv_crop_size.width > crop_size) {
-					cv_crop_size.width = crop_size - pad_w;
-				}
-			}
-
-			cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1);
-			cv::Mat cv_cropped_img = cv_img(roi);
-			cv::resize(cv_cropped_img, cv_cropped_img,
-					cv_crop_size, 0, 0, cv::INTER_LINEAR);
-
-			// horizontal flip at random
-			if (do_mirror) {
-				cv::flip(cv_cropped_img, cv_cropped_img, 1);
-			}
-
-			// copy the warped window into top_data
-			for (int h = 0; h < cv_cropped_img.rows; ++h) {
-				const uchar* ptr = cv_cropped_img.ptr < uchar > (h);
-				int img_index = 0;
-				for (int w = 0; w < cv_cropped_img.cols; ++w) {
-					for (int c = 0; c < channels; ++c) {
-						int top_index = ((item_id * channels + c) * crop_size + h + pad_h)
-								* crop_size + w + pad_w;
-						// int top_index = (c * height + h) * width + w;
-						Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
-						if (this->has_mean_file_) {
-							int mean_index = (c * mean_height + h + mean_off + pad_h)
-									* mean_width + w + mean_off + pad_w;
-							top_data[top_index] = (pixel - mean[mean_index]) * scale;
-						} else {
-							if (this->has_mean_values_) {
-								top_data[top_index] = (pixel - this->mean_values_[c]) * scale;
-							} else {
-								top_data[top_index] = pixel * scale;
-							}
-						}
-					}
-				}
-			}
-			trans_time += timer.MicroSeconds();
-			// get window label
-			top_label[item_id] = window[WindowDataLayer < Dtype > ::LABEL];
-
-			item_id++;
-		}
-	}
-	batch_timer.Stop();
-	DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
-	DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
-	DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
+  // At each iteration, sample N windows where N*p are foreground (object)
+  // windows and N*(1-p) are background (non-object) windows
+  CPUTimer batch_timer;
+  batch_timer.Start();
+  double read_time = 0;
+  double trans_time = 0;
+  CPUTimer timer;
+  Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
+  Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
+  const Dtype scale = this->layer_param_.window_data_param().scale();
+  const int batch_size = this->layer_param_.window_data_param().batch_size();
+  const int context_pad = this->layer_param_.window_data_param().context_pad();
+  const int crop_size = this->transform_param_.crop_size();
+  const bool mirror = this->transform_param_.mirror();
+  const float fg_fraction =
+      this->layer_param_.window_data_param().fg_fraction();
+  Dtype* mean = NULL;
+  int mean_off = 0;
+  int mean_width = 0;
+  int mean_height = 0;
+  if (this->has_mean_file_) {
+    mean = this->data_mean_.mutable_cpu_data();
+    mean_off = (this->data_mean_.width() - crop_size) / 2;
+    mean_width = this->data_mean_.width();
+    mean_height = this->data_mean_.height();
+  }
+  cv::Size cv_crop_size(crop_size, crop_size);
+  const string& crop_mode = this->layer_param_.window_data_param().crop_mode();
+
+  bool use_square = (crop_mode == "square") ? true : false;
+
+  // zero out batch
+  caffe_set(this->prefetch_data_.count(), Dtype(0), top_data);
+
+  const int num_fg = static_cast<int>(static_cast<float>(batch_size)
+      * fg_fraction);
+  const int num_samples[2] = { batch_size - num_fg, num_fg };
+
+  int item_id = 0;
+  // sample from bg set then fg set
+  for (int is_fg = 0; is_fg < 2; ++is_fg) {
+    for (int dummy = 0; dummy < num_samples[is_fg]; ++dummy) {
+      // sample a window
+      timer.Start();
+      const unsigned int rand_index = PrefetchRand();
+      vector<float> window =
+          (is_fg) ?
+              fg_windows_[rand_index % fg_windows_.size()] :
+              bg_windows_[rand_index % bg_windows_.size()];
+
+      bool do_mirror = mirror && PrefetchRand() % 2;
+
+      // load the image containing the window
+      pair<std::string, vector<int> > image =
+          image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
+
+      cv::Mat cv_img;
+      if (this->cache_images_) {
+        pair < std::string, Datum > image_cached =
+            image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]];
+        cv_img = DecodeDatumToCVMat(image_cached.second, true);
+      } else {
+        cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR);
+        if (!cv_img.data) {
+          LOG(ERROR) << "Could not open or find file " << image.first;
+          return;
+        }
+      }
+      read_time += timer.MicroSeconds();
+      timer.Start();
+      const int channels = cv_img.channels();
+
+      // crop window out of image and warp it
+      int x1 = window[WindowDataLayer < Dtype > ::X1];
+      int y1 = window[WindowDataLayer < Dtype > ::Y1];
+      int x2 = window[WindowDataLayer < Dtype > ::X2];
+      int y2 = window[WindowDataLayer < Dtype > ::Y2];
+
+      int pad_w = 0;
+      int pad_h = 0;
+      if (context_pad > 0 || use_square) {
+        // scale factor by which to expand the original region
+        // such that after warping the expanded region to crop_size x crop_size
+        // there's exactly context_pad amount of padding on each side
+        Dtype context_scale = static_cast<Dtype>(crop_size)
+            / static_cast<Dtype>(crop_size - 2 * context_pad);
+
+        // compute the expanded region
+        Dtype half_height = static_cast<Dtype>(y2 - y1 + 1) / 2.0;
+        Dtype half_width = static_cast<Dtype>(x2 - x1 + 1) / 2.0;
+        Dtype center_x = static_cast<Dtype>(x1) + half_width;
+        Dtype center_y = static_cast<Dtype>(y1) + half_height;
+        if (use_square) {
+          if (half_height > half_width) {
+            half_width = half_height;
+          } else {
+            half_height = half_width;
+          }
+        }
+        x1 = static_cast<int>(round(center_x - half_width * context_scale));
+        x2 = static_cast<int>(round(center_x + half_width * context_scale));
+        y1 = static_cast<int>(round(center_y - half_height * context_scale));
+        y2 = static_cast<int>(round(center_y + half_height * context_scale));
+
+        // the expanded region may go outside of the image
+        // so we compute the clipped (expanded) region and keep track of
+        // the extent beyond the image
+        int unclipped_height = y2 - y1 + 1;
+        int unclipped_width = x2 - x1 + 1;
+        int pad_x1 = std::max(0, -x1);
+        int pad_y1 = std::max(0, -y1);
+        int pad_x2 = std::max(0, x2 - cv_img.cols + 1);
+        int pad_y2 = std::max(0, y2 - cv_img.rows + 1);
+        // clip bounds
+        x1 = x1 + pad_x1;
+        x2 = x2 - pad_x2;
+        y1 = y1 + pad_y1;
+        y2 = y2 - pad_y2;
+        CHECK_GT(x1, -1);
+        CHECK_GT(y1, -1);
+        CHECK_LT(x2, cv_img.cols);
+        CHECK_LT(y2, cv_img.rows);
+
+        int clipped_height = y2 - y1 + 1;
+        int clipped_width = x2 - x1 + 1;
+
+        // scale factors that would be used to warp the unclipped
+        // expanded region
+        Dtype scale_x = static_cast<Dtype>(crop_size)
+            / static_cast<Dtype>(unclipped_width);
+        Dtype scale_y = static_cast<Dtype>(crop_size)
+            / static_cast<Dtype>(unclipped_height);
+
+        // size to warp the clipped expanded region to
+        cv_crop_size.width = static_cast<int>(round(
+            static_cast<Dtype>(clipped_width) * scale_x));
+        cv_crop_size.height = static_cast<int>(round(
+            static_cast<Dtype>(clipped_height) * scale_y));
+        pad_x1 = static_cast<int>(round(static_cast<Dtype>(pad_x1) * scale_x));
+        pad_x2 = static_cast<int>(round(static_cast<Dtype>(pad_x2) * scale_x));
+        pad_y1 = static_cast<int>(round(static_cast<Dtype>(pad_y1) * scale_y));
+        pad_y2 = static_cast<int>(round(static_cast<Dtype>(pad_y2) * scale_y));
+
+        pad_h = pad_y1;
+        // if we're mirroring, we mirror the padding too (to be pedantic)
+        if (do_mirror) {
+          pad_w = pad_x2;
+        } else {
+          pad_w = pad_x1;
+        }
+
+        // ensure that the warped, clipped region plus the padding fits in the
+        // crop_size x crop_size image (it might not due to rounding)
+        if (pad_h + cv_crop_size.height > crop_size) {
+          cv_crop_size.height = crop_size - pad_h;
+        }
+        if (pad_w + cv_crop_size.width > crop_size) {
+          cv_crop_size.width = crop_size - pad_w;
+        }
+      }
+
+      cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1);
+      cv::Mat cv_cropped_img = cv_img(roi);
+      cv::resize(cv_cropped_img, cv_cropped_img, cv_crop_size, 0, 0,
+          cv::INTER_LINEAR);
+
+      // horizontal flip at random
+      if (do_mirror) {
+        cv::flip(cv_cropped_img, cv_cropped_img, 1);
+      }
+
+      // copy the warped window into top_data
+      for (int h = 0; h < cv_cropped_img.rows; ++h) {
+        const uchar* ptr = cv_cropped_img.ptr < uchar > (h);
+        int img_index = 0;
+        for (int w = 0; w < cv_cropped_img.cols; ++w) {
+          for (int c = 0; c < channels; ++c) {
+            int top_index = ((item_id * channels + c) * crop_size + h + pad_h)
+                * crop_size + w + pad_w;
+            // int top_index = (c * height + h) * width + w;
+            Dtype pixel = static_cast<Dtype>(ptr[img_index++]);
+            if (this->has_mean_file_) {
+              int mean_index = (c * mean_height + h + mean_off + pad_h)
+                  * mean_width + w + mean_off + pad_w;
+              top_data[top_index] = (pixel - mean[mean_index]) * scale;
+            } else {
+              if (this->has_mean_values_) {
+                top_data[top_index] = (pixel - this->mean_values_[c]) * scale;
+              } else {
+                top_data[top_index] = pixel * scale;
+              }
+            }
+          }
+        }
+      }
+      trans_time += timer.MicroSeconds();
+      // get window label
+      top_label[item_id] = window[WindowDataLayer < Dtype > ::LABEL];
+
+      item_id++;
+    }
+  }
+  batch_timer.Stop();
+  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
+  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
+  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
 }
 
 INSTANTIATE_CLASS (WindowDataLayer);
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 23085112..6911854c 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -21,897 +21,886 @@ namespace caffe {
 
 template <typename Dtype>
 Net<Dtype>::Net(const NetParameter& param) {
-	Init(param);
+  Init(param);
 }
 
 template <typename Dtype>
 Net<Dtype>::Net(const string& param_file, Phase phase) {
-	NetParameter param;
-	ReadNetParamsFromTextFileOrDie(param_file, &param);
-	param.mutable_state()->set_phase(phase);
-	Init(param);
+  NetParameter param;
+  ReadNetParamsFromTextFileOrDie(param_file, &param);
+  param.mutable_state()->set_phase(phase);
+  Init(param);
 }
 
 template <typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
-	// Set phase from the state.
-	phase_ = in_param.state().phase();
-	// Filter layers based on their include/exclude rules and
-	// the current NetState.
-	NetParameter filtered_param;
-	FilterNet(in_param, &filtered_param);
-	LOG(INFO) << "Initializing net from parameters: " << std::endl
-			<< filtered_param.DebugString();
-	// Create a copy of filtered_param with splits added where necessary.
-	NetParameter param;
-	InsertSplits(filtered_param, &param);
-	// Basically, build all the layers and set up their connections.
-	name_ = param.name();
-	map<string, int> blob_name_to_idx;
-	set < string > available_blobs;
-	CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0)
-			<< "Must specify either input_shape OR deprecated input_dim, not both.";
-	if (param.input_dim_size() > 0) {
-		// Deprecated 4D dimensions.
-		CHECK_EQ(param.input_size() * 4, param.input_dim_size())
-				<< "Incorrect input blob dimension specifications.";
-	} else {
-		CHECK_EQ(param.input_size(), param.input_shape_size())
-				<< "Exactly one input_shape must be specified per input.";
-	}
-	memory_used_ = 0;
-	// set the input blobs
-	for (int input_id = 0; input_id < param.input_size(); ++input_id) {
-		const int layer_id = -1;  // inputs have fake layer ID -1
-		AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
-	}
-	DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-	// For each layer, set up its input and output
-	bottom_vecs_.resize(param.layer_size());
-	top_vecs_.resize(param.layer_size());
-	bottom_id_vecs_.resize(param.layer_size());
-	param_id_vecs_.resize(param.layer_size());
-	top_id_vecs_.resize(param.layer_size());
-	bottom_need_backward_.resize(param.layer_size());
-	for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
-		// Inherit phase from net if unset.
-		if (!param.layer(layer_id).has_phase()) {
-			param.mutable_layer(layer_id)->set_phase(phase_);
-		}
-		// Setup layer.
-		const LayerParameter& layer_param = param.layer(layer_id);
-		if (layer_param.propagate_down_size() > 0) {
-			CHECK_EQ(layer_param.propagate_down_size(),
-					layer_param.bottom_size())
-					<< "propagate_down param must be specified "
-					<< "either 0 or bottom_size times ";
-		}
-		layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param));
-		layer_names_.push_back(layer_param.name());
-		LOG(INFO) << "Creating Layer " << layer_param.name();
-		bool need_backward = false;
-
-		// Figure out this layer's input and output
-		for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
-				++bottom_id) {
-			const int blob_id = AppendBottom(param, layer_id, bottom_id,
-					&available_blobs, &blob_name_to_idx);
-			// If a blob needs backward, this layer should provide it.
-			need_backward |= blob_need_backward_[blob_id];
-		}
-		int num_top = layer_param.top_size();
-		for (int top_id = 0; top_id < num_top; ++top_id) {
-			AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx);
-		}
-		// If the layer specifies that AutoTopBlobs() -> true and the LayerParameter
-		// specified fewer than the required number (as specified by
-		// ExactNumTopBlobs() or MinTopBlobs()), allocate them here.
-		Layer < Dtype > *layer = layers_[layer_id].get();
-		if (layer->AutoTopBlobs()) {
-			const int needed_num_top =
-					std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
-			for (; num_top < needed_num_top; ++num_top) {
-				// Add "anonymous" top blobs -- do not modify available_blobs or
-				// blob_name_to_idx as we don't want these blobs to be usable as input
-				// to other layers.
-				AppendTop(param, layer_id, num_top, NULL, NULL);
-			}
-		}
-		// After this layer is connected, set it up.
-		LOG(INFO) << "Setting up " << layer_names_[layer_id];
-		layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
-		for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-			if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
-				blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
-			}
-			blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
-			LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
-			if (layer->loss(top_id)) {
-				LOG(INFO) << "    with loss weight " << layer->loss(top_id);
-			}
-			memory_used_ += top_vecs_[layer_id][top_id]->count();
-		}
-		DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
-		const int param_size = layer_param.param_size();
-		const int num_param_blobs = layers_[layer_id]->blobs().size();
-		CHECK_LE(param_size, num_param_blobs)
-				<< "Too many params specified for layer " << layer_param.name();
-		ParamSpec default_param_spec;
-		for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
-			const ParamSpec* param_spec =
-					(param_id < param_size) ?
-																		&layer_param.param(param_id) :
-																		&default_param_spec;
-			const bool param_need_backward = param_spec->lr_mult() > 0;
-			need_backward |= param_need_backward;
-			layers_[layer_id]->set_param_propagate_down(param_id,
-					param_need_backward);
-		}
-		for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
-			AppendParam(param, layer_id, param_id);
-		}
-		// Finally, set the backward flag
-		layer_need_backward_.push_back(need_backward);
-		if (need_backward) {
-			for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) {
-				blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true;
-			}
-		}
-	}
-	// Go through the net backwards to determine which blobs contribute to the
-	// loss.  We can skip backward computation for blobs that don't contribute
-	// to the loss.
-	// Also checks if all bottom blobs don't need backward computation (possible
-	// because the skip_propagate_down param) and so we can skip bacward
-	// computation for the entire layer
-	set < string > blobs_under_loss;
-	set < string > blobs_skip_backp;
-	for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) {
-		bool layer_contributes_loss = false;
-		bool layer_skip_propagate_down = true;
-		for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-			const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
-			if (layers_[layer_id]->loss(top_id) ||
-					(blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
-				layer_contributes_loss = true;
-			}
-			if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) {
-				layer_skip_propagate_down = false;
-			}
-			if (layer_contributes_loss && !layer_skip_propagate_down)
-				break;
-		}
-		// If this layer can skip backward computation, also all his bottom blobs
-		// don't need backpropagation
-		if (layer_need_backward_[layer_id] && layer_skip_propagate_down) {
-			layer_need_backward_[layer_id] = false;
-			for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
-					++bottom_id) {
-				bottom_need_backward_[layer_id][bottom_id] = false;
-			}
-		}
-		if (!layer_contributes_loss) {
-			layer_need_backward_[layer_id] = false;
-		}
-		if (layer_need_backward_[layer_id]) {
-			LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
-		} else {
-			LOG(INFO) << layer_names_[layer_id]
-					<< " does not need backward computation.";
-		}
-		for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
-				++bottom_id) {
-			if (layer_contributes_loss) {
-				const string& blob_name =
-						blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-				blobs_under_loss.insert(blob_name);
-			} else {
-				bottom_need_backward_[layer_id][bottom_id] = false;
-			}
-			if (!bottom_need_backward_[layer_id][bottom_id]) {
-				const string& blob_name =
-						blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-				blobs_skip_backp.insert(blob_name);
-			}
-		}
-	}
-	// Handle force_backward if needed.
-	if (param.force_backward()) {
-		for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
-			layer_need_backward_[layer_id] = true;
-			for (int bottom_id = 0;
-					bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
-				bottom_need_backward_[layer_id][bottom_id] =
-						bottom_need_backward_[layer_id][bottom_id] ||
-								layers_[layer_id]->AllowForceBackward(bottom_id);
-				blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] =
-						blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] ||
-								bottom_need_backward_[layer_id][bottom_id];
-			}
-			for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-					++param_id) {
-				layers_[layer_id]->set_param_propagate_down(param_id, true);
-			}
-		}
-	}
-	// In the end, all remaining blobs are considered output blobs.
-	for (set<string>::iterator it = available_blobs.begin();
-			it != available_blobs.end(); ++it) {
-		LOG(INFO) << "This network produces output " << *it;
-		net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
-		net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
-	}
-	for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) {
-		blob_names_index_[blob_names_[blob_id]] = blob_id;
-	}
-	for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) {
-		layer_names_index_[layer_names_[layer_id]] = layer_id;
-	}
-	GetLearningRateAndWeightDecay();
-	debug_info_ = param.debug_info();
-	LOG(INFO) << "Network initialization done.";
-	LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+  // Set phase from the state.
+  phase_ = in_param.state().phase();
+  // Filter layers based on their include/exclude rules and
+  // the current NetState.
+  NetParameter filtered_param;
+  FilterNet(in_param, &filtered_param);
+  LOG(INFO) << "Initializing net from parameters: " << std::endl
+      << filtered_param.DebugString();
+  // Create a copy of filtered_param with splits added where necessary.
+  NetParameter param;
+  InsertSplits(filtered_param, &param);
+  // Basically, build all the layers and set up their connections.
+  name_ = param.name();
+  map<string, int> blob_name_to_idx;
+  set < string > available_blobs;
+  CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0)
+      << "Must specify either input_shape OR deprecated input_dim, not both.";
+  if (param.input_dim_size() > 0) {
+    // Deprecated 4D dimensions.
+    CHECK_EQ(param.input_size() * 4, param.input_dim_size())
+        << "Incorrect input blob dimension specifications.";
+  } else {
+    CHECK_EQ(param.input_size(), param.input_shape_size())
+        << "Exactly one input_shape must be specified per input.";
+  }
+  memory_used_ = 0;
+  // set the input blobs
+  for (int input_id = 0; input_id < param.input_size(); ++input_id) {
+    const int layer_id = -1;  // inputs have fake layer ID -1
+    AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
+  }
+  DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+  // For each layer, set up its input and output
+  bottom_vecs_.resize(param.layer_size());
+  top_vecs_.resize(param.layer_size());
+  bottom_id_vecs_.resize(param.layer_size());
+  param_id_vecs_.resize(param.layer_size());
+  top_id_vecs_.resize(param.layer_size());
+  bottom_need_backward_.resize(param.layer_size());
+  for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
+    // Inherit phase from net if unset.
+    if (!param.layer(layer_id).has_phase()) {
+      param.mutable_layer(layer_id)->set_phase(phase_);
+    }
+    // Setup layer.
+    const LayerParameter& layer_param = param.layer(layer_id);
+    if (layer_param.propagate_down_size() > 0) {
+      CHECK_EQ(layer_param.propagate_down_size(), layer_param.bottom_size())
+          << "propagate_down param must be specified "
+          << "either 0 or bottom_size times ";
+    }
+    layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param));
+    layer_names_.push_back(layer_param.name());
+    LOG(INFO) << "Creating Layer " << layer_param.name();
+    bool need_backward = false;
+
+    // Figure out this layer's input and output
+    for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
+        ++bottom_id) {
+      const int blob_id = AppendBottom(param, layer_id, bottom_id,
+          &available_blobs, &blob_name_to_idx);
+      // If a blob needs backward, this layer should provide it.
+      need_backward |= blob_need_backward_[blob_id];
+    }
+    int num_top = layer_param.top_size();
+    for (int top_id = 0; top_id < num_top; ++top_id) {
+      AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx);
+    }
+    // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter
+    // specified fewer than the required number (as specified by
+    // ExactNumTopBlobs() or MinTopBlobs()), allocate them here.
+    Layer < Dtype > *layer = layers_[layer_id].get();
+    if (layer->AutoTopBlobs()) {
+      const int needed_num_top = std::max(layer->MinTopBlobs(),
+          layer->ExactNumTopBlobs());
+      for (; num_top < needed_num_top; ++num_top) {
+        // Add "anonymous" top blobs -- do not modify available_blobs or
+        // blob_name_to_idx as we don't want these blobs to be usable as input
+        // to other layers.
+        AppendTop(param, layer_id, num_top, NULL, NULL);
+      }
+    }
+    // After this layer is connected, set it up.
+    LOG(INFO) << "Setting up " << layer_names_[layer_id];
+    layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
+    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+      if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
+        blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
+      }
+      blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
+      LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
+      if (layer->loss(top_id)) {
+        LOG(INFO) << "    with loss weight " << layer->loss(top_id);
+      }
+      memory_used_ += top_vecs_[layer_id][top_id]->count();
+    }
+    DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+    const int param_size = layer_param.param_size();
+    const int num_param_blobs = layers_[layer_id]->blobs().size();
+    CHECK_LE(param_size, num_param_blobs)
+        << "Too many params specified for layer " << layer_param.name();
+    ParamSpec default_param_spec;
+    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
+      const ParamSpec* param_spec =
+          (param_id < param_size) ?
+              &layer_param.param(param_id) : &default_param_spec;
+      const bool param_need_backward = param_spec->lr_mult() > 0;
+      need_backward |= param_need_backward;
+      layers_[layer_id]->set_param_propagate_down(param_id,
+          param_need_backward);
+    }
+    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
+      AppendParam(param, layer_id, param_id);
+    }
+    // Finally, set the backward flag
+    layer_need_backward_.push_back(need_backward);
+    if (need_backward) {
+      for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) {
+        blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true;
+      }
+    }
+  }
+  // Go through the net backwards to determine which blobs contribute to the
+  // loss.  We can skip backward computation for blobs that don't contribute
+  // to the loss.
+  // Also checks if all bottom blobs don't need backward computation (possible
+  // because the skip_propagate_down param) and so we can skip bacward
+  // computation for the entire layer
+  set < string > blobs_under_loss;
+  set < string > blobs_skip_backp;
+  for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) {
+    bool layer_contributes_loss = false;
+    bool layer_skip_propagate_down = true;
+    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+      const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
+      if (layers_[layer_id]->loss(top_id)
+          || (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
+        layer_contributes_loss = true;
+      }
+      if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) {
+        layer_skip_propagate_down = false;
+      }
+      if (layer_contributes_loss && !layer_skip_propagate_down)
+        break;
+    }
+    // If this layer can skip backward computation, also all his bottom blobs
+    // don't need backpropagation
+    if (layer_need_backward_[layer_id] && layer_skip_propagate_down) {
+      layer_need_backward_[layer_id] = false;
+      for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
+          ++bottom_id) {
+        bottom_need_backward_[layer_id][bottom_id] = false;
+      }
+    }
+    if (!layer_contributes_loss) {
+      layer_need_backward_[layer_id] = false;
+    }
+    if (layer_need_backward_[layer_id]) {
+      LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
+    } else {
+      LOG(INFO) << layer_names_[layer_id]
+          << " does not need backward computation.";
+    }
+    for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
+        ++bottom_id) {
+      if (layer_contributes_loss) {
+        const string& blob_name =
+            blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+        blobs_under_loss.insert(blob_name);
+      } else {
+        bottom_need_backward_[layer_id][bottom_id] = false;
+      }
+      if (!bottom_need_backward_[layer_id][bottom_id]) {
+        const string& blob_name =
+            blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+        blobs_skip_backp.insert(blob_name);
+      }
+    }
+  }
+  // Handle force_backward if needed.
+  if (param.force_backward()) {
+    for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
+      layer_need_backward_[layer_id] = true;
+      for (int bottom_id = 0;
+          bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
+        bottom_need_backward_[layer_id][bottom_id] =
+            bottom_need_backward_[layer_id][bottom_id]
+                || layers_[layer_id]->AllowForceBackward(bottom_id);
+        blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] =
+            blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]]
+                || bottom_need_backward_[layer_id][bottom_id];
+      }
+      for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+          ++param_id) {
+        layers_[layer_id]->set_param_propagate_down(param_id, true);
+      }
+    }
+  }
+  // In the end, all remaining blobs are considered output blobs.
+  for (set<string>::iterator it = available_blobs.begin();
+      it != available_blobs.end(); ++it) {
+    LOG(INFO) << "This network produces output " << *it;
+    net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
+    net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
+  }
+  for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) {
+    blob_names_index_[blob_names_[blob_id]] = blob_id;
+  }
+  for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) {
+    layer_names_index_[layer_names_[layer_id]] = layer_id;
+  }
+  GetLearningRateAndWeightDecay();
+  debug_info_ = param.debug_info();
+  LOG(INFO) << "Network initialization done.";
+  LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
 }
 
 template <typename Dtype>
 void Net<Dtype>::FilterNet(const NetParameter& param,
-		NetParameter* param_filtered) {
-	NetState net_state(param.state());
-	param_filtered->CopyFrom(param);
-	param_filtered->clear_layer();
-	for (int i = 0; i < param.layer_size(); ++i) {
-		const LayerParameter& layer_param = param.layer(i);
-		const string& layer_name = layer_param.name();
-		CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0)
-				<< "Specify either include rules or exclude rules; not both.";
-		// If no include rules are specified, the layer is included by default and
-		// only excluded if it meets one of the exclude rules.
-		bool layer_included = (layer_param.include_size() == 0);
-		for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) {
-			if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) {
-				layer_included = false;
-			}
-		}
-		for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) {
-			if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) {
-				layer_included = true;
-			}
-		}
-		if (layer_included) {
-			param_filtered->add_layer()->CopyFrom(layer_param);
-		}
-	}
-}
-
-template <typename Dtype>
-bool Net<Dtype>::StateMeetsRule(const NetState& state,
-		const NetStateRule& rule, const string& layer_name) {
-	// Check whether the rule is broken due to phase.
-	if (rule.has_phase()) {
-		if (rule.phase() != state.phase()) {
-			LOG(INFO) << "The NetState phase (" << state.phase()
-					<< ") differed from the phase (" << rule.phase()
-					<< ") specified by a rule in layer " << layer_name;
-			return false;
-		}
-	}
-	// Check whether the rule is broken due to min level.
-	if (rule.has_min_level()) {
-		if (state.level() < rule.min_level()) {
-			LOG(INFO) << "The NetState level (" << state.level()
-					<< ") is above the min_level (" << rule.min_level()
-					<< ") specified by a rule in layer " << layer_name;
-			return false;
-		}
-	}
-	// Check whether the rule is broken due to max level.
-	if (rule.has_max_level()) {
-		if (state.level() > rule.max_level()) {
-			LOG(INFO) << "The NetState level (" << state.level()
-					<< ") is above the max_level (" << rule.max_level()
-					<< ") specified by a rule in layer " << layer_name;
-			return false;
-		}
-	}
-	// Check whether the rule is broken due to stage. The NetState must
-	// contain ALL of the rule's stages to meet it.
-	for (int i = 0; i < rule.stage_size(); ++i) {
-		// Check that the NetState contains the rule's ith stage.
-		bool has_stage = false;
-		for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
-			if (rule.stage(i) == state.stage(j)) {
-				has_stage = true;
-			}
-		}
-		if (!has_stage) {
-			LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
-					<< "' specified by a rule in layer " << layer_name;
-			return false;
-		}
-	}
-	// Check whether the rule is broken due to not_stage. The NetState must
-	// contain NONE of the rule's not_stages to meet it.
-	for (int i = 0; i < rule.not_stage_size(); ++i) {
-		// Check that the NetState contains the rule's ith not_stage.
-		bool has_stage = false;
-		for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
-			if (rule.not_stage(i) == state.stage(j)) {
-				has_stage = true;
-			}
-		}
-		if (has_stage) {
-			LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
-					<< "' specified by a rule in layer " << layer_name;
-			return false;
-		}
-	}
-	return true;
+    NetParameter* param_filtered) {
+  NetState net_state(param.state());
+  param_filtered->CopyFrom(param);
+  param_filtered->clear_layer();
+  for (int i = 0; i < param.layer_size(); ++i) {
+    const LayerParameter& layer_param = param.layer(i);
+    const string& layer_name = layer_param.name();
+    CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0)
+        << "Specify either include rules or exclude rules; not both.";
+    // If no include rules are specified, the layer is included by default and
+    // only excluded if it meets one of the exclude rules.
+    bool layer_included = (layer_param.include_size() == 0);
+    for (int j = 0; layer_included && j < layer_param.exclude_size(); ++j) {
+      if (StateMeetsRule(net_state, layer_param.exclude(j), layer_name)) {
+        layer_included = false;
+      }
+    }
+    for (int j = 0; !layer_included && j < layer_param.include_size(); ++j) {
+      if (StateMeetsRule(net_state, layer_param.include(j), layer_name)) {
+        layer_included = true;
+      }
+    }
+    if (layer_included) {
+      param_filtered->add_layer()->CopyFrom(layer_param);
+    }
+  }
+}
+
+template <typename Dtype>
+bool Net<Dtype>::StateMeetsRule(const NetState& state, const NetStateRule& rule,
+    const string& layer_name) {
+  // Check whether the rule is broken due to phase.
+  if (rule.has_phase()) {
+    if (rule.phase() != state.phase()) {
+      LOG(INFO) << "The NetState phase (" << state.phase()
+          << ") differed from the phase (" << rule.phase()
+          << ") specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  // Check whether the rule is broken due to min level.
+  if (rule.has_min_level()) {
+    if (state.level() < rule.min_level()) {
+      LOG(INFO) << "The NetState level (" << state.level()
+          << ") is above the min_level (" << rule.min_level()
+          << ") specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  // Check whether the rule is broken due to max level.
+  if (rule.has_max_level()) {
+    if (state.level() > rule.max_level()) {
+      LOG(INFO) << "The NetState level (" << state.level()
+          << ") is above the max_level (" << rule.max_level()
+          << ") specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  // Check whether the rule is broken due to stage. The NetState must
+  // contain ALL of the rule's stages to meet it.
+  for (int i = 0; i < rule.stage_size(); ++i) {
+    // Check that the NetState contains the rule's ith stage.
+    bool has_stage = false;
+    for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
+      if (rule.stage(i) == state.stage(j)) {
+        has_stage = true;
+      }
+    }
+    if (!has_stage) {
+      LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
+          << "' specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  // Check whether the rule is broken due to not_stage. The NetState must
+  // contain NONE of the rule's not_stages to meet it.
+  for (int i = 0; i < rule.not_stage_size(); ++i) {
+    // Check that the NetState contains the rule's ith not_stage.
+    bool has_stage = false;
+    for (int j = 0; !has_stage && j < state.stage_size(); ++j) {
+      if (rule.not_stage(i) == state.stage(j)) {
+        has_stage = true;
+      }
+    }
+    if (has_stage) {
+      LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
+          << "' specified by a rule in layer " << layer_name;
+      return false;
+    }
+  }
+  return true;
 }
 
 // Helper for Net::Init: add a new input or top blob to the net.  (Inputs have
 // layer_id == -1, tops have layer_id >= 0.)
 template <typename Dtype>
 void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
-		const int top_id, set<string>* available_blobs,
-		map<string, int>* blob_name_to_idx) {
-	shared_ptr < LayerParameter
-			> layer_param(
-					(layer_id >= 0) ?
-														(new LayerParameter(param.layer(layer_id))) :
-														NULL);
-	const string& blob_name =
-			layer_param ?
-					(layer_param->top_size() > top_id ?
-																							layer_param->top(top_id) :
-																							"(automatic)") :
-					param.input(top_id);
-	// Check if we are doing in-place computation
-	if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
-			blob_name == layer_param->bottom(top_id)) {
-		// In-place computation
-		LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
-		top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
-		top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
-	} else if (blob_name_to_idx &&
-			blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
-		// If we are not doing in-place computation but have duplicated blobs,
-		// raise an error.
-		LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
-	} else {
-		// Normal output.
-		if (layer_param) {
-			LOG(INFO) << layer_param->name() << " -> " << blob_name;
-		} else {
-			LOG(INFO) << "Input " << top_id << " -> " << blob_name;
-		}
-		shared_ptr < Blob<Dtype> > blob_pointer(new Blob<Dtype>());
-		const int blob_id = blobs_.size();
-		blobs_.push_back(blob_pointer);
-		blob_names_.push_back(blob_name);
-		blob_need_backward_.push_back(false);
-		if (blob_name_to_idx) {
-			(*blob_name_to_idx)[blob_name] = blob_id;
-		}
-		if (layer_id == -1) {
-			// Set the (explicitly specified) dimensions of the input blob.
-			if (param.input_dim_size() > 0) {
-				blob_pointer->Reshape(param.input_dim(top_id * 4),
-						param.input_dim(top_id * 4 + 1),
-						param.input_dim(top_id * 4 + 2),
-						param.input_dim(top_id * 4 + 3));
-			} else {
-				blob_pointer->Reshape(param.input_shape(top_id));
-			}
-			net_input_blob_indices_.push_back(blob_id);
-			net_input_blobs_.push_back(blob_pointer.get());
-		} else {
-			top_id_vecs_[layer_id].push_back(blob_id);
-			top_vecs_[layer_id].push_back(blob_pointer.get());
-		}
-	}
-	if (available_blobs) {
-		available_blobs->insert(blob_name);
-	}
+    const int top_id, set<string>* available_blobs,
+    map<string, int>* blob_name_to_idx) {
+  shared_ptr < LayerParameter
+      > layer_param(
+          (layer_id >= 0) ? (new LayerParameter(param.layer(layer_id))) : NULL);
+  const string& blob_name =
+      layer_param ?
+          (layer_param->top_size() > top_id ?
+              layer_param->top(top_id) : "(automatic)") :
+          param.input(top_id);
+  // Check if we are doing in-place computation
+  if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id
+      && blob_name == layer_param->bottom(top_id)) {
+    // In-place computation
+    LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
+    top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
+    top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
+  } else if (blob_name_to_idx
+      && blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
+    // If we are not doing in-place computation but have duplicated blobs,
+    // raise an error.
+    LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
+  } else {
+    // Normal output.
+    if (layer_param) {
+      LOG(INFO) << layer_param->name() << " -> " << blob_name;
+    } else {
+      LOG(INFO) << "Input " << top_id << " -> " << blob_name;
+    }
+    shared_ptr < Blob<Dtype> > blob_pointer(new Blob<Dtype>());
+    const int blob_id = blobs_.size();
+    blobs_.push_back(blob_pointer);
+    blob_names_.push_back(blob_name);
+    blob_need_backward_.push_back(false);
+    if (blob_name_to_idx) {
+      (*blob_name_to_idx)[blob_name] = blob_id;
+    }
+    if (layer_id == -1) {
+      // Set the (explicitly specified) dimensions of the input blob.
+      if (param.input_dim_size() > 0) {
+        blob_pointer->Reshape(param.input_dim(top_id * 4),
+            param.input_dim(top_id * 4 + 1), param.input_dim(top_id * 4 + 2),
+            param.input_dim(top_id * 4 + 3));
+      } else {
+        blob_pointer->Reshape(param.input_shape(top_id));
+      }
+      net_input_blob_indices_.push_back(blob_id);
+      net_input_blobs_.push_back(blob_pointer.get());
+    } else {
+      top_id_vecs_[layer_id].push_back(blob_id);
+      top_vecs_[layer_id].push_back(blob_pointer.get());
+    }
+  }
+  if (available_blobs) {
+    available_blobs->insert(blob_name);
+  }
 }
 
 // Helper for Net::Init: add a new bottom blob to the net.
 template <typename Dtype>
 int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
-		const int bottom_id, set<string>* available_blobs,
-		map<string, int>* blob_name_to_idx) {
-	const LayerParameter& layer_param = param.layer(layer_id);
-	const string& blob_name = layer_param.bottom(bottom_id);
-	if (available_blobs->find(blob_name) == available_blobs->end()) {
-		LOG(FATAL) << "Unknown blob input " << blob_name
-				<< " (at index " << bottom_id << ") to layer " << layer_id;
-	}
-	const int blob_id = (*blob_name_to_idx)[blob_name];
-	LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
-	bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
-	bottom_id_vecs_[layer_id].push_back(blob_id);
-	available_blobs->erase(blob_name);
-	bool propagate_down = true;
-	// Check if the backpropagation on bottom_id should be skipped
-	if (layer_param.propagate_down_size() > 0)
-		propagate_down = layer_param.propagate_down(bottom_id);
-	const bool need_backward = blob_need_backward_[blob_id] &&
-			propagate_down;
-	bottom_need_backward_[layer_id].push_back(need_backward);
-	return blob_id;
+    const int bottom_id, set<string>* available_blobs,
+    map<string, int>* blob_name_to_idx) {
+  const LayerParameter& layer_param = param.layer(layer_id);
+  const string& blob_name = layer_param.bottom(bottom_id);
+  if (available_blobs->find(blob_name) == available_blobs->end()) {
+    LOG(FATAL) << "Unknown blob input " << blob_name << " (at index "
+        << bottom_id << ") to layer " << layer_id;
+  }
+  const int blob_id = (*blob_name_to_idx)[blob_name];
+  LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
+  bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
+  bottom_id_vecs_[layer_id].push_back(blob_id);
+  available_blobs->erase(blob_name);
+  bool propagate_down = true;
+  // Check if the backpropagation on bottom_id should be skipped
+  if (layer_param.propagate_down_size() > 0)
+    propagate_down = layer_param.propagate_down(bottom_id);
+  const bool need_backward = blob_need_backward_[blob_id] && propagate_down;
+  bottom_need_backward_[layer_id].push_back(need_backward);
+  return blob_id;
 }
 
 template <typename Dtype>
 void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
-		const int param_id) {
-	const LayerParameter& layer_param = layers_[layer_id]->layer_param();
-	const int param_size = layer_param.param_size();
-	string param_name =
-			(param_size > param_id) ? layer_param.param(param_id).name() : "";
-	if (param_name.size()) {
-		param_display_names_.push_back(param_name);
-	} else {
-		ostringstream param_display_name;
-		param_display_name << param_id;
-		param_display_names_.push_back(param_display_name.str());
-	}
-	const int net_param_id = params_.size();
-	params_.push_back(layers_[layer_id]->blobs()[param_id]);
-	param_id_vecs_[layer_id].push_back(net_param_id);
-	param_layer_indices_.push_back(make_pair(layer_id, param_id));
-	if (!param_size || !param_name.size() || (param_name.size() &&
-			param_names_index_.find(param_name) == param_names_index_.end())) {
-		// This layer "owns" this parameter blob -- it is either anonymous
-		// (i.e., not given a param_name) or explicitly given a name that we
-		// haven't already seen.
-		param_owners_.push_back(-1);
-		if (param_name.size()) {
-			param_names_index_[param_name] = net_param_id;
-		}
-	} else {
-		// Named param blob with name we've seen before: share params
-		const int owner_net_param_id = param_names_index_[param_name];
-		param_owners_.push_back(owner_net_param_id);
-		const pair<int, int>& owner_index =
-				param_layer_indices_[owner_net_param_id];
-		const int owner_layer_id = owner_index.first;
-		const int owner_param_id = owner_index.second;
-		LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
-				<< "layer '" << layer_names_[owner_layer_id] << "', param "
-				<< "index " << owner_param_id;
-		Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get();
-		Blob < Dtype > *owner_blob =
-				layers_[owner_layer_id]->blobs()[owner_param_id].get();
-		const int param_size = layer_param.param_size();
-		if (param_size > param_id && (layer_param.param(param_id).share_mode() ==
-				ParamSpec_DimCheckMode_PERMISSIVE)) {
-			// Permissive dimension checking -- only check counts are the same.
-			CHECK_EQ(this_blob->count(), owner_blob->count())
-					<< "Shared parameter blobs must have the same count.";
-		} else {
-			// Strict dimension checking -- all dims must be the same.
-			CHECK(this_blob->shape() == owner_blob->shape());
-		}
-		layers_[layer_id]->blobs()[param_id]->ShareData(
-				*layers_[owner_layer_id]->blobs()[owner_param_id]);
-	}
+    const int param_id) {
+  const LayerParameter& layer_param = layers_[layer_id]->layer_param();
+  const int param_size = layer_param.param_size();
+  string param_name =
+      (param_size > param_id) ? layer_param.param(param_id).name() : "";
+  if (param_name.size()) {
+    param_display_names_.push_back(param_name);
+  } else {
+    ostringstream param_display_name;
+    param_display_name << param_id;
+    param_display_names_.push_back(param_display_name.str());
+  }
+  const int net_param_id = params_.size();
+  params_.push_back(layers_[layer_id]->blobs()[param_id]);
+  param_id_vecs_[layer_id].push_back(net_param_id);
+  param_layer_indices_.push_back(make_pair(layer_id, param_id));
+  if (!param_size || !param_name.size()
+      || (param_name.size()
+          && param_names_index_.find(param_name) == param_names_index_.end())) {
+    // This layer "owns" this parameter blob -- it is either anonymous
+    // (i.e., not given a param_name) or explicitly given a name that we
+    // haven't already seen.
+    param_owners_.push_back(-1);
+    if (param_name.size()) {
+      param_names_index_[param_name] = net_param_id;
+    }
+  } else {
+    // Named param blob with name we've seen before: share params
+    const int owner_net_param_id = param_names_index_[param_name];
+    param_owners_.push_back(owner_net_param_id);
+    const pair<int, int>& owner_index = param_layer_indices_[owner_net_param_id];
+    const int owner_layer_id = owner_index.first;
+    const int owner_param_id = owner_index.second;
+    LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
+        << "layer '" << layer_names_[owner_layer_id] << "', param " << "index "
+        << owner_param_id;
+    Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get();
+    Blob < Dtype > *owner_blob =
+        layers_[owner_layer_id]->blobs()[owner_param_id].get();
+    const int param_size = layer_param.param_size();
+    if (param_size > param_id
+        && (layer_param.param(param_id).share_mode()
+            == ParamSpec_DimCheckMode_PERMISSIVE)) {
+      // Permissive dimension checking -- only check counts are the same.
+      CHECK_EQ(this_blob->count(), owner_blob->count())
+          << "Shared parameter blobs must have the same count.";
+    } else {
+      // Strict dimension checking -- all dims must be the same.
+      CHECK(this_blob->shape() == owner_blob->shape());
+    }
+    layers_[layer_id]->blobs()[param_id]->ShareData(
+        *layers_[owner_layer_id]->blobs()[owner_param_id]);
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::GetLearningRateAndWeightDecay() {
-	LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
-	ParamSpec default_param_spec;
-	for (int i = 0; i < layers_.size(); ++i) {
-		vector < shared_ptr<Blob<Dtype> > > &layer_blobs = layers_[i]->blobs();
-		for (int j = 0; j < layer_blobs.size(); ++j) {
-			const ParamSpec* param_spec =
-					(layers_[i]->layer_param().param_size() > j) ?
-							&layers_[i]->layer_param().param(j) : &default_param_spec;
-			params_lr_.push_back(param_spec->lr_mult());
-			params_weight_decay_.push_back(param_spec->decay_mult());
-		}
-	}
+  LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
+  ParamSpec default_param_spec;
+  for (int i = 0; i < layers_.size(); ++i) {
+    vector < shared_ptr<Blob<Dtype> > > &layer_blobs = layers_[i]->blobs();
+    for (int j = 0; j < layer_blobs.size(); ++j) {
+      const ParamSpec* param_spec =
+          (layers_[i]->layer_param().param_size() > j) ?
+              &layers_[i]->layer_param().param(j) : &default_param_spec;
+      params_lr_.push_back(param_spec->lr_mult());
+      params_weight_decay_.push_back(param_spec->decay_mult());
+    }
+  }
 }
 
 template <typename Dtype>
 Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
-	CHECK_GE(start, 0);
-	CHECK_LT(end, layers_.size());
-	Dtype loss = 0;
-	if (debug_info_) {
-		for (int i = 0; i < net_input_blobs_.size(); ++i) {
-			InputDebugInfo(i);
-		}
-	}
-
-	CPUTimer forward_timer;
-	CPUTimer layer_timer;
-	forward_timer.Start();
-
-	for (int i = start; i <= end; ++i) {
-		layer_timer.Start();
-		Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
-		loss += layer_loss;
-		if (debug_info_) {
-			ForwardDebugInfo(i);
-		}
-		clFinish(amdDevice.CommandQueue);
-		layer_timer.Stop();
-		printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
-				layer_timer.MilliSeconds());
-	}
-
-	forward_timer.Stop();
-	printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds());
-
-	return loss;
+  CHECK_GE(start, 0);
+  CHECK_LT(end, layers_.size());
+  Dtype loss = 0;
+  if (debug_info_) {
+    for (int i = 0; i < net_input_blobs_.size(); ++i) {
+      InputDebugInfo(i);
+    }
+  }
+
+  CPUTimer forward_timer;
+  CPUTimer layer_timer;
+  forward_timer.Start();
+
+  for (int i = start; i <= end; ++i) {
+    layer_timer.Start();
+    Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
+    loss += layer_loss;
+    if (debug_info_) {
+      ForwardDebugInfo(i);
+    }
+    clFinish(amdDevice.CommandQueue);
+    layer_timer.Stop();
+    printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
+        layer_timer.MilliSeconds());
+  }
+
+  forward_timer.Stop();
+  printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds());
+
+  return loss;
 }
 
 template <typename Dtype>
 Dtype Net<Dtype>::ForwardFrom(int start) {
-	return ForwardFromTo(start, layers_.size() - 1);
+  return ForwardFromTo(start, layers_.size() - 1);
 }
 
 template <typename Dtype>
 Dtype Net<Dtype>::ForwardTo(int end) {
-	return ForwardFromTo(0, end);
+  return ForwardFromTo(0, end);
 }
 
 template <typename Dtype>
 const vector<Blob<Dtype>*>& Net<Dtype>::ForwardPrefilled(Dtype* loss) {
-	if (loss != NULL) {
-		*loss = ForwardFromTo(0, layers_.size() - 1);
-	} else {
-		ForwardFromTo(0, layers_.size() - 1);
-	}
-	return net_output_blobs_;
+  if (loss != NULL) {
+    *loss = ForwardFromTo(0, layers_.size() - 1);
+  } else {
+    ForwardFromTo(0, layers_.size() - 1);
+  }
+  return net_output_blobs_;
 }
 
 template <typename Dtype>
 const vector<Blob<Dtype>*>& Net<Dtype>::Forward(
-		const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
-	// Copy bottom to internal bottom
-	for (int i = 0; i < bottom.size(); ++i) {
-		net_input_blobs_[i]->CopyFrom(*bottom[i]);
-	}
-	return ForwardPrefilled(loss);
+    const vector<Blob<Dtype>*> & bottom, Dtype* loss) {
+  // Copy bottom to internal bottom
+  for (int i = 0; i < bottom.size(); ++i) {
+    net_input_blobs_[i]->CopyFrom(*bottom[i]);
+  }
+  return ForwardPrefilled(loss);
 }
 
 template <typename Dtype>
 string Net<Dtype>::Forward(const string& input_blob_protos, Dtype* loss) {
-	BlobProtoVector blob_proto_vec;
-	if (net_input_blobs_.size()) {
-		blob_proto_vec.ParseFromString(input_blob_protos);
-		CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size())
-				<< "Incorrect input size.";
-		for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) {
-			net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i));
-		}
-	}
-	ForwardPrefilled(loss);
-	blob_proto_vec.Clear();
-	for (int i = 0; i < net_output_blobs_.size(); ++i) {
-		net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs());
-	}
-	string output;
-	blob_proto_vec.SerializeToString(&output);
-	return output;
+  BlobProtoVector blob_proto_vec;
+  if (net_input_blobs_.size()) {
+    blob_proto_vec.ParseFromString(input_blob_protos);
+    CHECK_EQ(blob_proto_vec.blobs_size(), net_input_blobs_.size())
+        << "Incorrect input size.";
+    for (int i = 0; i < blob_proto_vec.blobs_size(); ++i) {
+      net_input_blobs_[i]->FromProto(blob_proto_vec.blobs(i));
+    }
+  }
+  ForwardPrefilled(loss);
+  blob_proto_vec.Clear();
+  for (int i = 0; i < net_output_blobs_.size(); ++i) {
+    net_output_blobs_[i]->ToProto(blob_proto_vec.add_blobs());
+  }
+  string output;
+  blob_proto_vec.SerializeToString(&output);
+  return output;
 }
 
 template <typename Dtype>
 void Net<Dtype>::BackwardFromTo(int start, int end) {
-	CHECK_GE(end, 0);
-	CHECK_LT(start, layers_.size());
+  CHECK_GE(end, 0);
+  CHECK_LT(start, layers_.size());
 
-	CPUTimer backward_timer;
-	CPUTimer layer_timer;
-	backward_timer.Start();
+  CPUTimer backward_timer;
+  CPUTimer layer_timer;
+  backward_timer.Start();
 
-	for (int i = start; i >= end; --i) {
-		layer_timer.Start();
-		if (layer_need_backward_[i]) {
-			layers_[i]->Backward(
-					top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
-			if (debug_info_) {
-				BackwardDebugInfo(i);
-			}
-			clFinish(amdDevice.CommandQueue);
-			layer_timer.Start();
-			printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
-					layer_timer.MilliSeconds());
-		}
-	}
+  for (int i = start; i >= end; --i) {
+    layer_timer.Start();
+    if (layer_need_backward_[i]) {
+      layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i],
+          bottom_vecs_[i]);
+      if (debug_info_) {
+        BackwardDebugInfo(i);
+      }
+      clFinish(amdDevice.CommandQueue);
+      layer_timer.Start();
+      printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
+          layer_timer.MilliSeconds());
+    }
+  }
 
-	backward_timer.Stop();
-	printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds());
+  backward_timer.Stop();
+  printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds());
 }
 
 template <typename Dtype>
 void Net<Dtype>::InputDebugInfo(const int input_id) {
-	const Blob<Dtype>& blob = *net_input_blobs_[input_id];
-	const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
-	const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-	LOG(INFO) << "    [Forward] "
-			<< "Input " << blob_name << " data: " << data_abs_val_mean;
+  const Blob<Dtype>& blob = *net_input_blobs_[input_id];
+  const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
+  const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+  LOG(INFO) << "    [Forward] " << "Input " << blob_name << " data: "
+      << data_abs_val_mean;
 }
 
 template <typename Dtype>
 void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
-	for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
-		const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
-		const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
-		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-		LOG(INFO) << "    [Forward] "
-				<< "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
-				<< " data: " << data_abs_val_mean;
-	}
-	for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-			++param_id) {
-		const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
-		const int net_param_id = param_id_vecs_[layer_id][param_id];
-		const string& blob_name = param_display_names_[net_param_id];
-		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-		LOG(INFO) << "    [Forward] "
-				<< "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
-				<< " data: " << data_abs_val_mean;
-	}
+  for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
+    const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
+    const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
+    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+    LOG(INFO) << "    [Forward] " << "Layer " << layer_names_[layer_id]
+        << ", top blob " << blob_name << " data: " << data_abs_val_mean;
+  }
+  for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+      ++param_id) {
+    const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
+    const int net_param_id = param_id_vecs_[layer_id][param_id];
+    const string& blob_name = param_display_names_[net_param_id];
+    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+    LOG(INFO) << "    [Forward] " << "Layer " << layer_names_[layer_id]
+        << ", param blob " << blob_name << " data: " << data_abs_val_mean;
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
-	const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
-	for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
-		if (!bottom_need_backward_[layer_id][bottom_id]) {
-			continue;
-		}
-		const Blob<Dtype>& blob = *bottom_vec[bottom_id];
-		const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-		const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-		LOG(INFO) << "    [Backward] "
-				<< "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
-				<< " diff: " << diff_abs_val_mean;
-	}
-	for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
-			++param_id) {
-		if (!layers_[layer_id]->param_propagate_down(param_id)) {
-			continue;
-		}
-		const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
-		const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-		LOG(INFO) << "    [Backward] "
-				<< "Layer " << layer_names_[layer_id] << ", param blob " << param_id
-				<< " diff: " << diff_abs_val_mean;
-	}
+  const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
+  for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
+    if (!bottom_need_backward_[layer_id][bottom_id]) {
+      continue;
+    }
+    const Blob<Dtype>& blob = *bottom_vec[bottom_id];
+    const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
+    const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+    LOG(INFO) << "    [Backward] " << "Layer " << layer_names_[layer_id]
+        << ", bottom blob " << blob_name << " diff: " << diff_abs_val_mean;
+  }
+  for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
+      ++param_id) {
+    if (!layers_[layer_id]->param_propagate_down(param_id)) {
+      continue;
+    }
+    const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
+    const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+    LOG(INFO) << "    [Backward] " << "Layer " << layer_names_[layer_id]
+        << ", param blob " << param_id << " diff: " << diff_abs_val_mean;
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::UpdateDebugInfo(const int param_id) {
-	const Blob<Dtype>& blob = *params_[param_id];
-	const int param_owner = param_owners_[param_id];
-	const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
-	const string& param_display_name = param_display_names_[param_id];
-	const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-	if (param_owner < 0) {
-		const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-		LOG(INFO) << "    [Update] Layer " << layer_name
-				<< ", param " << param_display_name
-				<< " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
-	} else {
-		const string& owner_layer_name =
-				layer_names_[param_layer_indices_[param_owner].first];
-		LOG(INFO) << "    [Update] Layer " << layer_name
-				<< ", param blob " << param_display_name
-				<< " (owned by layer " << owner_layer_name << ", "
-				<< "param " << param_display_names_[param_owners_[param_id]] << ")"
-				<< " diff: " << diff_abs_val_mean;
-	}
+  const Blob<Dtype>& blob = *params_[param_id];
+  const int param_owner = param_owners_[param_id];
+  const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
+  const string& param_display_name = param_display_names_[param_id];
+  const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
+  if (param_owner < 0) {
+    const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
+    LOG(INFO) << "    [Update] Layer " << layer_name << ", param "
+        << param_display_name << " data: " << data_abs_val_mean << "; diff: "
+        << diff_abs_val_mean;
+  } else {
+    const string& owner_layer_name =
+        layer_names_[param_layer_indices_[param_owner].first];
+    LOG(INFO) << "    [Update] Layer " << layer_name << ", param blob "
+        << param_display_name << " (owned by layer " << owner_layer_name << ", "
+        << "param " << param_display_names_[param_owners_[param_id]] << ")"
+        << " diff: " << diff_abs_val_mean;
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
-	int num_source_layers = other->layers().size();
-	for (int i = 0; i < num_source_layers; ++i) {
-		Layer < Dtype > *source_layer = other->layers()[i].get();
-		const string& source_layer_name = other->layer_names()[i];
-		int target_layer_id = 0;
-		while (target_layer_id != layer_names_.size() &&
-				layer_names_[target_layer_id] != source_layer_name) {
-			++target_layer_id;
-		}
-		if (target_layer_id == layer_names_.size()) {
-			DLOG(INFO) << "Ignoring source layer " << source_layer_name;
-			continue;
-		}
-		DLOG(INFO) << "Copying source layer " << source_layer_name;
-		vector < shared_ptr<Blob<Dtype> > > &target_blobs =
-				layers_[target_layer_id]->blobs();
-		CHECK_EQ(target_blobs.size(), source_layer->blobs().size())
-				<< "Incompatible number of blobs for layer " << source_layer_name;
-		for (int j = 0; j < target_blobs.size(); ++j) {
-			Blob < Dtype > *source_blob = source_layer->blobs()[j].get();
-			CHECK(target_blobs[j]->shape() == source_blob->shape());
-			target_blobs[j]->ShareData(*source_blob);
-		}
-	}
+  int num_source_layers = other->layers().size();
+  for (int i = 0; i < num_source_layers; ++i) {
+    Layer < Dtype > *source_layer = other->layers()[i].get();
+    const string& source_layer_name = other->layer_names()[i];
+    int target_layer_id = 0;
+    while (target_layer_id != layer_names_.size()
+        && layer_names_[target_layer_id] != source_layer_name) {
+      ++target_layer_id;
+    }
+    if (target_layer_id == layer_names_.size()) {
+      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
+      continue;
+    }
+    DLOG(INFO) << "Copying source layer " << source_layer_name;
+    vector < shared_ptr<Blob<Dtype> > > &target_blobs =
+        layers_[target_layer_id]->blobs();
+    CHECK_EQ(target_blobs.size(), source_layer->blobs().size())
+        << "Incompatible number of blobs for layer " << source_layer_name;
+    for (int j = 0; j < target_blobs.size(); ++j) {
+      Blob < Dtype > *source_blob = source_layer->blobs()[j].get();
+      CHECK(target_blobs[j]->shape() == source_blob->shape());
+      target_blobs[j]->ShareData(*source_blob);
+    }
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::BackwardFrom(int start) {
-	BackwardFromTo(start, 0);
+  BackwardFromTo(start, 0);
 }
 
 template <typename Dtype>
 void Net<Dtype>::BackwardTo(int end) {
-	BackwardFromTo(layers_.size() - 1, end);
+  BackwardFromTo(layers_.size() - 1, end);
 }
 
 template <typename Dtype>
 void Net<Dtype>::Backward() {
-	BackwardFromTo(layers_.size() - 1, 0);
-	if (debug_info_) {
-		Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0;
-		for (int i = 0; i < params_.size(); ++i) {
-			if (param_owners_[i] >= 0) {
-				continue;
-			}
-			asum_data += params_[i]->asum_data();
-			asum_diff += params_[i]->asum_diff();
-			sumsq_data += params_[i]->sumsq_data();
-			sumsq_diff += params_[i]->sumsq_diff();
-		}
-		const Dtype l2norm_data = std::sqrt(sumsq_data);
-		const Dtype l2norm_diff = std::sqrt(sumsq_diff);
-		LOG(ERROR) << "    [Backward] All net params (data, diff): "
-				<< "L1 norm = (" << asum_data << ", " << asum_diff << "); "
-				<< "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
-	}
+  BackwardFromTo(layers_.size() - 1, 0);
+  if (debug_info_) {
+    Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0;
+    for (int i = 0; i < params_.size(); ++i) {
+      if (param_owners_[i] >= 0) {
+        continue;
+      }
+      asum_data += params_[i]->asum_data();
+      asum_diff += params_[i]->asum_diff();
+      sumsq_data += params_[i]->sumsq_data();
+      sumsq_diff += params_[i]->sumsq_diff();
+    }
+    const Dtype l2norm_data = std::sqrt(sumsq_data);
+    const Dtype l2norm_diff = std::sqrt(sumsq_diff);
+    LOG(ERROR) << "    [Backward] All net params (data, diff): "
+        << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
+        << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::Reshape() {
-	for (int i = 0; i < layers_.size(); ++i) {
-		layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
-	}
+  for (int i = 0; i < layers_.size(); ++i) {
+    layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]);
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
-	int num_source_layers = param.layer_size();
-	for (int i = 0; i < num_source_layers; ++i) {
-		const LayerParameter& source_layer = param.layer(i);
-		const string& source_layer_name = source_layer.name();
-		int target_layer_id = 0;
-		while (target_layer_id != layer_names_.size() &&
-				layer_names_[target_layer_id] != source_layer_name) {
-			++target_layer_id;
-		}
-		if (target_layer_id == layer_names_.size()) {
-			DLOG(INFO) << "Ignoring source layer " << source_layer_name;
-			continue;
-		}
-		DLOG(INFO) << "Copying source layer " << source_layer_name;
-		vector < shared_ptr<Blob<Dtype> > > &target_blobs =
-				layers_[target_layer_id]->blobs();
-		CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
-				<< "Incompatible number of blobs for layer " << source_layer_name;
-		for (int j = 0; j < target_blobs.size(); ++j) {
-			const bool kReshape = false;
-			target_blobs[j]->FromProto(source_layer.blobs(j), kReshape);
-		}
-	}
+  int num_source_layers = param.layer_size();
+  for (int i = 0; i < num_source_layers; ++i) {
+    const LayerParameter& source_layer = param.layer(i);
+    const string& source_layer_name = source_layer.name();
+    int target_layer_id = 0;
+    while (target_layer_id != layer_names_.size()
+        && layer_names_[target_layer_id] != source_layer_name) {
+      ++target_layer_id;
+    }
+    if (target_layer_id == layer_names_.size()) {
+      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
+      continue;
+    }
+    DLOG(INFO) << "Copying source layer " << source_layer_name;
+    vector < shared_ptr<Blob<Dtype> > > &target_blobs =
+        layers_[target_layer_id]->blobs();
+    CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
+        << "Incompatible number of blobs for layer " << source_layer_name;
+    for (int j = 0; j < target_blobs.size(); ++j) {
+      const bool kReshape = false;
+      target_blobs[j]->FromProto(source_layer.blobs(j), kReshape);
+    }
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
-	NetParameter param;
-	ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
-	CopyTrainedLayersFrom(param);
+  NetParameter param;
+  ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
+  CopyTrainedLayersFrom(param);
 }
 
 template <typename Dtype>
 void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
-	param->Clear();
-	param->set_name(name_);
-	// Add bottom and top
-	for (int i = 0; i < net_input_blob_indices_.size(); ++i) {
-		param->add_input(blob_names_[net_input_blob_indices_[i]]);
-	}
-	DLOG(INFO) << "Serializing " << layers_.size() << " layers";
-	for (int i = 0; i < layers_.size(); ++i) {
-		LayerParameter* layer_param = param->add_layer();
-		for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) {
-			layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]);
-		}
-		for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
-			layer_param->add_top(blob_names_[top_id_vecs_[i][j]]);
-		}
-		layers_[i]->ToProto(layer_param, write_diff);
-	}
+  param->Clear();
+  param->set_name(name_);
+  // Add bottom and top
+  for (int i = 0; i < net_input_blob_indices_.size(); ++i) {
+    param->add_input(blob_names_[net_input_blob_indices_[i]]);
+  }
+  DLOG(INFO) << "Serializing " << layers_.size() << " layers";
+  for (int i = 0; i < layers_.size(); ++i) {
+    LayerParameter* layer_param = param->add_layer();
+    for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) {
+      layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]);
+    }
+    for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
+      layer_param->add_top(blob_names_[top_id_vecs_[i][j]]);
+    }
+    layers_[i]->ToProto(layer_param, write_diff);
+  }
 }
 
 template <typename Dtype>
 void Net<Dtype>::Update() {
-	// First, accumulate the diffs of any shared parameters into their owner's
-	// diff. (Assumes that the learning rate, weight decay, etc. have already been
-	// accounted for in the current diff.)
-	for (int i = 0; i < params_.size(); ++i) {
-		if (param_owners_[i] < 0) {
-			continue;
-		}
-		if (debug_info_) {
-			UpdateDebugInfo(i);
-		}
-		const int count = params_[i]->count();
-		const Dtype* this_diff;
-		Dtype* owner_diff;
-		this_diff = params_[i]->cpu_diff();
-		owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
-
-		switch (Caffe::mode()) {
-			case Caffe::CPU:
-				this_diff = params_[i]->cpu_diff();
-				owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
-				caffe_add(count, this_diff, owner_diff, owner_diff);
-				break;
-			case Caffe::GPU:
-				#ifndef CPU_ONLY
-				this_diff = params_[i]->gpu_diff();
-				owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
-				// caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
-				caffe_gpu_axpy < Dtype > (count, 1.0, this_diff, owner_diff);
+  // First, accumulate the diffs of any shared parameters into their owner's
+  // diff. (Assumes that the learning rate, weight decay, etc. have already been
+  // accounted for in the current diff.)
+  for (int i = 0; i < params_.size(); ++i) {
+    if (param_owners_[i] < 0) {
+      continue;
+    }
+    if (debug_info_) {
+      UpdateDebugInfo(i);
+    }
+    const int count = params_[i]->count();
+    const Dtype* this_diff;
+    Dtype* owner_diff;
+    this_diff = params_[i]->cpu_diff();
+    owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
+
+    switch (Caffe::mode()) {
+    case Caffe::CPU:
+      this_diff = params_[i]->cpu_diff();
+      owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
+      caffe_add(count, this_diff, owner_diff, owner_diff);
+      break;
+    case Caffe::GPU:
+#ifndef CPU_ONLY
+      this_diff = params_[i]->gpu_diff();
+      owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
+      // caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
+      caffe_gpu_axpy < Dtype > (count, 1.0, this_diff, owner_diff);
 #else
-				NO_GPU;
+      NO_GPU;
 #endif
-				break;
-			default:
-				LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-		}
-	}
-	// Now, update the owned parameters.
-	for (int i = 0; i < params_.size(); ++i) {
-		if (param_owners_[i] >= 0) {
-			continue;
-		}
-		if (debug_info_) {
-			UpdateDebugInfo(i);
-		}
-		params_[i]->Update();
-	}
+      break;
+    default:
+      LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+    }
+  }
+  // Now, update the owned parameters.
+  for (int i = 0; i < params_.size(); ++i) {
+    if (param_owners_[i] >= 0) {
+      continue;
+    }
+    if (debug_info_) {
+      UpdateDebugInfo(i);
+    }
+    params_[i]->Update();
+  }
 }
 
 template <typename Dtype>
 bool Net<Dtype>::has_blob(const string& blob_name) const {
-	return blob_names_index_.find(blob_name) != blob_names_index_.end();
+  return blob_names_index_.find(blob_name) != blob_names_index_.end();
 }
 
 template <typename Dtype>
 const shared_ptr<Blob<Dtype> > Net<Dtype>::blob_by_name(
-		const string& blob_name) const {
-	shared_ptr < Blob<Dtype> > blob_ptr;
-	if (has_blob(blob_name)) {
-		blob_ptr = blobs_[blob_names_index_.find(blob_name)->second];
-	} else {
-		blob_ptr.reset((Blob<Dtype>*) (NULL));
-		LOG(WARNING) << "Unknown blob name " << blob_name;
-	}
-	return blob_ptr;
+    const string& blob_name) const {
+  shared_ptr < Blob<Dtype> > blob_ptr;
+  if (has_blob(blob_name)) {
+    blob_ptr = blobs_[blob_names_index_.find(blob_name)->second];
+  } else {
+    blob_ptr.reset((Blob<Dtype>*) (NULL));
+    LOG(WARNING) << "Unknown blob name " << blob_name;
+  }
+  return blob_ptr;
 }
 
 template <typename Dtype>
 bool Net<Dtype>::has_layer(const string& layer_name) const {
-	return layer_names_index_.find(layer_name) != layer_names_index_.end();
+  return layer_names_index_.find(layer_name) != layer_names_index_.end();
 }
 
 template <typename Dtype>
 const shared_ptr<Layer<Dtype> > Net<Dtype>::layer_by_name(
-		const string& layer_name) const {
-	shared_ptr < Layer<Dtype> > layer_ptr;
-	if (has_layer(layer_name)) {
-		layer_ptr = layers_[layer_names_index_.find(layer_name)->second];
-	} else {
-		layer_ptr.reset((Layer<Dtype>*) (NULL));
-		LOG(WARNING) << "Unknown layer name " << layer_name;
-	}
-	return layer_ptr;
+    const string& layer_name) const {
+  shared_ptr < Layer<Dtype> > layer_ptr;
+  if (has_layer(layer_name)) {
+    layer_ptr = layers_[layer_names_index_.find(layer_name)->second];
+  } else {
+    layer_ptr.reset((Layer<Dtype>*) (NULL));
+    LOG(WARNING) << "Unknown layer name " << layer_name;
+  }
+  return layer_ptr;
 }
 
 INSTANTIATE_CLASS (Net);
diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl
index f23ff9a3..99d04575 100644
--- a/src/caffe/ocl/bnll_layer.cl
+++ b/src/caffe/ocl/bnll_layer.cl
@@ -28,25 +28,25 @@
 
 template <class T>
 __kernel void BNLLForward(const int n, __global const T* in, __global T* out) {
-	int index = get_global_id(0);
-	if (index < n) {
-		out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index]));
-	}
+  int index = get_global_id(0);
+  if (index < n) {
+    out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. + exp(in[index]));
+  }
 }
 template __attribute__((mangled_name(BNLLForward_float))) __kernel void BNLLForward(const int n, __global const float* in, __global float* out);
 template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLForward(const int n, __global const double* in, __global double* out);
 
 template <class T>
 __kernel void BNLLBackward(const int n, __global const T* in_diff,
-		__global const T* in_data, __global T* out_diff) {
-	int index = get_global_id(0);
-	if (index < n) {
-		T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD)));
-		out_diff[index] = in_diff[index] * expval / (expval + 1.);
-	}
+    __global const T* in_data, __global T* out_diff) {
+  int index = get_global_id(0);
+  if (index < n) {
+    T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD)));
+    out_diff[index] = in_diff[index] * expval / (expval + 1.);
+  }
 }
 
 template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff,
-		__global const float* in_data, __global float* out_diff);
+    __global const float* in_data, __global float* out_diff);
 template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff,
-		__global const double* in_data, __global double* out_diff);
+    __global const double* in_data, __global double* out_diff);
diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl
index ba5e1f54..a9663fce 100644
--- a/src/caffe/ocl/concat_layer.cl
+++ b/src/caffe/ocl/concat_layer.cl
@@ -26,29 +26,29 @@
 
 template <class T>
 __kernel void Concat(const int nthreads, __global const T* in_data,
-		const int forward, const int num_concats, const int concat_size,
-		const int top_concat_axis, const int bottom_concat_axis,
-		const int offset_concat_axis, __global T* out_data) {
-	int index = get_global_id(0);
-	if(index < nthreads) {
-		const int total_concat_size = concat_size * bottom_concat_axis;
-		const int concat_num = index / total_concat_size;
-		const int concat_index = index % total_concat_size;
-		const int top_index = concat_index +
-		(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
-		if (forward == 1) {
-			out_data[top_index] = in_data[index];
-		} else {
-			out_data[index] = in_data[top_index];
-		}
-	}
+    const int forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global T* out_data) {
+  int index = get_global_id(0);
+  if(index < nthreads) {
+    const int total_concat_size = concat_size * bottom_concat_axis;
+    const int concat_num = index / total_concat_size;
+    const int concat_index = index % total_concat_size;
+    const int top_index = concat_index +
+    (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
+    if (forward == 1) {
+      out_data[top_index] = in_data[index];
+    } else {
+      out_data[index] = in_data[top_index];
+    }
+  }
 }
 
 template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data,
-		const int forward, const int num_concats, const int concat_size,
-		const int top_concat_axis, const int bottom_concat_axis,
-		const int offset_concat_axis, __global float* out_data);
+    const int forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global float* out_data);
 template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data,
-		const int forward, const int num_concats, const int concat_size,
-		const int top_concat_axis, const int bottom_concat_axis,
-		const int offset_concat_axis, __global double* out_data);
+    const int forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, __global double* out_data);
diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl
index b6fdebc7..477f2ff4 100644
--- a/src/caffe/ocl/contrastive_loss_layer.cl
+++ b/src/caffe/ocl/contrastive_loss_layer.cl
@@ -26,39 +26,39 @@
 
 template <class Dtype>
 __kernel void CLLBackward(const int count, const int channels,
-		const Dtype margin, const bool legacy_version, const Dtype alpha,
-		__global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq,
-		__global Dtype *bottom_diff) {
-	int i = get_global_id(0);
-	if(i < count) {
-		int n = i / channels;  // the num index, to access y and dist_sq
-		if (static_cast<int>(y[n])) {  // similar pairs
-			bottom_diff[i] = alpha * diff[i];
-		} else {  // dissimilar pairs
-			Dtype mdist(0.0);
-			Dtype beta(0.0);
-			if (legacy_version) {
-				mdist = (margin - dist_sq[n]);
-				beta = -alpha;
-			} else {
-				Dtype dist = sqrt(dist_sq[n]);
-				mdist = (margin - dist);
-				beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
-			}
-			if (mdist > 0.0) {
-				bottom_diff[i] = beta;
-			} else {
-				bottom_diff[i] = 0;
-			}
-		}
-	}
+    const Dtype margin, const bool legacy_version, const Dtype alpha,
+    __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq,
+    __global Dtype *bottom_diff) {
+  int i = get_global_id(0);
+  if(i < count) {
+    int n = i / channels;  // the num index, to access y and dist_sq
+    if (static_cast<int>(y[n])) {  // similar pairs
+      bottom_diff[i] = alpha * diff[i];
+    } else {  // dissimilar pairs
+      Dtype mdist(0.0);
+      Dtype beta(0.0);
+      if (legacy_version) {
+        mdist = (margin - dist_sq[n]);
+        beta = -alpha;
+      } else {
+        Dtype dist = sqrt(dist_sq[n]);
+        mdist = (margin - dist);
+        beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i];
+      }
+      if (mdist > 0.0) {
+        bottom_diff[i] = beta;
+      } else {
+        bottom_diff[i] = 0;
+      }
+    }
+  }
 }
 
 template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels,
-		const float margin, const bool legacy_version, const float alpha,
-		__global const float* y, __global const float* diff, __global const float* dist_sq,
-		__global float *bottom_diff);
+    const float margin, const bool legacy_version, const float alpha,
+    __global const float* y, __global const float* diff, __global const float* dist_sq,
+    __global float *bottom_diff);
 template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels,
-		const double margin, const bool legacy_version, const double alpha,
-		__global const double* y, __global const double* diff, __global const double* dist_sq,
-		__global double *bottom_diff);
+    const double margin, const bool legacy_version, const double alpha,
+    __global const double* y, __global const double* diff, __global const double* dist_sq,
+    __global double *bottom_diff);
diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl
index bb2fc696..230c9715 100644
--- a/src/caffe/ocl/dropout_layer.cl
+++ b/src/caffe/ocl/dropout_layer.cl
@@ -26,18 +26,18 @@
 
 template <class T>
 __kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out) {
-	int index = get_global_id(0);
-	if (index < n)
-	out[index] = in[index] * scale * mask[index];
+  int index = get_global_id(0);
+  if (index < n)
+  out[index] = in[index] * scale * mask[index];
 }
 template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out);
 template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out);
 
 template <class T>
 __kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff) {
-	int index = get_global_id(0);
-	if (index < n)
-	out_diff[index] = in_diff[index] * scale * mask[index];
+  int index = get_global_id(0);
+  if (index < n)
+  out_diff[index] = in_diff[index] * scale * mask[index];
 }
 template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff);
 template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl
index 3f60a34f..88137dd7 100644
--- a/src/caffe/ocl/eltwise_layer.cl
+++ b/src/caffe/ocl/eltwise_layer.cl
@@ -26,48 +26,48 @@
 
 template <class Dtype>
 __kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a,
-		__global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data,
-		__global int* mask) {
-	int index = get_global_id(0);
-	if(index < nthreads) {
-		Dtype maxval = -FLT_MAX;
-		int maxidx = -1;
-		if (bottom_data_a[index] > bottom_data_b[index]) {
-			// only update for very first bottom_data blob (blob_idx == 0)
-			if (blob_idx == 0) {
-				maxval = bottom_data_a[index];
-				top_data[index] = maxval;
-				maxidx = blob_idx;
-				mask[index] = maxidx;
-			}
-		} else {
-			maxval = bottom_data_b[index];
-			top_data[index] = maxval;
-			maxidx = blob_idx + 1;
-			mask[index] = maxidx;
-		}
-	}
+    __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data,
+    __global int* mask) {
+  int index = get_global_id(0);
+  if(index < nthreads) {
+    Dtype maxval = -FLT_MAX;
+    int maxidx = -1;
+    if (bottom_data_a[index] > bottom_data_b[index]) {
+      // only update for very first bottom_data blob (blob_idx == 0)
+      if (blob_idx == 0) {
+        maxval = bottom_data_a[index];
+        top_data[index] = maxval;
+        maxidx = blob_idx;
+        mask[index] = maxidx;
+      }
+    } else {
+      maxval = bottom_data_b[index];
+      top_data[index] = maxval;
+      maxidx = blob_idx + 1;
+      mask[index] = maxidx;
+    }
+  }
 }
 template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a,
-		__global const float* bottom_data_b, const int blob_idx, __global float* top_data,
-		__global int* mask);
+    __global const float* bottom_data_b, const int blob_idx, __global float* top_data,
+    __global int* mask);
 template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a,
-		__global const double* bottom_data_b, const int blob_idx, __global double* top_data,
-		__global int* mask);
+    __global const double* bottom_data_b, const int blob_idx, __global double* top_data,
+    __global int* mask);
 
 template <class Dtype>
 __kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff,
-		const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) {
-	int index = get_global_id(0);
-	if(index < nthreads) {
-		Dtype gradient = 0;
-		if (mask[index] == blob_idx) {
-			gradient += top_diff[index];
-		}
-		bottom_diff[index] = gradient;
-	}
+    const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) {
+  int index = get_global_id(0);
+  if(index < nthreads) {
+    Dtype gradient = 0;
+    if (mask[index] == blob_idx) {
+      gradient += top_diff[index];
+    }
+    bottom_diff[index] = gradient;
+  }
 }
 template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff,
-		const int blob_idx, __global const int* mask, __global float* bottom_diff);
+    const int blob_idx, __global const int* mask, __global float* bottom_diff);
 template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff,
-		const int blob_idx, __global const int* mask, __global double* bottom_diff);
+    const int blob_idx, __global const int* mask, __global double* bottom_diff);
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index 46248024..09f240cf 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -26,31 +26,31 @@
 
 template <class T>
 __kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset) {
-	int index=get_global_id(0);
-	data_im = data_im + img_offset;
-	data_col = data_col + col_offset;
-	if(index < n) {
-		int w_out=index %width_col;
-		index /= width_col;
-		int h_out=index%height_col;
-		int channel_in = index/height_col;
-		int channel_out=channel_in *ksize *ksize;
-		int h_in = h_out *stride-pad;
-		int w_in = w_out *stride-pad;
-		data_col +=(channel_out *height_col + h_out) *width_col + w_out;
-		data_im +=(channel_in * height + h_in) *width + w_in;
-		int i=0,j=0;
-		for(i=0;i<ksize;++i) {
-			for(j=0;j<ksize;++j) {
-				int h = h_in+i;
-				int w = w_in+j;
-				if(h >= 0 && w >= 0 && h < height && w < width)
-				*data_col=data_im[i * width + j];
-				else *data_col=0;
-				data_col +=height_col *width_col;
-			}
-		}
-	}
+  int index=get_global_id(0);
+  data_im = data_im + img_offset;
+  data_col = data_col + col_offset;
+  if(index < n) {
+    int w_out=index %width_col;
+    index /= width_col;
+    int h_out=index%height_col;
+    int channel_in = index/height_col;
+    int channel_out=channel_in *ksize *ksize;
+    int h_in = h_out *stride-pad;
+    int w_in = w_out *stride-pad;
+    data_col +=(channel_out *height_col + h_out) *width_col + w_out;
+    data_im +=(channel_in * height + h_in) *width + w_in;
+    int i=0,j=0;
+    for(i=0;i<ksize;++i) {
+      for(j=0;j<ksize;++j) {
+        int h = h_in+i;
+        int w = w_in+j;
+        if(h >= 0 && w >= 0 && h < height && w < width)
+        *data_col=data_im[i * width + j];
+        else *data_col=0;
+        data_col +=height_col *width_col;
+      }
+    }
+  }
 }
 
 template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset);
@@ -59,34 +59,34 @@ template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const
 template <class T>
 __kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) {
 
-	int index = get_global_id(0);
+  int index = get_global_id(0);
 
-	data_im = data_im + img_offset;
-	data_col = data_col + col_offset;
+  data_im = data_im + img_offset;
+  data_col = data_col + col_offset;
 
-	int x_out = index % width_col;
-	int y_out = (index / width_col) % height_col;
-	int channel_in = (index / width_col / height_col) % channels;
-	int channel_out = channel_in * ksize * ksize;
-	int im_id = index / width_col / height_col / channels;
+  int x_out = index % width_col;
+  int y_out = (index / width_col) % height_col;
+  int channel_in = (index / width_col / height_col) % channels;
+  int channel_out = channel_in * ksize * ksize;
+  int im_id = index / width_col / height_col / channels;
 
-	int y_in = y_out * stride - pad;
-	int x_in = x_out * stride - pad;
-	int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col;
-	int offset_im = im_id * channels * height * width + channel_in * height * width;
+  int y_in = y_out * stride - pad;
+  int x_in = x_out * stride - pad;
+  int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col;
+  int offset_im = im_id * channels * height * width + channel_in * height * width;
 
-	for(int k_h = 0; k_h < ksize; k_h++) {
-		for(int k_w = 0; k_w < ksize; k_w++) {
-			int x_im = x_in + k_w;
-			int y_im = y_in + k_h;
-			int index_im = y_im * width + x_im;
-			int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
-			if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
-			data_col[offset_col + index_col] = data_im[offset_im + index_im];
-			else
-			data_col[offset_col + index_col] = 0;
-		}
-	}
+  for(int k_h = 0; k_h < ksize; k_h++) {
+    for(int k_w = 0; k_w < ksize; k_w++) {
+      int x_im = x_in + k_w;
+      int y_im = y_in + k_h;
+      int index_im = y_im * width + x_im;
+      int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
+      if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
+      data_col[offset_col + index_col] = data_im[offset_im + index_im];
+      else
+      data_col[offset_col + index_col] = 0;
+    }
+  }
 }
 
 template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum);
@@ -94,150 +94,150 @@ template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_o
 
 template <class T>
 __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		const int height_col, const int width_col,
-		__global T* data_col, const int col_offset) {
-	data_im = data_im + img_offset;
-	data_col = data_col + col_offset;
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    __global T* data_col, const int col_offset) {
+  data_im = data_im + img_offset;
+  data_col = data_col + col_offset;
 
-	int index = get_global_id(0);
-	if(index < n) {
-		int w_out = index % width_col;
-		int h_index = index / width_col;
-		int h_out = h_index % height_col;
-		int channel_in = h_index / height_col;
-		int channel_out = channel_in * kernel_h * kernel_w;
-		int h_in = h_out * stride_h - pad_h;
-		int w_in = w_out * stride_w - pad_w;
-		__global T* data_col_ptr = data_col;
-		data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
-		__global const T* data_im_ptr = data_im;
-		data_im_ptr += (channel_in * height + h_in) * width + w_in;
-		for (int i = 0; i < kernel_h; ++i) {
-			for (int j = 0; j < kernel_w; ++j) {
-				int h = h_in + i;
-				int w = w_in + j;
-				*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
-				data_im_ptr[i * width + j] : 0;
-				data_col_ptr += height_col * width_col;
-			}
-		}
-	}
+  int index = get_global_id(0);
+  if(index < n) {
+    int w_out = index % width_col;
+    int h_index = index / width_col;
+    int h_out = h_index % height_col;
+    int channel_in = h_index / height_col;
+    int channel_out = channel_in * kernel_h * kernel_w;
+    int h_in = h_out * stride_h - pad_h;
+    int w_in = w_out * stride_w - pad_w;
+    __global T* data_col_ptr = data_col;
+    data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+    __global const T* data_im_ptr = data_im;
+    data_im_ptr += (channel_in * height + h_in) * width + w_in;
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        int h = h_in + i;
+        int w = w_in + j;
+        *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+        data_im_ptr[i * width + j] : 0;
+        data_col_ptr += height_col * width_col;
+      }
+    }
+  }
 }
 
 template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
-		const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-		const int height_col, const int width_col, __global float* data_col, const int col_offset);
+    const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int height_col, const int width_col, __global float* data_col, const int col_offset);
 template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
-		const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-		const int height_col, const int width_col, __global double* data_col, const int col_offset);
+    const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int height_col, const int width_col, __global double* data_col, const int col_offset);
 
 template <class T>
 __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset,
-		const int height, const int width, const int channels,
-		const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		const int height_col, const int width_col,
-		__global T* data_im, const int img_offset) {
-	data_col = data_col + col_offset;
-	data_im = data_im + img_offset;
-	int index = get_global_id(0);
-	if(index < n) {
-		T val = 0;
-		int w = index % width + pad_w;
-		int h = (index / width) % height + pad_h;
-		int c = index / (width * height);
-		// compute the start and end of the output
-		int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-		int w_col_end = min(w / stride_w + 1, width_col);
-		int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-		int h_col_end = min(h / stride_h + 1, height_col);
-		// equivalent implementation
-		int offset =
-		(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
-		int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
-		int coeff_w_col = (1 - stride_w * height_col * width_col);
-		for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-			for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-				val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-			}
-		}
-		data_im[index] = val;
-	}
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    __global T* data_im, const int img_offset) {
+  data_col = data_col + col_offset;
+  data_im = data_im + img_offset;
+  int index = get_global_id(0);
+  if(index < n) {
+    T val = 0;
+    int w = index % width + pad_w;
+    int h = (index / width) % height + pad_h;
+    int c = index / (width * height);
+    // compute the start and end of the output
+    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+    int w_col_end = min(w / stride_w + 1, width_col);
+    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+    int h_col_end = min(h / stride_h + 1, height_col);
+    // equivalent implementation
+    int offset =
+    (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+    int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+    int coeff_w_col = (1 - stride_w * height_col * width_col);
+    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+      }
+    }
+    data_im[index] = val;
+  }
 }
 
 template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
-		const int height, const int width, const int channels,
-		const int patch_h, const int patch_w,const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,const int height_col, const int width_col,
-		__global float* data_im, const int img_offset);
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,const int height_col, const int width_col,
+    __global float* data_im, const int img_offset);
 template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
-		const int col_offset, const int height, const int width, const int channels,
-		const int patch_h, const int patch_w, const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
+    const int col_offset, const int height, const int width, const int channels,
+    const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
 
 template <class T>
 __kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset) {
-	int index = get_global_id(0);
-	data_col = data_col + col_offset;
-	data_im = data_im + img_offset;
-	if(index < n) {
-		T val = 0;
-		int w = index % width + pad;
-		int h = (index / width) % height + pad;
-		int c = index / (width * height);
-		// compute the start and end of the output
-		int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-		int w_col_end = min(w / stride + 1, width_col);
-		int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-		int h_col_end = min(h / stride + 1, height_col);
-		// equivalent implementation
-		int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
-		int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
-		int coeff_w_col = (1 - stride * height_col * width_col);
-		for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-			for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-				val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-			}
-		}
-		data_im[index] = val;
-	}
+  int index = get_global_id(0);
+  data_col = data_col + col_offset;
+  data_im = data_im + img_offset;
+  if(index < n) {
+    T val = 0;
+    int w = index % width + pad;
+    int h = (index / width) % height + pad;
+    int c = index / (width * height);
+    // compute the start and end of the output
+    int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+    int w_col_end = min(w / stride + 1, width_col);
+    int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+    int h_col_end = min(h / stride + 1, height_col);
+    // equivalent implementation
+    int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
+    int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
+    int coeff_w_col = (1 - stride * height_col * width_col);
+    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+      }
+    }
+    data_im[index] = val;
+  }
 }
 template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset);
 template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset);
 
 template <class T>
 __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) {
-	int index = get_global_id(0);
-	data_col = data_col + col_offset;
-	data_im = data_im + img_offset;
-	if(index < n) {
-		T val = 0;
-		int w = index % width + pad;
-		int h = (index / width) % height + pad;
-		int c = index / (width * height) % channels;
-		int im = index / width / height / channels;
-		// compute the start and end of the output
-		int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-		int w_col_end = min(w / stride + 1, width_col);
-		int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-		int h_col_end = min(h / stride + 1, height_col);
-		// equivalent implementation
-		int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
-		int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
-		int coeff_w_col = (1 - stride * height_col * width_col * optnum);
-		for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-			for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-				val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-			}
-		}
-		data_im[index] = val;
-	}
+  int index = get_global_id(0);
+  data_col = data_col + col_offset;
+  data_im = data_im + img_offset;
+  if(index < n) {
+    T val = 0;
+    int w = index % width + pad;
+    int h = (index / width) % height + pad;
+    int c = index / (width * height) % channels;
+    int im = index / width / height / channels;
+    // compute the start and end of the output
+    int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
+    int w_col_end = min(w / stride + 1, width_col);
+    int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
+    int h_col_end = min(h / stride + 1, height_col);
+    // equivalent implementation
+    int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
+    int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
+    int coeff_w_col = (1 - stride * height_col * width_col * optnum);
+    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+      }
+    }
+    data_im[index] = val;
+  }
 }
 template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum);
 template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum);
@@ -245,46 +245,46 @@ template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_o
 template <class T>
 __kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum) {
 
-	int index = get_global_id(0);
-	data_opt = data_opt + opt_offset;
-	data_im = data_im + im_offset;
-	if(index < n) {
-		int w = index % width;
-		int h = (index / width) % height;
-		int c = index / (width * height) % channels;
-		int im = index / width / height / channels;
+  int index = get_global_id(0);
+  data_opt = data_opt + opt_offset;
+  data_im = data_im + im_offset;
+  if(index < n) {
+    int w = index % width;
+    int h = (index / width) % height;
+    int c = index / (width * height) % channels;
+    int im = index / width / height / channels;
 
-		int opt_index = c * height * optnum * width + h * optnum * width + im * width + w;
-		data_opt[opt_index] = data_im[index];
-	}
+    int opt_index = c * height * optnum * width + h * optnum * width + im * width + w;
+    data_opt[opt_index] = data_im[index];
+  }
 }
 template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum);
 template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum);
 
 template <class T>
 __kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum) {
-	int gidx = get_global_id(0);
-	int gidy = get_global_id(1);
-	int gidyy = gidy;
-	int index = gidy / height;
-	int offset = index * width * height;
-	gidy = gidy % height;
-	if( gidx < width && gidyy < height * optnum )
-	dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
+  int gidx = get_global_id(0);
+  int gidy = get_global_id(1);
+  int gidyy = gidy;
+  int index = gidy / height;
+  int offset = index * width * height;
+  gidy = gidy % height;
+  if( gidx < width && gidyy < height * optnum )
+  dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx];
 }
 template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum);
 template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum);
 
 template <class T>
 __kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum) {
-	int gidx = get_global_id(0);
-	int index;
-	index = (optnum==1) ? 0: gidx % optnum;
-	dst = dst + top_offset; // now we point at (*top)[n]
-	int offset = gidx / optnum;
-	int i = 0;
-	for(i = 0; i < width; i++)
-	dst[(index * height + offset)* width + i] = src[gidx * width + i];
+  int gidx = get_global_id(0);
+  int index;
+  index = (optnum==1) ? 0: gidx % optnum;
+  dst = dst + top_offset; // now we point at (*top)[n]
+  int offset = gidx / optnum;
+  int i = 0;
+  for(i = 0; i < width; i++)
+  dst[(index * height + offset)* width + i] = src[gidx * width + i];
 }
 template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum);
 template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum);
diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl
index e9938966..67eed4ae 100644
--- a/src/caffe/ocl/lrn_layer.cl
+++ b/src/caffe/ocl/lrn_layer.cl
@@ -26,113 +26,113 @@
 
 template <class T>
 __kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
-	int index = get_global_id(0);
-	int tmp = get_global_size(0);
-	for(index; index < nthreads; index += tmp)
-	out[index] = in[index] * pow(scale[index], negative_beta);
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp)
+  out[index] = in[index] * pow(scale[index], negative_beta);
 }
 template __attribute__((mangled_name(LRNComputeOutput_float))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out);
 template __attribute__((mangled_name(LRNComputeOutput_double))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out);
 
 template <class T>
 __kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {
-	int index = get_global_id(0);
-	int tmp = get_global_size(0);
-	for(index; index < nthreads; index += tmp) {
-		// find out the local offset
-		const int w = index % width;
-		const int h = (index / width) % height;
-		const int n = index / width / height;
-		const int offset = (n * channels * height + h) * width + w;
-		const int step = height * width;
-		in = in + offset;
-		scale = scale + offset;
-		int head = 0;
-		const int pre_pad = (size - 1) / 2;
-		const int post_pad = size - pre_pad - 1;
-		T accum_scale = 0;
-		// fill the scale at [n, :, h, w]
-		// accumulate values
-		while (head < post_pad && head < channels) {
-			accum_scale += in[head * step] * in[head * step];
-			++head;
-		}
-		// both add and subtract
-		while (head < channels) {
-			accum_scale += in[head * step] * in[head * step];
-			if (head - size >= 0) {
-				accum_scale -= in[(head - size) * step]
-				* in[(head - size) * step];
-			}
-			scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-			++head;
-		}
-		// subtract only
-		while (head < channels + post_pad) {
-			if (head - size >= 0) {
-				accum_scale -= in[(head - size) * step]
-				* in[(head - size) * step];
-			}
-			scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
-			++head;
-		}
-	}
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
+    in = in + offset;
+    scale = scale + offset;
+    int head = 0;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+    T accum_scale = 0;
+    // fill the scale at [n, :, h, w]
+    // accumulate values
+    while (head < post_pad && head < channels) {
+      accum_scale += in[head * step] * in[head * step];
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_scale += in[head * step] * in[head * step];
+      if (head - size >= 0) {
+        accum_scale -= in[(head - size) * step]
+        * in[(head - size) * step];
+      }
+      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      if (head - size >= 0) {
+        accum_scale -= in[(head - size) * step]
+        * in[(head - size) * step];
+      }
+      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+      ++head;
+    }
+  }
 }
 template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale);
 template __attribute__((mangled_name(LRNFillScale_double))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale);
 
 template <class T>
 __kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) {
-	int index = get_global_id(0);
-	int tmp = get_global_size(0);
-	for(index; index < nthreads; index += tmp) {
-		const int w = index % width;
-		const int h = (index / width) % height;
-		const int n = index / width / height;
-		const int offset = (n * channels * height + h) * width + w;
-		const int step = height * width;
-		bottom_data += offset;
-		top_data += offset;
-		scale += offset;
-		top_diff += offset;
-		bottom_diff += offset;
-		int head = 0;
-		const int pre_pad = size - (size + 1) / 2;
-		const int post_pad = size - pre_pad - 1;
-		T accum_ratio = 0;
-		// accumulate values
-		while (head < post_pad && head < channels) {
-			accum_ratio += top_diff[head * step] * top_data[head * step] /
-			scale[head * step];
-			++head;
-		}
-		// both add and subtract
-		while (head < channels) {
-			accum_ratio += top_diff[head * step] * top_data[head * step] /
-			scale[head * step];
-			if (head - size >= 0) {
-				accum_ratio -= top_diff[(head - size) * step] *
-				top_data[(head - size) * step] / scale[(head - size) * step];
-			}
-			bottom_diff[(head - post_pad) * step] =
-			top_diff[(head - post_pad) * step]
-			* pow(scale[(head - post_pad) * step], negative_beta)
-			- cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
-			++head;
-		}
-		// subtract only
-		while (head < channels + post_pad) {
-			if (head - size >= 0) {
-				accum_ratio -= top_diff[(head - size) * step] *
-				top_data[(head - size) * step] / scale[(head - size) * step];
-			}
-			bottom_diff[(head - post_pad) * step] =
-			top_diff[(head - post_pad) * step]
-			* pow(scale[(head - post_pad) * step], negative_beta)
-			- cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
-			++head;
-		}
-	}
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
+    bottom_data += offset;
+    top_data += offset;
+    scale += offset;
+    top_diff += offset;
+    bottom_diff += offset;
+    int head = 0;
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+    T accum_ratio = 0;
+    // accumulate values
+    while (head < post_pad && head < channels) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+      scale[head * step];
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_ratio += top_diff[head * step] * top_data[head * step] /
+      scale[head * step];
+      if (head - size >= 0) {
+        accum_ratio -= top_diff[(head - size) * step] *
+        top_data[(head - size) * step] / scale[(head - size) * step];
+      }
+      bottom_diff[(head - post_pad) * step] =
+      top_diff[(head - post_pad) * step]
+      * pow(scale[(head - post_pad) * step], negative_beta)
+      - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      if (head - size >= 0) {
+        accum_ratio -= top_diff[(head - size) * step] *
+        top_data[(head - size) * step] / scale[(head - size) * step];
+      }
+      bottom_diff[(head - post_pad) * step] =
+      top_diff[(head - post_pad) * step]
+      * pow(scale[(head - post_pad) * step], negative_beta)
+      - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio;
+      ++head;
+    }
+  }
 }
 
 template __attribute__((mangled_name(LRNComputeDiff_float))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff);
diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl
index 786ddc16..49a1413a 100644
--- a/src/caffe/ocl/pooling_layer.cl
+++ b/src/caffe/ocl/pooling_layer.cl
@@ -26,68 +26,68 @@
 
 template <class T>
 __kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask) {
-	int index = get_global_id(0);
-	int tmp = get_global_size(0);
-	for(index; index < nthreads; index += tmp) {
-		int pw = index % pooled_width;
-		int ph = (index / pooled_width) % pooled_height;
-		int c = (index / pooled_width / pooled_height) % channels;
-		int n = index / pooled_width / pooled_height / channels;
-		int hstart = ph * stride_h - pad_h;
-		int wstart = pw * stride_w - pad_w;
-		const int hend = min(hstart + kernel_h, height);
-		const int wend = min(wstart + kernel_w, width);
-		hstart = max(hstart, 0);
-		wstart = max(wstart, 0);
-		T maxval = -FLT_MAX;
-		int maxidx = -1;
-		bottom_data =
-		bottom_data + (n * channels + c) * height * width;
-		for (int h = hstart; h < hend; ++h) {
-			for (int w = wstart; w < wend; ++w) {
-				if (bottom_data[h * width + w] > maxval) {
-					maxidx = h * width + w;
-					maxval = bottom_data[maxidx];
-				}
-			}
-		}
-		top_data[index] = maxval;
-		if (mask) {
-			mask[index] = maxidx;
-		} else {
-			top_mask[index] = maxidx;
-		}
-	}
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    const int hend = min(hstart + kernel_h, height);
+    const int wend = min(wstart + kernel_w, width);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T maxval = -FLT_MAX;
+    int maxidx = -1;
+    bottom_data =
+    bottom_data + (n * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        if (bottom_data[h * width + w] > maxval) {
+          maxidx = h * width + w;
+          maxval = bottom_data[maxidx];
+        }
+      }
+    }
+    top_data[index] = maxval;
+    if (mask) {
+      mask[index] = maxidx;
+    } else {
+      top_mask[index] = maxidx;
+    }
+  }
 }
 template __attribute__((mangled_name(MaxPoolForward_float))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask);
 template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask);
 
 template <class T>
 __kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data) {
-	int index = get_global_id(0);
-	int tmp = get_global_size(0);
-	for(index; index < nthreads; index+=tmp) {
-		int pw = index % pooled_width;
-		int ph = (index / pooled_width) % pooled_height;
-		int c = (index / pooled_width / pooled_height) % channels;
-		int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w;
-		int hend = min(hstart + kernel_h, height + pad_h);
-		int wend = min(wstart + kernel_w, width + pad_w);
-		const int pool_size = (hend - hstart) * (wend - wstart);
-		hstart = max(hstart, 0);
-		wstart = max(wstart, 0);
-		hend = min(hend, height);
-		wend = min(wend, width);
-		T aveval = 0;
-		bottom_data =
-		bottom_data + (n * channels + c) * height * width;
-		for (int h = hstart; h < hend; ++h) {
-			for (int w = wstart; w < wend; ++w) {
-				aveval += bottom_data[h * width + w];
-			}
-		}
-		top_data[index] = aveval / pool_size;
-	}
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index+=tmp) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w;
+    int hend = min(hstart + kernel_h, height + pad_h);
+    int wend = min(wstart + kernel_w, width + pad_w);
+    const int pool_size = (hend - hstart) * (wend - wstart);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    hend = min(hend, height);
+    wend = min(wend, width);
+    T aveval = 0;
+    bottom_data =
+    bottom_data + (n * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        aveval += bottom_data[h * width + w];
+      }
+    }
+    top_data[index] = aveval / pool_size;
+  }
 
 }
 template __attribute__((mangled_name(AvePoolForward_float))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data);
@@ -95,150 +95,150 @@ template __attribute__((mangled_name(AvePoolForward_double))) __kernel void AveP
 
 template <class T>
 __kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data) {
-	int index = get_global_id(0);
-	int tmp = get_global_size(0);
-	for(index; index < nthreads; index+=tmp) {
-		const int pw = index % pooled_width;
-		const int ph = (index / pooled_width) % pooled_height;
-		const int c = (index / pooled_width / pooled_height) % channels;
-		const int n = index / pooled_width / pooled_height / channels;
-		const int hstart = ph * stride_h;
-		const int hend = min(hstart + kernel_h, height);
-		const int wstart = pw * stride_w;
-		const int wend = min(wstart + kernel_w, width);
-		T cumsum = 0.;
-		bottom_data = bottom_data + (n * channels + c) * height * width;
-		// First pass: get sum
-		for (int h = hstart; h < hend; ++h) {
-			for (int w = wstart; w < wend; ++w) {
-				cumsum += bottom_data[h * width + w];
-			}
-		}
-		const float thres = rand_idx[index] * cumsum;
-		// Second pass: get value, and set index.
-		cumsum = 0;
-		for (int h = hstart; h < hend; ++h) {
-			for (int w = wstart; w < wend; ++w) {
-				cumsum += bottom_data[h * width + w];
-				if (cumsum >= thres) {
-					rand_idx[index] = ((n * channels + c) * height + h) * width + w;
-					top_data[index] = bottom_data[h * width + w];
-					return;
-				}
-			}
-		}
-	}
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index+=tmp) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * stride_h;
+    const int hend = min(hstart + kernel_h, height);
+    const int wstart = pw * stride_w;
+    const int wend = min(wstart + kernel_w, width);
+    T cumsum = 0.;
+    bottom_data = bottom_data + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_data[h * width + w];
+      }
+    }
+    const float thres = rand_idx[index] * cumsum;
+    // Second pass: get value, and set index.
+    cumsum = 0;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_data[h * width + w];
+        if (cumsum >= thres) {
+          rand_idx[index] = ((n * channels + c) * height + h) * width + w;
+          top_data[index] = bottom_data[h * width + w];
+          return;
+        }
+      }
+    }
+  }
 }
 template __attribute__((mangled_name(StoPoolForwardTrain_float))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data);
 template __attribute__((mangled_name(StoPoolForwardTrain_double))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data);
 
 template <class T>
 __kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data) {
-	int index = get_global_id(0);
-	int tmp = get_global_size(0);
-	for(index; index < count; index+=tmp) {
-		const int pw = index % pooled_width;
-		const int ph = (index / pooled_width) % pooled_height;
-		const int c = (index / pooled_width / pooled_height) % channels;
-		const int n = index / pooled_width / pooled_height / channels;
-		const int hstart = ph * stride_h;
-		const int hend = min(hstart + kernel_h, height);
-		const int wstart = pw * stride_w;
-		const int wend = min(wstart + kernel_w, width);
-		// We set cumsum to be 0 to avoid divide-by-zero problems    T cumsum = FLT_MIN;
-		T cumsum = FLT_MIN;
-		T cumvalues = 0.;
-		bottom_data = bottom_data + (n * channels + c) * height * width;
-		// First pass: get sum
-		for (int h = hstart; h < hend; ++h) {
-			for (int w = wstart; w < wend; ++w) {
-				cumsum += bottom_data[h * width + w];
-				cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w];
-			}
-		}
-		top_data[index] = cumvalues / cumsum;}
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < count; index+=tmp) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * stride_h;
+    const int hend = min(hstart + kernel_h, height);
+    const int wstart = pw * stride_w;
+    const int wend = min(wstart + kernel_w, width);
+    // We set cumsum to be 0 to avoid divide-by-zero problems    T cumsum = FLT_MIN;
+    T cumsum = FLT_MIN;
+    T cumvalues = 0.;
+    bottom_data = bottom_data + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        cumsum += bottom_data[h * width + w];
+        cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w];
+      }
+    }
+    top_data[index] = cumvalues / cumsum;}
 }
 template __attribute__((mangled_name(StoPoolForwardTest_float))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data);
 template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data);
 
 template <class T>
 __kernel void MaxPoolBackward(const int nthreads, __global T* top_diff,
-		__global int* mask, __global T* top_mask, const int num,
-		const int channels, const int height, const int width,
-		const int pooled_height, const int pooled_width, const int kernel_h,
-		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-		const int pad_w, __global T* const bottom_diff) {
-	int index = get_global_id(0);
-	int total = get_global_size(0);
-	for(index; index < nthreads; index += total) {
-		// find out the local index
-		// find out the local offset
-		const int w = index % width;
-		const int h = (index / width) % height;
-		const int c = (index / width / height) % channels;
-		const int n = index / width / height / channels;
-		const int phstart =
-		(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
-		const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
-		const int pwstart =
-		(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
-		const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
-		T gradient = 0;
-		const int offset = (n * channels + c) * pooled_height * pooled_width;
-		top_diff += offset;
-		if (mask) {
-			mask = mask + offset;
-			for (int ph = phstart; ph < phend; ++ph) {
-				for (int pw = pwstart; pw < pwend; ++pw) {
-					if (mask[ph * pooled_width + pw] == h * width + w) {
-						gradient += top_diff[ph * pooled_width + pw];
-					}
-				}
-			}
-		} else {
-			top_mask = top_mask + offset;
-			for (int ph = phstart; ph < phend; ++ph) {
-				for (int pw = pwstart; pw < pwend; ++pw) {
-					if (top_mask[ph * pooled_width + pw] == h * width + w) {
-						gradient += top_diff[ph * pooled_width + pw];
-					}
-				}
-			}
-		}
-		bottom_diff[index] = gradient;
-	}
+    __global int* mask, __global T* top_mask, const int num,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, __global T* const bottom_diff) {
+  int index = get_global_id(0);
+  int total = get_global_size(0);
+  for(index; index < nthreads; index += total) {
+    // find out the local index
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int c = (index / width / height) % channels;
+    const int n = index / width / height / channels;
+    const int phstart =
+    (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
+    const int phend = min((h + pad_h) / stride_h + 1, pooled_height);
+    const int pwstart =
+    (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
+    const int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
+    T gradient = 0;
+    const int offset = (n * channels + c) * pooled_height * pooled_width;
+    top_diff += offset;
+    if (mask) {
+      mask = mask + offset;
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (mask[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff[ph * pooled_width + pw];
+          }
+        }
+      }
+    } else {
+      top_mask = top_mask + offset;
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (top_mask[ph * pooled_width + pw] == h * width + w) {
+            gradient += top_diff[ph * pooled_width + pw];
+          }
+        }
+      }
+    }
+    bottom_diff[index] = gradient;
+  }
 }
 template __attribute__((mangled_name(MaxPoolBackward_float))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
 template __attribute__((mangled_name(MaxPoolBackward_double))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff);
 
 template <class T>
 __kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff) {
-	int index = get_global_id(0);
-	int total = get_global_size(0);
-	for(index; index < nthreads; index += total) {
-		int w = index % width + pad_w;
-		int h = (index / width) % height + pad_h;
-		int c = (index / width / height) % channels;
-		int n = index / width / height / channels;
-		const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-		const int phend = min(h / stride_h + 1, pooled_height);
-		const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-		const int pwend = min(w / stride_w + 1, pooled_width);
-		T gradient = 0;
-		top_diff += (n * channels + c) * pooled_height * pooled_width;
-		for (int ph = phstart; ph < phend; ++ph) {
-			for (int pw = pwstart; pw < pwend; ++pw) {
-				// figure out the pooling size
-				int hstart = ph * stride_h - pad_h;
-				int wstart = pw * stride_w - pad_w;
-				int hend = min(hstart + kernel_h, height + pad_h);
-				int wend = min(wstart + kernel_w, width + pad_w);
-				int pool_size = (hend - hstart) * (wend - wstart);
-				gradient += top_diff[ph * pooled_width + pw] / pool_size;
-			}
-		}
-		bottom_diff[index] = gradient;
-	}
+  int index = get_global_id(0);
+  int total = get_global_size(0);
+  for(index; index < nthreads; index += total) {
+    int w = index % width + pad_w;
+    int h = (index / width) % height + pad_h;
+    int c = (index / width / height) % channels;
+    int n = index / width / height / channels;
+    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+    const int phend = min(h / stride_h + 1, pooled_height);
+    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+    const int pwend = min(w / stride_w + 1, pooled_width);
+    T gradient = 0;
+    top_diff += (n * channels + c) * pooled_height * pooled_width;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        // figure out the pooling size
+        int hstart = ph * stride_h - pad_h;
+        int wstart = pw * stride_w - pad_w;
+        int hend = min(hstart + kernel_h, height + pad_h);
+        int wend = min(wstart + kernel_w, width + pad_w);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        gradient += top_diff[ph * pooled_width + pw] / pool_size;
+      }
+    }
+    bottom_diff[index] = gradient;
+  }
 }
 
 template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff);
@@ -246,48 +246,48 @@ template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void Ave
 
 template <class Dtype>
 __kernel void StoPoolBackward(const int nthreads,
-		__global Dtype* rand_idx, __global Dtype* top_diff,
-		const int num, const int channels, const int height,
-		const int width, const int pooled_height, const int pooled_width,
-		const int kernel_h, const int kernel_w, const int stride_h,
-		const int stride_w, __global Dtype* bottom_diff) {
-	int index = get_global_id(0);
-	int total = get_global_size(0);
-	for(index; index < nthreads; index += total) {
-		// find out the local index
-		// find out the local offset
-		const int w = index % width;
-		const int h = (index / width) % height;
-		const int c = (index / width / height) % channels;
-		const int n = index / width / height / channels;
-		const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
-		const int phend = min(h / stride_h + 1, pooled_height);
-		const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
-		const int pwend = min(w / stride_w + 1, pooled_width);
-		Dtype gradient = 0;
-		rand_idx =
-		rand_idx + (n * channels + c) * pooled_height * pooled_width;
-		top_diff =
-		top_diff + (n * channels + c) * pooled_height * pooled_width;
-		for (int ph = phstart; ph < phend; ++ph) {
-			for (int pw = pwstart; pw < pwend; ++pw) {
-				gradient += top_diff[ph * pooled_width + pw] *
-				(index == static_cast<int>(rand_idx[ph * pooled_width + pw]));
-			}
-		}
-		bottom_diff[index] = gradient;
+    __global Dtype* rand_idx, __global Dtype* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global Dtype* bottom_diff) {
+  int index = get_global_id(0);
+  int total = get_global_size(0);
+  for(index; index < nthreads; index += total) {
+    // find out the local index
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int c = (index / width / height) % channels;
+    const int n = index / width / height / channels;
+    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+    const int phend = min(h / stride_h + 1, pooled_height);
+    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+    const int pwend = min(w / stride_w + 1, pooled_width);
+    Dtype gradient = 0;
+    rand_idx =
+    rand_idx + (n * channels + c) * pooled_height * pooled_width;
+    top_diff =
+    top_diff + (n * channels + c) * pooled_height * pooled_width;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        gradient += top_diff[ph * pooled_width + pw] *
+        (index == static_cast<int>(rand_idx[ph * pooled_width + pw]));
+      }
+    }
+    bottom_diff[index] = gradient;
 
-	}
+  }
 }
 template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads,
-		__global float* rand_idx, __global float* top_diff,
-		const int num, const int channels, const int height,
-		const int width, const int pooled_height, const int pooled_width,
-		const int kernel_h, const int kernel_w, const int stride_h,
-		const int stride_w, __global float* bottom_diff);
+    __global float* rand_idx, __global float* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global float* bottom_diff);
 template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads,
-		__global double* rand_idx, __global double* top_diff,
-		const int num, const int channels, const int height,
-		const int width, const int pooled_height, const int pooled_width,
-		const int kernel_h, const int kernel_w, const int stride_h,
-		const int stride_w, __global double* bottom_diff);
+    __global double* rand_idx, __global double* top_diff,
+    const int num, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, __global double* bottom_diff);
diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl
index de46a5da..5e8c521f 100644
--- a/src/caffe/ocl/prelu_layer.cl
+++ b/src/caffe/ocl/prelu_layer.cl
@@ -26,35 +26,35 @@
 
 template <class T>
 __kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) {
-	int index = get_global_id(0);
-	if(index < count) {
-		int c = (index / dim) % channels / div_factor;
-		out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    int c = (index / dim) % channels / div_factor;
+    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
+  }
 }
 template __attribute__ ((mangled_name(PReLUForward_float))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global float* in, __global float* out, __global float* slope_data, const int div_factor);
 template __attribute__ ((mangled_name(PReLUForward_double))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global double* in, __global double* out, __global double* slope_data, const int div_factor);
 
 template <class T>
 __kernel void PReLUBackward(const int count, const int channels, const int dim, __global T* in_diff, __global T* in_data, __global T* out_diff, __global T* slope_data, const int div_factor) {
-	int index = get_global_id(0);
-	if(index < count) {
-		int c = (index / dim) % channels / div_factor;
-		out_diff[index] = in_diff[index] * ((in_data[index] > 0)
-				+ (in_data[index] <= 0) * slope_data[c]);
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    int c = (index / dim) % channels / div_factor;
+    out_diff[index] = in_diff[index] * ((in_data[index] > 0)
+        + (in_data[index] <= 0) * slope_data[c]);
+  }
 }
 template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor);
 template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor);
 
 template <class T>
 __kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_out, __global T* in_data, const int offset_in, __global T* out_diff) {
-	int index = get_global_id(0);
-	if(index < count) {
-		in_diff += offset_out;
-		out_diff += offset_in;
-		out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    in_diff += offset_out;
+    out_diff += offset_in;
+    out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
+  }
 }
 template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out, __global float* in_data, const int offset_in, __global float* out_diff);
 template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_out, __global double* in_data, const int offset_in, __global double* out_diff);
diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl
index 438931ec..7f8bc5b3 100644
--- a/src/caffe/ocl/random.cl
+++ b/src/caffe/ocl/random.cl
@@ -30,720 +30,817 @@
 //we use the open sourced threefry's GPU implementation
 typedef uint uint32_t;
 
-struct r123array4x32 {	uint32_t v[4]; };
-
-enum r123_enum_threefry32x4 
-{
-	R_32x4_0_0 = 10, R_32x4_0_1 = 26,
-	R_32x4_1_0 = 11, R_32x4_1_1 = 21,
-	R_32x4_2_0 = 13, R_32x4_2_1 = 27,
-	R_32x4_3_0 = 23, R_32x4_3_1 =  5,
-	R_32x4_4_0 =  6, R_32x4_4_1 = 20,
-	R_32x4_5_0 = 17, R_32x4_5_1 = 11,
-	R_32x4_6_0 = 25, R_32x4_6_1 = 10,
-	R_32x4_7_0 = 18, R_32x4_7_1 = 20
+struct r123array4x32 {
+    uint32_t v[4];
+};
+
+enum r123_enum_threefry32x4 {
+  R_32x4_0_0 = 10,
+  R_32x4_0_1 = 26,
+  R_32x4_1_0 = 11,
+  R_32x4_1_1 = 21,
+  R_32x4_2_0 = 13,
+  R_32x4_2_1 = 27,
+  R_32x4_3_0 = 23,
+  R_32x4_3_1 = 5,
+  R_32x4_4_0 = 6,
+  R_32x4_4_1 = 20,
+  R_32x4_5_0 = 17,
+  R_32x4_5_1 = 11,
+  R_32x4_6_0 = 25,
+  R_32x4_6_1 = 10,
+  R_32x4_7_0 = 18,
+  R_32x4_7_1 = 20
 };
 
-inline uint32_t	RotL_32(uint32_t x, unsigned int N)__attribute__((always_inline));
 inline uint32_t RotL_32(uint32_t x, unsigned int N)
-{
-	return (x << (N & 31)) | (x >> ((32 - N) & 31));
+    __attribute__((always_inline));
+inline uint32_t RotL_32(uint32_t x, unsigned int N) {
+  return (x << (N & 31)) | (x >> ((32 - N) & 31));
 }
 
 typedef struct r123array4x32 threefry4x32_ctr_t;
 typedef struct r123array4x32 threefry4x32_key_t;
 typedef struct r123array4x32 threefry4x32_ukey_t;
 
-inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)__attribute__((always_inline));
-inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k)
-{
-	threefry4x32_ctr_t	X;
-	uint32_t			ks[4 + 1];
-	int					i;
-	ks[4] = 0x1BD11BDA;
-	/*
-	for (i = 0; i < 4; i++)
-	{
-		ks[i] = k.v[i];
-		X.v[i] = in.v[i];
-		ks[4] ^= k.v[i];
-	}*/ 
-	{
-		ks[0] = k.v[0];
-		X.v[0] = in.v[0];
-		ks[4] ^= k.v[0];
-
-		ks[1] = k.v[1];
-		X.v[1] = in.v[1];
-		ks[4] ^= k.v[1];
-
-		ks[2] = k.v[2];
-		X.v[2] = in.v[2];
-		ks[4] ^= k.v[2];
-
-		ks[3] = k.v[3];
-		X.v[3] = in.v[3];
-		ks[4] ^= k.v[3];
-	}
-	X.v[0] += ks[0];
-	X.v[1] += ks[1];
-	X.v[2] += ks[2];
-	X.v[3] += ks[3];
-	if (Nrounds > 0) 
-	{
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 1) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 2) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 3) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 3) {
-		X.v[0] += ks[1];
-		X.v[1] += ks[2];
-		X.v[2] += ks[3];
-		X.v[3] += ks[4];
-		X.v[4 - 1] += 1;
-	} if (Nrounds > 4) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 5) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 6) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 7) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 7) {
-		X.v[0] += ks[2];
-		X.v[1] += ks[3];
-		X.v[2] += ks[4];
-		X.v[3] += ks[0];
-		X.v[4 - 1] += 2;
-	} if (Nrounds > 8) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 9) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 10) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 11) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 11) {
-		X.v[0] += ks[3];
-		X.v[1] += ks[4];
-		X.v[2] += ks[0];
-		X.v[3] += ks[1];
-		X.v[4 - 1] += 3;
-	} if (Nrounds > 12) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 13) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 14) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 15) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 15) {
-		X.v[0] += ks[4];
-		X.v[1] += ks[0];
-		X.v[2] += ks[1];
-		X.v[3] += ks[2];
-		X.v[4 - 1] += 4;
-	} if (Nrounds > 16) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 17) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 18) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 19) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 19) {
-		X.v[0] += ks[0];
-		X.v[1] += ks[1];
-		X.v[2] += ks[2];
-		X.v[3] += ks[3];
-		X.v[4 - 1] += 5;
-	} if (Nrounds > 20) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 21) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 22) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 23) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 23) {
-		X.v[0] += ks[1];
-		X.v[1] += ks[2];
-		X.v[2] += ks[3];
-		X.v[3] += ks[4];
-		X.v[4 - 1] += 6;
-	} if (Nrounds > 24) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 25) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 26) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 27) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 27) {
-		X.v[0] += ks[2];
-		X.v[1] += ks[3];
-		X.v[2] += ks[4];
-		X.v[3] += ks[0];
-		X.v[4 - 1] += 7;
-	} if (Nrounds > 28) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 29) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 30) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 31) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 31) {
-		X.v[0] += ks[3];
-		X.v[1] += ks[4];
-		X.v[2] += ks[0];
-		X.v[3] += ks[1];
-		X.v[4 - 1] += 8;
-	} if (Nrounds > 32) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 33) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 34) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 35) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 35) {
-		X.v[0] += ks[4];
-		X.v[1] += ks[0];
-		X.v[2] += ks[1];
-		X.v[3] += ks[2];
-		X.v[4 - 1] += 9;
-	} if (Nrounds > 36) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 37) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 38) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 39) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 39) {
-		X.v[0] += ks[0];
-		X.v[1] += ks[1];
-		X.v[2] += ks[2];
-		X.v[3] += ks[3];
-		X.v[4 - 1] += 10;
-	} if (Nrounds > 40) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 41) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 42) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 43) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 43) {
-		X.v[0] += ks[1];
-		X.v[1] += ks[2];
-		X.v[2] += ks[3];
-		X.v[3] += ks[4];
-		X.v[4 - 1] += 11;
-	} if (Nrounds > 44) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 45) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 46) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 47) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 47) {
-		X.v[0] += ks[2];
-		X.v[1] += ks[3];
-		X.v[2] += ks[4];
-		X.v[3] += ks[0];
-		X.v[4 - 1] += 12;
-	} if (Nrounds > 48) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 49) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 50) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 51) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 51) {
-		X.v[0] += ks[3];
-		X.v[1] += ks[4];
-		X.v[2] += ks[0];
-		X.v[3] += ks[1];
-		X.v[4 - 1] += 13;
-	} if (Nrounds > 52) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 53) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 54) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 55) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 55) {
-		X.v[0] += ks[4];
-		X.v[1] += ks[0];
-		X.v[2] += ks[1];
-		X.v[3] += ks[2];
-		X.v[4 - 1] += 14;
-	} if (Nrounds > 56) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 57) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 58) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 59) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 59) {
-		X.v[0] += ks[0];
-		X.v[1] += ks[1];
-		X.v[2] += ks[2];
-		X.v[3] += ks[3];
-		X.v[4 - 1] += 15;
-	} if (Nrounds > 60) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 61) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 62) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 63) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 63) {
-		X.v[0] += ks[1];
-		X.v[1] += ks[2];
-		X.v[2] += ks[3];
-		X.v[3] += ks[4];
-		X.v[4 - 1] += 16;
-	} if (Nrounds > 64) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 65) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 66) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 67) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 67) {
-		X.v[0] += ks[2];
-		X.v[1] += ks[3];
-		X.v[2] += ks[4];
-		X.v[3] += ks[0];
-		X.v[4 - 1] += 17;
-	} if (Nrounds > 68) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 69) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 70) {
-		X.v[0] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
-		X.v[1] ^= X.v[0];
-		X.v[2] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
-		X.v[3] ^= X.v[2];
-	} if (Nrounds > 71) {
-		X.v[0] += X.v[3];
-		X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
-		X.v[3] ^= X.v[0];
-		X.v[2] += X.v[1];
-		X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
-		X.v[1] ^= X.v[2];
-	} if (Nrounds > 71) {
-		X.v[0] += ks[3];
-		X.v[1] += ks[4];
-		X.v[2] += ks[0];
-		X.v[3] += ks[1];
-		X.v[4 - 1] += 18;
-	} 
-	return X;
-} 
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
+    threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline));
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
+    threefry4x32_ctr_t in, threefry4x32_key_t k) {
+  threefry4x32_ctr_t X;
+  uint32_t ks[4 + 1];
+  int i;
+  ks[4] = 0x1BD11BDA;
+  /*
+   for (i = 0; i < 4; i++)
+   {
+   ks[i] = k.v[i];
+   X.v[i] = in.v[i];
+   ks[4] ^= k.v[i];
+   }*/
+  {
+    ks[0] = k.v[0];
+    X.v[0] = in.v[0];
+    ks[4] ^= k.v[0];
+
+    ks[1] = k.v[1];
+    X.v[1] = in.v[1];
+    ks[4] ^= k.v[1];
+
+    ks[2] = k.v[2];
+    X.v[2] = in.v[2];
+    ks[4] ^= k.v[2];
+
+    ks[3] = k.v[3];
+    X.v[3] = in.v[3];
+    ks[4] ^= k.v[3];
+  }
+  X.v[0] += ks[0];
+  X.v[1] += ks[1];
+  X.v[2] += ks[2];
+  X.v[3] += ks[3];
+  if (Nrounds > 0) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 1) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 2) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 3) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 3) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 1;
+  }
+  if (Nrounds > 4) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 5) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 6) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 7) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 7) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 2;
+  }
+  if (Nrounds > 8) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 9) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 10) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 11) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 11) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 3;
+  }
+  if (Nrounds > 12) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 13) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 14) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 15) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 15) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 4;
+  }
+  if (Nrounds > 16) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 17) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 18) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 19) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 19) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 5;
+  }
+  if (Nrounds > 20) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 21) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 22) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 23) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 23) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 6;
+  }
+  if (Nrounds > 24) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 25) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 26) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 27) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 27) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 7;
+  }
+  if (Nrounds > 28) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 29) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 30) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 31) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 31) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 8;
+  }
+  if (Nrounds > 32) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 33) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 34) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 35) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 35) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 9;
+  }
+  if (Nrounds > 36) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 37) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 38) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 39) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 39) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 10;
+  }
+  if (Nrounds > 40) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 41) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 42) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 43) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 43) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 11;
+  }
+  if (Nrounds > 44) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 45) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 46) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 47) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 47) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 12;
+  }
+  if (Nrounds > 48) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 49) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 50) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 51) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 51) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 13;
+  }
+  if (Nrounds > 52) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 53) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 54) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 55) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 55) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 14;
+  }
+  if (Nrounds > 56) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 57) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 58) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 59) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 59) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 15;
+  }
+  if (Nrounds > 60) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 61) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 62) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 63) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 63) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 16;
+  }
+  if (Nrounds > 64) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 65) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 66) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 67) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 67) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 17;
+  }
+  if (Nrounds > 68) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 69) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 70) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+  if (Nrounds > 71) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 71) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 18;
+  }
+  return X;
+}
 
 template <class T>
 __kernel void PRNG_threefry4x32_bernoulli(
-        __global uint4 *randomnumber,
-        threefry4x32_ctr_t ctr_i,
-        T inf,
-        T sup,
-        T threshold,
-        uint nrounds,
-        uint numrandom
-){
-        size_t  gdx = get_global_id(0);
-
-        uint maxUint = 0;
-        maxUint--;
-        float r = (float)maxUint;
-
-        threefry4x32_ctr_t      ctr = ctr_i; 
-        threefry4x32_ukey_t ukey;
-
-        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
-
-        threefry4x32_ctr_t  random4;
-
-        if ( gdx < numrandom )
-        {
-                random4 = threefry4x32_R(nrounds, ctr, ukey);
-                uint4 frnd;
-				
-                frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-                frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
-				
-                randomnumber[gdx] = frnd;
-        }
+    __global uint4 *randomnumber,
+    threefry4x32_ctr_t ctr_i,
+    T inf,
+    T sup,
+    T threshold,
+    uint nrounds,
+    uint numrandom
+) {
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom )
+  {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    uint4 frnd;
+
+    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0;
+
+    randomnumber[gdx] = frnd;
+  }
 }
 
-
 template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandonm);
 
 template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandonm);
@@ -752,133 +849,130 @@ template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_t
 
 template <class T>
 __kernel void PRNG_threefry4x32_uniform(
-        __global float4 *randomnumber,
-        threefry4x32_ctr_t ctr_i,
-        T inf,
-        T sup,
-        uint nrounds,
-        uint numrandom
-){
-        size_t  gdx = get_global_id(0);
-
-        uint maxUint = 0;
-        maxUint--;
-        float r = (float)maxUint;
-
-        threefry4x32_ctr_t      ctr = ctr_i; 
-        threefry4x32_ukey_t ukey;
-
-        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
-
-        threefry4x32_ctr_t  random4;
-
-        if ( gdx < numrandom )
-        {
-                random4 = threefry4x32_R(nrounds, ctr, ukey);
-                float4 frnd;
-                frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf );
-                frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf );
-                frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf );
-                frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf );
-                randomnumber[gdx] = frnd;
-        }
+    __global float4 *randomnumber,
+    threefry4x32_ctr_t ctr_i,
+    T inf,
+    T sup,
+    uint nrounds,
+    uint numrandom
+) {
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom )
+  {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    float4 frnd;
+    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf );
+    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf );
+    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf );
+    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf );
+    randomnumber[gdx] = frnd;
+  }
 }
 
 template __attribute__((mangled_name(RNGUniform_float))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, uint nrounds, uint numrandonm);
 
 template __attribute__((mangled_name(RNGUniform_double))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, uint nrounds, uint numrandonm);
 
-
 __kernel void PRNG_threefry4x32_uint_uniform(
-        __global uint4 *randomnumber,
-        threefry4x32_ctr_t ctr_i,
-        uint inf,
-        uint sup,
-        uint nrounds,
-        uint numrandom
-){
-        size_t  gdx = get_global_id(0);
-
-        threefry4x32_ctr_t      ctr = ctr_i; 
-        threefry4x32_ukey_t ukey;
-
-        ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
-
-        threefry4x32_ctr_t  random4;
-
-        if ( gdx < numrandom )
-        {
-                random4 = threefry4x32_R(nrounds, ctr, ukey);
-                uint4 frnd;
-                frnd.x =  random4.v[0] % (sup - inf) + inf;
-                frnd.y =  random4.v[1] % (sup - inf) + inf;
-                frnd.z =  random4.v[2] % (sup - inf) + inf;
-                frnd.w =  random4.v[3] % (sup - inf) + inf;
-                randomnumber[gdx] = frnd;
-        }
+    __global uint4 *randomnumber,
+    threefry4x32_ctr_t ctr_i,
+    uint inf,
+    uint sup,
+    uint nrounds,
+    uint numrandom
+) {
+  size_t gdx = get_global_id(0);
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom )
+  {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    uint4 frnd;
+    frnd.x = random4.v[0] % (sup - inf) + inf;
+    frnd.y = random4.v[1] % (sup - inf) + inf;
+    frnd.z = random4.v[2] % (sup - inf) + inf;
+    frnd.w = random4.v[3] % (sup - inf) + inf;
+    randomnumber[gdx] = frnd;
+  }
 }
 
-
 template <class T>
 __kernel void PRNG_threefry4x32_gaussian(
-	__global float4 *randomnumber, 
-	threefry4x32_ctr_t ctr_i,
-	T E,
-	T V,
-	uint nrounds,
-	uint numrandom
-){
-	size_t	gdx = get_global_id(0);
-
-	uint maxUint = 0;
-	maxUint--;
-	float r = (float)maxUint;
-
-	threefry4x32_ctr_t	ctr = ctr_i; 
-	threefry4x32_ukey_t ukey1, ukey2;
-
-	ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;
-	ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;
-
-	threefry4x32_ctr_t  random1, random2;
-
-	if ( gdx < numrandom )
-	{
-		random1 = threefry4x32_R(nrounds, ctr, ukey1);
-		random2 = threefry4x32_R(nrounds, ctr, ukey2);
-		float4 frnd1;
-
-		float r1 = (((float)random1.v[0]) / r);          // generate a random sequence of uniform distribution
-		float r2 = (((float)random2.v[0]) / r);
-		float r3 = (((float)random1.v[1]) / r);
-		float r4 = (((float)random2.v[1]) / r);
-		float r5 = (((float)random1.v[2]) / r);
-		float r6 = (((float)random2.v[2]) / r);
-		float r7 = (((float)random1.v[3]) / r);
-		float r8 = (((float)random2.v[3]) / r);
-
-		if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0){
-			r2 += 0.0001;
-			r4 += 0.0001;
-			r6 += 0.0001;
-			r8 += 0.0001;
-		}
-
-		frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;     // return a pseudo sequence of normal distribution using two above uniform noise data
-		//frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
-		frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;     // return a pseudo sequence of normal distribution using two above uniform noise data
-		//frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
-		frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;     // return a pseudo sequence of normal distribution using two above uniform noise data
-		//frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
-		frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;     // return a pseudo sequence of normal distribution using two above uniform noise data
-		//frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
-
-		randomnumber[gdx] = frnd1;
-	}
+    __global float4 *randomnumber,
+    threefry4x32_ctr_t ctr_i,
+    T E,
+    T V,
+    uint nrounds,
+    uint numrandom
+) {
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey1, ukey2;
+
+  ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;
+  ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;
+
+  threefry4x32_ctr_t random1, random2;
+
+  if ( gdx < numrandom )
+  {
+    random1 = threefry4x32_R(nrounds, ctr, ukey1);
+    random2 = threefry4x32_R(nrounds, ctr, ukey2);
+    float4 frnd1;
+
+    float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution
+    float r2 = (((float)random2.v[0]) / r);
+    float r3 = (((float)random1.v[1]) / r);
+    float r4 = (((float)random2.v[1]) / r);
+    float r5 = (((float)random1.v[2]) / r);
+    float r6 = (((float)random2.v[2]) / r);
+    float r7 = (((float)random1.v[3]) / r);
+    float r8 = (((float)random2.v[3]) / r);
+
+    if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) {
+      r2 += 0.0001;
+      r4 += 0.0001;
+      r6 += 0.0001;
+      r8 += 0.0001;
+    }
+
+    frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data
+    //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
+    //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
+    //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
+    //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+
+    randomnumber[gdx] = frnd1;
+  }
 }
 
 template __attribute__((mangled_name(RNGGaussian_float))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float E, float V, uint nrounds, uint numrandonm);
 
 template __attribute__((mangled_name(RNGGaussian_double))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double E, double V, uint nrounds, uint numrandonm);
 
-
diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl
index cf9302d5..e39aa426 100644
--- a/src/caffe/ocl/relu_layer.cl
+++ b/src/caffe/ocl/relu_layer.cl
@@ -26,9 +26,9 @@
 
 template <class T>
 __kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope) {
-	int index = get_global_id(0);
-	if(index < count)
-	out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
+  int index = get_global_id(0);
+  if(index < count)
+  out[index] = in[index] > 0? in[index]:in[index]*negative_slope;
 }
 
 template __attribute__ ((mangled_name(ReLUForward_float))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope);
@@ -36,10 +36,10 @@ template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUFo
 
 template <class T>
 __kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope);
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope);
+  }
 }
 
 template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope);
diff --git a/src/caffe/ocl/sigmoid_layer.cl b/src/caffe/ocl/sigmoid_layer.cl
index a3a9345f..ac0ef9a9 100644
--- a/src/caffe/ocl/sigmoid_layer.cl
+++ b/src/caffe/ocl/sigmoid_layer.cl
@@ -26,9 +26,9 @@
 
 template <class T>
 __kernel void SigmoidForward(const int count, __global T* in, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count)
-	out[index] = 1. / (1. + exp(-in[index]));
+  int index = get_global_id(0);
+  if(index < count)
+  out[index] = 1. / (1. + exp(-in[index]));
 }
 
 template __attribute__ ((mangled_name(SigmoidForward_float))) __kernel void SigmoidForward(const int count, __global float* in, __global float* out);
@@ -36,10 +36,10 @@ template __attribute__ ((mangled_name(SigmoidForward_double))) __kernel void Sig
 
 template <class T>
 __kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) {
-	int index = get_global_id(0);
-	const T sigmoid_x = out_data[index];
-	if(index < count)
-	out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
+  int index = get_global_id(0);
+  const T sigmoid_x = out_data[index];
+  if(index < count)
+  out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
 }
 
 template __attribute__ ((mangled_name(SigmoidBackward_float))) __kernel void SigmoidBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff);
diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl
index 6fe0daab..207f0058 100644
--- a/src/caffe/ocl/softmax_layer.cl
+++ b/src/caffe/ocl/softmax_layer.cl
@@ -27,47 +27,47 @@
 template <class T>
 __kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch) {
 
-	int gid = get_global_id(0);
-	int size = get_global_size(0);
+  int gid = get_global_id(0);
+  int size = get_global_size(0);
 
-	resultScratch[gid] = 0.0;
-	for(int i = gid; i < num; i += size) {
-		resultScratch[gid] += -log(prob_data[i * dim + static_cast<int>(label[i])]);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+  resultScratch[gid] = 0.0;
+  for(int i = gid; i < num; i += size) {
+    resultScratch[gid] += -log(prob_data[i * dim + static_cast<int>(label[i])]);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
 
-	if(gid < 128)
-	resultScratch[gid] += resultScratch[gid + 128];
-	barrier(CLK_LOCAL_MEM_FENCE);
-	if(gid < 64)
-	resultScratch[gid] += resultScratch[gid + 64];
-	if(gid < 32)
-	resultScratch[gid] += resultScratch[gid + 32];
-	if(gid < 16)
-	resultScratch[gid] += resultScratch[gid + 16];
-	if(gid < 8)
-	resultScratch[gid] += resultScratch[gid + 8];
-	if(gid < 4)
-	resultScratch[gid] += resultScratch[gid + 4];
-	if(gid < 2)
-	resultScratch[gid] += resultScratch[gid + 2];
-	if(gid < 1) {
-		resultScratch[gid] += resultScratch[gid + 1];
-		loss[0] = resultScratch[gid];
-	}
+  if(gid < 128)
+  resultScratch[gid] += resultScratch[gid + 128];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if(gid < 64)
+  resultScratch[gid] += resultScratch[gid + 64];
+  if(gid < 32)
+  resultScratch[gid] += resultScratch[gid + 32];
+  if(gid < 16)
+  resultScratch[gid] += resultScratch[gid + 16];
+  if(gid < 8)
+  resultScratch[gid] += resultScratch[gid + 8];
+  if(gid < 4)
+  resultScratch[gid] += resultScratch[gid + 4];
+  if(gid < 2)
+  resultScratch[gid] += resultScratch[gid + 2];
+  if(gid < 1) {
+    resultScratch[gid] += resultScratch[gid + 1];
+    loss[0] = resultScratch[gid];
+  }
 }
 template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch);
 template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch);
 
 template <class T>
 __kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data) {
-	//printf("softmax_div\n");
-	int index = get_global_id(0);
-	int total = get_global_size(0);
-	for(index; index < num*dim; index += total) {
-		int n = index / dim;
-		data[index] /= scale[n];
-	}
+  //printf("softmax_div\n");
+  int index = get_global_id(0);
+  int total = get_global_size(0);
+  for(index; index < num*dim; index += total) {
+    int n = index / dim;
+    data[index] /= scale[n];
+  }
 }
 
 template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data);
@@ -75,97 +75,97 @@ template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softma
 
 template <class T>
 __kernel void kernel_channel_max(const int num, const int channels,
-		const int spatial_dim, __global const T* data, __global T* out) {
-	int index = get_global_id(0);
-	if(index < num * spatial_dim) {
-		int n = index / spatial_dim;
-		int s = index % spatial_dim;
-		T maxval = -FLT_MAX;
-		for (int c = 0; c < channels; ++c) {
-			maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
-		}
-		out[index] = maxval;
-	}
+    const int spatial_dim, __global const T* data, __global T* out) {
+  int index = get_global_id(0);
+  if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T maxval = -FLT_MAX;
+    for (int c = 0; c < channels; ++c) {
+      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
+    }
+    out[index] = maxval;
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels,
-		const int spatial_dim, __global const float* data, __global float* out);
+    const int spatial_dim, __global const float* data, __global float* out);
 template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels,
-		const int spatial_dim, __global const double* data, __global double* out);
+    const int spatial_dim, __global const double* data, __global double* out);
 
 template <class T>
 __kernel void kernel_channel_subtract(const int count,
-		const int num, const int channels,
-		const int spatial_dim, __global const T* channel_max, __global T* data) {
-	int index = get_global_id(0);
-	if(index < count) {
-		int n = index / channels / spatial_dim;
-		int s = index % spatial_dim;
-		data[index] -= channel_max[n * spatial_dim + s];
-	}
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_max, __global T* data) {
+  int index = get_global_id(0);
+  if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] -= channel_max[n * spatial_dim + s];
+  }
 }
 template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data);
 template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data);
 
 template <class T>
 __kernel void kernel_channel_sum(const int num, const int channels,
-		const int spatial_dim, __global const T* data, __global T* channel_sum) {
-	int index = get_global_id(0);
-	if(index < num * spatial_dim) {
-		int n = index / spatial_dim;
-		int s = index % spatial_dim;
-		T sum = 0;
-		for (int c = 0; c < channels; ++c) {
-			sum += data[(n * channels + c) * spatial_dim + s];
-		}
-		channel_sum[index] = sum;
-	}
+    const int spatial_dim, __global const T* data, __global T* channel_sum) {
+  int index = get_global_id(0);
+  if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T sum = 0;
+    for (int c = 0; c < channels; ++c) {
+      sum += data[(n * channels + c) * spatial_dim + s];
+    }
+    channel_sum[index] = sum;
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels,
-		const int spatial_dim, __global const float* data, __global float* channel_sum);
+    const int spatial_dim, __global const float* data, __global float* channel_sum);
 template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels,
-		const int spatial_dim, __global const double* data, __global double* channel_sum);
+    const int spatial_dim, __global const double* data, __global double* channel_sum);
 
 template <class T>
 __kernel void kernel_channel_div(const int count,
-		const int num, const int channels,
-		const int spatial_dim, __global const T* channel_sum, __global T* data) {
-	int index = get_global_id(0);
-	if(index < count) {
-		int n = index / channels / spatial_dim;
-		int s = index % spatial_dim;
-		data[index] /= channel_sum[n * spatial_dim + s];
-	}
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_sum, __global T* data) {
+  int index = get_global_id(0);
+  if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] /= channel_sum[n * spatial_dim + s];
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count,
-		const int num, const int channels,
-		const int spatial_dim, __global const float* channel_sum, __global float* data);
+    const int num, const int channels,
+    const int spatial_dim, __global const float* channel_sum, __global float* data);
 template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count,
-		const int num, const int channels,
-		const int spatial_dim, __global const double* channel_sum, __global double* data);
+    const int num, const int channels,
+    const int spatial_dim, __global const double* channel_sum, __global double* data);
 
 template <class T>
 __kernel void kernel_channel_dot(const int num, const int channels,
-		const int spatial_dim, __global const T* data_1, __global const T* data_2,
-		__global T* channel_dot) {
-	int index = get_global_id(0);
-	if(index < num * spatial_dim) {
-		int n = index / spatial_dim;
-		int s = index % spatial_dim;
-		T dot = 0;
-		for (int c = 0; c < channels; ++c) {
-			dot += (data_1[(n * channels + c) * spatial_dim + s]
-					* data_2[(n * channels + c) * spatial_dim + s]);
-		}
-		channel_dot[index] = dot;
-	}
+    const int spatial_dim, __global const T* data_1, __global const T* data_2,
+    __global T* channel_dot) {
+  int index = get_global_id(0);
+  if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T dot = 0;
+    for (int c = 0; c < channels; ++c) {
+      dot += (data_1[(n * channels + c) * spatial_dim + s]
+          * data_2[(n * channels + c) * spatial_dim + s]);
+    }
+    channel_dot[index] = dot;
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels,
-		const int spatial_dim, __global const float* data_1, __global const float* data_2,
-		__global float* channel_dot);
+    const int spatial_dim, __global const float* data_1, __global const float* data_2,
+    __global float* channel_dot);
 template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels,
-		const int spatial_dim, __global const double* data_1, __global const double* data_2,
-		__global double* channel_dot);
+    const int spatial_dim, __global const double* data_1, __global const double* data_2,
+    __global double* channel_dot);
diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl
index 70c282e1..731f660c 100644
--- a/src/caffe/ocl/softmaxwithloss_layer.cl
+++ b/src/caffe/ocl/softmaxwithloss_layer.cl
@@ -26,77 +26,77 @@
 
 template <class T>
 __kernel void SoftmaxLossForwardGPU(const int nthreads,
-		__global T* prob_data, __global T* label,__global T* loss,
-		int num, int dim, int spatial_dim,
-		bool has_ignore_label_, int ignore_label_,
-		__global T* counts) {
-	int index = get_global_id(0);
-	if(index < nthreads) {
-		const int n = index / spatial_dim;
-		const int s = index % spatial_dim;
-		const int label_value = static_cast<int>(label[n * spatial_dim + s]);
-		if (has_ignore_label_ && label_value == ignore_label_) {
-			loss[index] = 0;
-			counts[index] = 0;
-		} else {
-			loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
-							T(FLT_MIN)));
-			counts[index] = 1;
-		}
-	}
+    __global T* prob_data, __global T* label,__global T* loss,
+    int num, int dim, int spatial_dim,
+    bool has_ignore_label_, int ignore_label_,
+    __global T* counts) {
+  int index = get_global_id(0);
+  if(index < nthreads) {
+    const int n = index / spatial_dim;
+    const int s = index % spatial_dim;
+    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+    if (has_ignore_label_ && label_value == ignore_label_) {
+      loss[index] = 0;
+      counts[index] = 0;
+    } else {
+      loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
+              T(FLT_MIN)));
+      counts[index] = 1;
+    }
+  }
 }
 
 template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads,
-		__global float* prob_data, __global float* label,__global float* loss,
-		int num, int dim, int spatial_dim,
-		bool has_ignore_label_, int ignore_label_,
-		__global float* counts);
+    __global float* prob_data, __global float* label,__global float* loss,
+    int num, int dim, int spatial_dim,
+    bool has_ignore_label_, int ignore_label_,
+    __global float* counts);
 template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads,
-		__global double* prob_data, __global double* label,__global double* loss,
-		int num, int dim, int spatial_dim,
-		bool has_ignore_label_, int ignore_label_,
-		__global double* counts);
+    __global double* prob_data, __global double* label,__global double* loss,
+    int num, int dim, int spatial_dim,
+    bool has_ignore_label_, int ignore_label_,
+    __global double* counts);
 
 template <class T>
 __kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top,
-		__global T* label,__global T* bottom_diff, int num, int dim,
-		int spatial_dim, bool has_ignore_label_,
-		int ignore_label_, T* counts) {
-	const int channels = dim / spatial_dim;
-	int index = get_global_id(0);
-	if(index < nthreads) {
-		const int n = index / spatial_dim;
-		const int s = index % spatial_dim;
-		const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+    __global T* label,__global T* bottom_diff, int num, int dim,
+    int spatial_dim, bool has_ignore_label_,
+    int ignore_label_, T* counts) {
+  const int channels = dim / spatial_dim;
+  int index = get_global_id(0);
+  if(index < nthreads) {
+    const int n = index / spatial_dim;
+    const int s = index % spatial_dim;
+    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
 
-		if (has_ignore_label_ && label_value == ignore_label_) {
-			for (int c = 0; c < channels; ++c) {
-				bottom_diff[n * dim + c * spatial_dim + s] = 0;
-			}
-			counts[index] = 0;
-		} else {
-			bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
-			counts[index] = 1;
-		}
-	}
+    if (has_ignore_label_ && label_value == ignore_label_) {
+      for (int c = 0; c < channels; ++c) {
+        bottom_diff[n * dim + c * spatial_dim + s] = 0;
+      }
+      counts[index] = 0;
+    } else {
+      bottom_diff[n * dim + label_value * spatial_dim + s] -= 1;
+      counts[index] = 1;
+    }
+  }
 }
 template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top,
-		__global float* label,__global float* bottom_diff, int num, int dim,
-		int spatial_dim, bool has_ignore_label_,
-		int ignore_label_, float* counts);
+    __global float* label,__global float* bottom_diff, int num, int dim,
+    int spatial_dim, bool has_ignore_label_,
+    int ignore_label_, float* counts);
 
 template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top,
-		__global double* label,__global double* bottom_diff, int num, int dim,
-		int spatial_dim, bool has_ignore_label_,
-		int ignore_label_, double* counts);
+    __global double* label,__global double* bottom_diff, int num, int dim,
+    int spatial_dim, bool has_ignore_label_,
+    int ignore_label_, double* counts);
 
 template <class T>
 __kernel void scal (const int num, const T alpha, __global T* data) {
-	int index = get_global_id(0);
-	int total = get_global_size(0);
-	for(index; index < num; index += total) {
-		data[index] = data[index] * alpha;
-	}
+  int index = get_global_id(0);
+  int total = get_global_size(0);
+  for(index; index < num; index += total) {
+    data[index] = data[index] * alpha;
+  }
 }
 
 template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data);
diff --git a/src/caffe/ocl/tanh_layer.cl b/src/caffe/ocl/tanh_layer.cl
index a8bd05c9..900f11ea 100644
--- a/src/caffe/ocl/tanh_layer.cl
+++ b/src/caffe/ocl/tanh_layer.cl
@@ -26,9 +26,9 @@
 
 template <class T>
 __kernel void TanHForward(const int count, __global T* in, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count)
-	out[index] =tanh(in[index]);
+  int index = get_global_id(0);
+  if(index < count)
+  out[index] =tanh(in[index]);
 }
 
 template __attribute__ ((mangled_name(TanHForward_float))) __kernel void TanHForward(const int count, __global float* in, __global float* out);
@@ -36,10 +36,10 @@ template __attribute__ ((mangled_name(TanHForward_double))) __kernel void TanHFo
 
 template <class T>
 __kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) {
-	int index = get_global_id(0);
-	const T tanhx = out_data[index];
-	if(index < count)
-	out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx);
+  int index = get_global_id(0);
+  const T tanhx = out_data[index];
+  if(index < count)
+  out_diff[index] = in_diff[index] * ( 1- tanhx * tanhx);
 }
 
 template __attribute__ ((mangled_name(TanHBackward_float))) __kernel void TanHBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff);
diff --git a/src/caffe/ocl/threshold_layer.cl b/src/caffe/ocl/threshold_layer.cl
index 19df83e2..679dbf29 100644
--- a/src/caffe/ocl/threshold_layer.cl
+++ b/src/caffe/ocl/threshold_layer.cl
@@ -26,9 +26,9 @@
 
 template <class T>
 __kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count)
-	out[index] =in[index] > threshold ? 1 : 0;
+  int index = get_global_id(0);
+  if(index < count)
+  out[index] =in[index] > threshold ? 1 : 0;
 }
 
 template __attribute__ ((mangled_name(ThresholdForward_float))) __kernel void ThresholdForward(const int count, const float threshold, __global float* in, __global float* out);
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index 07a16fbd..576a6e98 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -28,10 +28,10 @@
 
 template <class T>
 __kernel void OCL_memset(__global T* buffer, const T value, const int size) {
-	int gdx = get_global_id(0);
-	if(gdx < size) {
-		buffer[gdx] = value;
-	}
+  int gdx = get_global_id(0);
+  if(gdx < size) {
+    buffer[gdx] = value;
+  }
 }
 
 template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size);
@@ -39,18 +39,18 @@ template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__
 template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
 
 __kernel void OCL_memset2(__global int* buffer, const int value, const int size) {
-	int gdx = get_global_id(0);
-	if(gdx < size) {
-		buffer[gdx] = value;
-	}
+  int gdx = get_global_id(0);
+  if(gdx < size) {
+    buffer[gdx] = value;
+  }
 }
 
 template <class T>
 __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) {
-	int gdx = get_global_id(0);
-	if(gdx < N) {
-		Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0));
-	}
+  int gdx = get_global_id(0);
+  if(gdx < N) {
+    Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0));
+  }
 }
 
 template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
@@ -58,23 +58,23 @@ template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caff
 
 template <class T>
 __kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) {
-	int index = get_global_id(0);
-	if(index < n) {
-		y[index] = fabs(a[index]);
-	}
+  int index = get_global_id(0);
+  if(index < n) {
+    y[index] = fabs(a[index]);
+  }
 }
 template __attribute__((mangled_name(caffe_gpu_abs_float))) __kernel void caffe_gpu_abs(const int n, __global float* a, __global float* Y);
 template __attribute__((mangled_name(caffe_gpu_abs_double))) __kernel void caffe_gpu_abs(const int n, __global double* a, __global double* Y);
 
 template <class T>
 __kernel void get_max(const int num, const int dim, __global T* data, __global T* out) {
-	int index = get_global_id(0);
-	if (index < num) {
-		T maxval = -FLT_MAX;
-		for (int i = 0; i < dim; i++)
-		maxval = max( data[index*dim + i], maxval );
-		out[index] = maxval;
-	}
+  int index = get_global_id(0);
+  if (index < num) {
+    T maxval = -FLT_MAX;
+    for (int i = 0; i < dim; i++)
+    maxval = max( data[index*dim + i], maxval );
+    out[index] = maxval;
+  }
 }
 
 template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out);
@@ -82,9 +82,9 @@ template __attribute__ ((mangled_name(get_max_double))) __kernel void get_max(co
 
 template <class T>
 __kernel void exp (const int num, __global T* data, __global T* out) {
-	int index = get_global_id(0);
-	if (index < num)
-	out[index] = exp(data[index]);
+  int index = get_global_id(0);
+  if (index < num)
+  out[index] = exp(data[index]);
 }
 
 template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out);
@@ -92,10 +92,10 @@ template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int
 
 template <class T>
 __kernel void kernel_sub(const int count, __global const T* a, __global const T* b, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out[index] = a[index] - b[index];
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out[index] = a[index] - b[index];
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_sub_float))) __kernel void kernel_sub(const int count, __global const float* a, __global const float* b, __global float* out);
@@ -103,10 +103,10 @@ template __attribute__ ((mangled_name(kernel_sub_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_add(const int count, __global const T* a, __global const T* b, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out[index] = a[index] + b[index];
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out[index] = a[index] + b[index];
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_add_float))) __kernel void kernel_add(const int count, __global const float* a, __global const float* b, __global float* out);
@@ -114,10 +114,10 @@ template __attribute__ ((mangled_name(kernel_add_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_div(const int count, __global const T* a, __global const T* b, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out[index] = a[index] / b[index];
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out[index] = a[index] / b[index];
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_div_float))) __kernel void kernel_div(const int count, __global const float* a, __global const float* b, __global float* out);
@@ -125,10 +125,10 @@ template __attribute__ ((mangled_name(kernel_div_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_mul(const int count, __global const T* a, __global const T* b, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out[index] = a[index] * b[index];
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out[index] = a[index] * b[index];
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_mul_float))) __kernel void kernel_mul(const int count, __global const float* a, __global const float* b, __global float* out);
@@ -136,10 +136,10 @@ template __attribute__ ((mangled_name(kernel_mul_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_powx(const int count, __global const T* data, const T alpha, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out[index] = pow(data[index], alpha);
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out[index] = pow(data[index], alpha);
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_powx_float))) __kernel void kernel_powx(const int count, __global const float* data, const float alpha, __global float* out);
@@ -147,10 +147,10 @@ template __attribute__ ((mangled_name(kernel_powx_double))) __kernel void kernel
 
 template <class T>
 __kernel void kernel_exp(const int count, __global const T* data, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out[index] = exp(data[index]);
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out[index] = exp(data[index]);
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out);
@@ -158,10 +158,10 @@ template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_
 
 template <class T>
 __kernel void kernel_add_scalar(const int count, const T data, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out[index] = out[index] + data;
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out[index] = out[index] + data;
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out);
@@ -169,10 +169,10 @@ template __attribute__ ((mangled_name(kernel_add_scalar_double))) __kernel void
 
 template <class T>
 __kernel void kernel_log(const int count, __global const T* data, __global T* out) {
-	int index = get_global_id(0);
-	if(index < count) {
-		out[index] = log(data[index]);
-	}
+  int index = get_global_id(0);
+  if(index < count) {
+    out[index] = log(data[index]);
+  }
 }
 
 template __attribute__ ((mangled_name(kernel_log_float))) __kernel void kernel_log(const int count, __global const float* data, __global float* out);
@@ -180,13 +180,13 @@ template __attribute__ ((mangled_name(kernel_log_double))) __kernel void kernel_
 
 template <class T>
 __kernel void diff (const int num, const int dim, __global T* data, __global T* label) {
-	int index = get_global_id(0);
-	int total = get_global_size(0);
-	int offset;
-	for(index; index < num; index += total) {
-		offset = (int) label[index];
-		data[index * dim + offset] -= 1;
-	}
+  int index = get_global_id(0);
+  int total = get_global_size(0);
+  int offset;
+  for(index; index < num; index += total) {
+    offset = (int) label[index];
+    data[index * dim + offset] -= 1;
+  }
 }
 
 template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label);
@@ -194,9 +194,9 @@ template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const i
 
 template <class T>
 __kernel void div (const int n, __global const T* a, __global const T* b, __global T* y) {
-	int index = get_global_id(0);
-	if (index < n)
-	y[index] = a[index] / b[index];
+  int index = get_global_id(0);
+  if (index < n)
+  y[index] = a[index] / b[index];
 }
 
 template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y);
@@ -204,9 +204,9 @@ template __attribute__ ((mangled_name(div_float))) __kernel void div (const int
 
 template <class T>
 __kernel void add_scalar (const int n, const T alpha, __global T* y) {
-	int index = get_global_id(0);
-	if (index < n)
-	y[index] += alpha;
+  int index = get_global_id(0);
+  if (index < n)
+  y[index] += alpha;
 }
 
 template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y);
@@ -214,18 +214,18 @@ template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_sca
 
 template <typename Dtype>
 __kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) {
-	int index = get_global_id(0);
-	if (index < n)
-	y[index] = in1[index] + in2[index];
+  int index = get_global_id(0);
+  if (index < n)
+  y[index] = in1[index] + in2[index];
 }
 template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y);
 template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y);
 
 template <class T>
 __kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y) {
-	int index = get_global_id(0);
-	if (index < n)
-	y[index] = a[index] * b[index];
+  int index = get_global_id(0);
+  if (index < n)
+  y[index] = a[index] * b[index];
 }
 
 template __attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y);
@@ -233,10 +233,10 @@ template __attribute__ ((mangled_name(element_mul_double))) __kernel void elemen
 
 template <class T>
 __kernel void powx (const int n, __global const T* a, const T alpha, __global T* y) {
-	int index = get_global_id(0);
-	if (index < n)
+  int index = get_global_id(0);
+  if (index < n)
 //           y[index] = a[index] + alpha;
-	y[index] = pow(a[index], alpha);
+  y[index] = pow(a[index], alpha);
 }
 
 template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y);
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index ffb77b78..8d7f8238 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -15,391 +15,387 @@ namespace caffe {
 
 template <typename Dtype>
 Solver<Dtype>::Solver(const SolverParameter& param)
-:
-		net_() {
-	Init(param);
+    : net_() {
+  Init(param);
 }
 
 template <typename Dtype>
 void Solver<Dtype>::ocl_setup() {
-	scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
-	add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL);
-	div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
-	powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
+  scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
+  add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL);
+  div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
+  powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
 }
 
 template <typename Dtype>
 Solver<Dtype>::Solver(const string& param_file)
-:
-		net_() {
-	SolverParameter param;
-	ReadProtoFromTextFileOrDie(param_file, &param);
-	Init(param);
+    : net_() {
+  SolverParameter param;
+  ReadProtoFromTextFileOrDie(param_file, &param);
+  Init(param);
 }
 
 template <typename Dtype>
 void Solver<Dtype>::Init(const SolverParameter& param) {
-	LOG(INFO) << "Initializing solver from parameters: " << std::endl
-			<< param.DebugString();
-	param_ = param;
-	CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
-
-	ocl_setup();
-
-	if (param_.random_seed() >= 0) {
-		Caffe::set_random_seed(param_.random_seed());
-	}
-	// Scaffolding code
-	InitTrainNet();
-	InitTestNets();
-	LOG(INFO) << "Solver scaffolding done.";
-	iter_ = 0;
-	current_step_ = 0;
+  LOG(INFO) << "Initializing solver from parameters: " << std::endl
+      << param.DebugString();
+  param_ = param;
+  CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
+
+  ocl_setup();
+
+  if (param_.random_seed() >= 0) {
+    Caffe::set_random_seed(param_.random_seed());
+  }
+  // Scaffolding code
+  InitTrainNet();
+  InitTestNets();
+  LOG(INFO) << "Solver scaffolding done.";
+  iter_ = 0;
+  current_step_ = 0;
 }
 
 template <typename Dtype>
 void Solver<Dtype>::InitTrainNet() {
-	const int num_train_nets = param_.has_net() + param_.has_net_param() +
-			param_.has_train_net() + param_.has_train_net_param();
-	const string& field_names = "net, net_param, train_net, train_net_param";
-	CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
-			<< "using one of these fields: " << field_names;
-	CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
-			<< "one of these fields specifying a train_net: " << field_names;
-	NetParameter net_param;
-	if (param_.has_train_net_param()) {
-		LOG(INFO) << "Creating training net specified in train_net_param.";
-		net_param.CopyFrom(param_.train_net_param());
-	} else if (param_.has_train_net()) {
-		LOG(INFO) << "Creating training net from train_net file: "
-				<< param_.train_net();
-		ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
-	}
-	if (param_.has_net_param()) {
-		LOG(INFO) << "Creating training net specified in net_param.";
-		net_param.CopyFrom(param_.net_param());
-	}
-	if (param_.has_net()) {
-		LOG(INFO) << "Creating training net from net file: " << param_.net();
-		ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);
-	}
-	// Set the correct NetState.  We start with the solver defaults (lowest
-	// precedence); then, merge in any NetState specified by the net_param itself;
-	// finally, merge in any NetState specified by the train_state (highest
-	// precedence).
-	NetState net_state;
-	net_state.set_phase(TRAIN);
-	net_state.MergeFrom(net_param.state());
-	net_state.MergeFrom(param_.train_state());
-	net_param.mutable_state()->CopyFrom(net_state);
-	net_.reset(new Net<Dtype>(net_param));
+  const int num_train_nets = param_.has_net() + param_.has_net_param()
+      + param_.has_train_net() + param_.has_train_net_param();
+  const string& field_names = "net, net_param, train_net, train_net_param";
+  CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
+      << "using one of these fields: " << field_names;
+  CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
+      << "one of these fields specifying a train_net: " << field_names;
+  NetParameter net_param;
+  if (param_.has_train_net_param()) {
+    LOG(INFO) << "Creating training net specified in train_net_param.";
+    net_param.CopyFrom(param_.train_net_param());
+  } else if (param_.has_train_net()) {
+    LOG(INFO) << "Creating training net from train_net file: "
+        << param_.train_net();
+    ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
+  }
+  if (param_.has_net_param()) {
+    LOG(INFO) << "Creating training net specified in net_param.";
+    net_param.CopyFrom(param_.net_param());
+  }
+  if (param_.has_net()) {
+    LOG(INFO) << "Creating training net from net file: " << param_.net();
+    ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);
+  }
+  // Set the correct NetState.  We start with the solver defaults (lowest
+  // precedence); then, merge in any NetState specified by the net_param itself;
+  // finally, merge in any NetState specified by the train_state (highest
+  // precedence).
+  NetState net_state;
+  net_state.set_phase(TRAIN);
+  net_state.MergeFrom(net_param.state());
+  net_state.MergeFrom(param_.train_state());
+  net_param.mutable_state()->CopyFrom(net_state);
+  net_.reset(new Net<Dtype>(net_param));
 }
 
 template <typename Dtype>
 void Solver<Dtype>::InitTestNets() {
-	const bool has_net_param = param_.has_net_param();
-	const bool has_net_file = param_.has_net();
-	const int num_generic_nets = has_net_param + has_net_file;
-	CHECK_LE(num_generic_nets, 1)
-			<< "Both net_param and net_file may not be specified.";
-	const int num_test_net_params = param_.test_net_param_size();
-	const int num_test_net_files = param_.test_net_size();
-	const int num_test_nets = num_test_net_params + num_test_net_files;
-	if (num_generic_nets) {
-		CHECK_GE(param_.test_iter_size(), num_test_nets)
-				<< "test_iter must be specified for each test network.";
-	} else {
-		CHECK_EQ(param_.test_iter_size(), num_test_nets)
-				<< "test_iter must be specified for each test network.";
-	}
-	// If we have a generic net (specified by net or net_param, rather than
-	// test_net or test_net_param), we may have an unlimited number of actual
-	// test networks -- the actual number is given by the number of remaining
-	// test_iters after any test nets specified by test_net_param and/or test_net
-	// are evaluated.
-	const int num_generic_net_instances = param_.test_iter_size() - num_test_nets;
-	const int num_test_net_instances = num_test_nets + num_generic_net_instances;
-	if (param_.test_state_size()) {
-		CHECK_EQ(param_.test_state_size(), num_test_net_instances)
-				<< "test_state must be unspecified or specified once per test net.";
-	}
-	if (num_test_net_instances) {
-		CHECK_GT(param_.test_interval(), 0);
-	}
-	int test_net_id = 0;
-	vector < string > sources(num_test_net_instances);
-	vector < NetParameter > net_params(num_test_net_instances);
-	for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) {
-		sources[test_net_id] = "test_net_param";
-		net_params[test_net_id].CopyFrom(param_.test_net_param(i));
-	}
-	for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) {
-		sources[test_net_id] = "test_net file: " + param_.test_net(i);
-		ReadNetParamsFromTextFileOrDie(param_.test_net(i),
-				&net_params[test_net_id]);
-	}
-	const int remaining_test_nets = param_.test_iter_size() - test_net_id;
-	if (has_net_param) {
-		for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
-			sources[test_net_id] = "net_param";
-			net_params[test_net_id].CopyFrom(param_.net_param());
-		}
-	}
-	if (has_net_file) {
-		for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
-			sources[test_net_id] = "net file: " + param_.net();
-			ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]);
-		}
-	}
-	test_nets_.resize(num_test_net_instances);
-	for (int i = 0; i < num_test_net_instances; ++i) {
-		// Set the correct NetState.  We start with the solver defaults (lowest
-		// precedence); then, merge in any NetState specified by the net_param
-		// itself; finally, merge in any NetState specified by the test_state
-		// (highest precedence).
-		NetState net_state;
-		net_state.set_phase(TEST);
-		net_state.MergeFrom(net_params[i].state());
-		if (param_.test_state_size()) {
-			net_state.MergeFrom(param_.test_state(i));
-		}
-		net_params[i].mutable_state()->CopyFrom(net_state);
-		LOG(INFO)
-				<< "Creating test net (#" << i << ") specified by " << sources[i];
-		test_nets_[i].reset(new Net<Dtype>(net_params[i]));
-		test_nets_[i]->set_debug_info(param_.debug_info());
-	}
+  const bool has_net_param = param_.has_net_param();
+  const bool has_net_file = param_.has_net();
+  const int num_generic_nets = has_net_param + has_net_file;
+  CHECK_LE(num_generic_nets, 1)
+      << "Both net_param and net_file may not be specified.";
+  const int num_test_net_params = param_.test_net_param_size();
+  const int num_test_net_files = param_.test_net_size();
+  const int num_test_nets = num_test_net_params + num_test_net_files;
+  if (num_generic_nets) {
+    CHECK_GE(param_.test_iter_size(), num_test_nets)
+        << "test_iter must be specified for each test network.";
+  } else {
+    CHECK_EQ(param_.test_iter_size(), num_test_nets)
+        << "test_iter must be specified for each test network.";
+  }
+  // If we have a generic net (specified by net or net_param, rather than
+  // test_net or test_net_param), we may have an unlimited number of actual
+  // test networks -- the actual number is given by the number of remaining
+  // test_iters after any test nets specified by test_net_param and/or test_net
+  // are evaluated.
+  const int num_generic_net_instances = param_.test_iter_size() - num_test_nets;
+  const int num_test_net_instances = num_test_nets + num_generic_net_instances;
+  if (param_.test_state_size()) {
+    CHECK_EQ(param_.test_state_size(), num_test_net_instances)
+        << "test_state must be unspecified or specified once per test net.";
+  }
+  if (num_test_net_instances) {
+    CHECK_GT(param_.test_interval(), 0);
+  }
+  int test_net_id = 0;
+  vector < string > sources(num_test_net_instances);
+  vector < NetParameter > net_params(num_test_net_instances);
+  for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) {
+    sources[test_net_id] = "test_net_param";
+    net_params[test_net_id].CopyFrom(param_.test_net_param(i));
+  }
+  for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) {
+    sources[test_net_id] = "test_net file: " + param_.test_net(i);
+    ReadNetParamsFromTextFileOrDie(param_.test_net(i),
+        &net_params[test_net_id]);
+  }
+  const int remaining_test_nets = param_.test_iter_size() - test_net_id;
+  if (has_net_param) {
+    for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
+      sources[test_net_id] = "net_param";
+      net_params[test_net_id].CopyFrom(param_.net_param());
+    }
+  }
+  if (has_net_file) {
+    for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
+      sources[test_net_id] = "net file: " + param_.net();
+      ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]);
+    }
+  }
+  test_nets_.resize(num_test_net_instances);
+  for (int i = 0; i < num_test_net_instances; ++i) {
+    // Set the correct NetState.  We start with the solver defaults (lowest
+    // precedence); then, merge in any NetState specified by the net_param
+    // itself; finally, merge in any NetState specified by the test_state
+    // (highest precedence).
+    NetState net_state;
+    net_state.set_phase(TEST);
+    net_state.MergeFrom(net_params[i].state());
+    if (param_.test_state_size()) {
+      net_state.MergeFrom(param_.test_state(i));
+    }
+    net_params[i].mutable_state()->CopyFrom(net_state);
+    LOG(INFO) << "Creating test net (#" << i << ") specified by " << sources[i];
+    test_nets_[i].reset(new Net<Dtype>(net_params[i]));
+    test_nets_[i]->set_debug_info(param_.debug_info());
+  }
 }
 
 template <typename Dtype>
 void Solver<Dtype>::Step(int iters) {
-	vector<Blob<Dtype>*> bottom_vec;
-	const int start_iter = iter_;
-	const int stop_iter = iter_ + iters;
-	int average_loss = this->param_.average_loss();
-	vector < Dtype > losses;
-	Dtype smoothed_loss = 0;
-
-	while (iter_ < stop_iter) {
-		// zero-init the params
-		for (int i = 0; i < net_->params().size(); ++i) {
-			shared_ptr < Blob<Dtype> > blob = net_->params()[i];
-			switch (Caffe::mode()) {
-				case Caffe::CPU:
-					caffe_set(blob->count(), static_cast<Dtype>(0),
-							blob->mutable_cpu_diff());
-					break;
-				case Caffe::GPU:
-					#ifndef CPU_ONLY
-					caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
-							blob->mutable_gpu_diff());
+  vector<Blob<Dtype>*> bottom_vec;
+  const int start_iter = iter_;
+  const int stop_iter = iter_ + iters;
+  int average_loss = this->param_.average_loss();
+  vector < Dtype > losses;
+  Dtype smoothed_loss = 0;
+
+  while (iter_ < stop_iter) {
+    // zero-init the params
+    for (int i = 0; i < net_->params().size(); ++i) {
+      shared_ptr < Blob<Dtype> > blob = net_->params()[i];
+      switch (Caffe::mode()) {
+      case Caffe::CPU:
+        caffe_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_cpu_diff());
+        break;
+      case Caffe::GPU:
+#ifndef CPU_ONLY
+        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_gpu_diff());
 #else
-					NO_GPU;
+        NO_GPU;
 #endif
-				case Caffe::APU:
-					#ifndef CPU_ONLY
-					caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
-							blob->mutable_gpu_diff());
+      case Caffe::APU:
+#ifndef CPU_ONLY
+        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_gpu_diff());
 #else
-					NO_GPU;
+        NO_GPU;
 #endif
-					break;
-			}
-		}
-
-		if (param_.test_interval() && iter_ % param_.test_interval() == 0
-				&& (iter_ > 0 || param_.test_initialization())) {
-			TestAll();
-		}
-
-		const bool display = param_.display() && iter_ % param_.display() == 0;
-		net_->set_debug_info(display && param_.debug_info());
-		// accumulate the loss and gradient
-		Dtype loss = 0;
-		for (int i = 0; i < param_.iter_size(); ++i) {
-			loss += net_->ForwardBackward(bottom_vec);
-		}
-		loss /= param_.iter_size();
-		// average the loss across iterations for smoothed reporting
-		if (losses.size() < average_loss) {
-			losses.push_back(loss);
-			int size = losses.size();
-			smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
-		} else {
-			int idx = (iter_ - start_iter) % average_loss;
-			smoothed_loss += (loss - losses[idx]) / average_loss;
-			losses[idx] = loss;
-		}
-		if (display) {
-			LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
-			const vector<Blob<Dtype>*>& result = net_->output_blobs();
-			int score_index = 0;
-			for (int j = 0; j < result.size(); ++j) {
-				const Dtype* result_vec = result[j]->cpu_data();
-				const string& output_name =
-						net_->blob_names()[net_->output_blob_indices()[j]];
-				const Dtype loss_weight =
-						net_->blob_loss_weights()[net_->output_blob_indices()[j]];
-				for (int k = 0; k < result[j]->count(); ++k) {
-					ostringstream loss_msg_stream;
-					if (loss_weight) {
-						loss_msg_stream << " (* " << loss_weight
-								<< " = " << loss_weight * result_vec[k] << " loss)";
-					}
-					LOG(INFO) << "    Train net output #"
-							<< score_index++ << ": " << output_name << " = "
-							<< result_vec[k] << loss_msg_stream.str();
-				}
-			}
-		}
-		ApplyUpdate();
-
-		// Increment the internal iter_ counter -- its value should always indicate
-		// the number of times the weights have been updated.
-		++iter_;
-
-		// Save a snapshot if needed.
-		if (param_.snapshot() && iter_ % param_.snapshot() == 0) {
-			Snapshot();
-		}
-	}
+        break;
+      }
+    }
+
+    if (param_.test_interval() && iter_ % param_.test_interval() == 0
+        && (iter_ > 0 || param_.test_initialization())) {
+      TestAll();
+    }
+
+    const bool display = param_.display() && iter_ % param_.display() == 0;
+    net_->set_debug_info(display && param_.debug_info());
+    // accumulate the loss and gradient
+    Dtype loss = 0;
+    for (int i = 0; i < param_.iter_size(); ++i) {
+      loss += net_->ForwardBackward(bottom_vec);
+    }
+    loss /= param_.iter_size();
+    // average the loss across iterations for smoothed reporting
+    if (losses.size() < average_loss) {
+      losses.push_back(loss);
+      int size = losses.size();
+      smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
+    } else {
+      int idx = (iter_ - start_iter) % average_loss;
+      smoothed_loss += (loss - losses[idx]) / average_loss;
+      losses[idx] = loss;
+    }
+    if (display) {
+      LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
+      const vector<Blob<Dtype>*>& result = net_->output_blobs();
+      int score_index = 0;
+      for (int j = 0; j < result.size(); ++j) {
+        const Dtype* result_vec = result[j]->cpu_data();
+        const string& output_name =
+            net_->blob_names()[net_->output_blob_indices()[j]];
+        const Dtype loss_weight =
+            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
+        for (int k = 0; k < result[j]->count(); ++k) {
+          ostringstream loss_msg_stream;
+          if (loss_weight) {
+            loss_msg_stream << " (* " << loss_weight << " = "
+                << loss_weight * result_vec[k] << " loss)";
+          }
+          LOG(INFO) << "    Train net output #" << score_index++ << ": "
+              << output_name << " = " << result_vec[k] << loss_msg_stream.str();
+        }
+      }
+    }
+    ApplyUpdate();
+
+    // Increment the internal iter_ counter -- its value should always indicate
+    // the number of times the weights have been updated.
+    ++iter_;
+
+    // Save a snapshot if needed.
+    if (param_.snapshot() && iter_ % param_.snapshot() == 0) {
+      Snapshot();
+    }
+  }
 }
 
 template <typename Dtype>
 void Solver<Dtype>::Solve(const char* resume_file) {
-	LOG(INFO) << "Solving " << net_->name();
-	LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
-
-	if (resume_file) {
-		LOG(INFO) << "Restoring previous solver status from " << resume_file;
-		Restore(resume_file);
-	}
-
-	// For a network that is trained by the solver, no bottom or top vecs
-	// should be given, and we will just provide dummy vecs.
-	Step(param_.max_iter() - iter_);
-	// If we haven't already, save a snapshot after optimization, unless
-	// overridden by setting snapshot_after_train := false
-	if (param_.snapshot_after_train()
-			&& (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
-		Snapshot();
-	}
-	// After the optimization is done, run an additional train and test pass to
-	// display the train and test loss/outputs if appropriate (based on the
-	// display and test_interval settings, respectively).  Unlike in the rest of
-	// training, for the train net we only run a forward pass as we've already
-	// updated the parameters "max_iter" times -- this final pass is only done to
-	// display the loss, which is computed in the forward pass.
-	if (param_.display() && iter_ % param_.display() == 0) {
-		Dtype loss;
-		net_->ForwardPrefilled(&loss);
-		LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
-	}
-	if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
-		TestAll();
-	}
-	LOG(INFO) << "Optimization Done.";
+  LOG(INFO) << "Solving " << net_->name();
+  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
+
+  if (resume_file) {
+    LOG(INFO) << "Restoring previous solver status from " << resume_file;
+    Restore(resume_file);
+  }
+
+  // For a network that is trained by the solver, no bottom or top vecs
+  // should be given, and we will just provide dummy vecs.
+  Step(param_.max_iter() - iter_);
+  // If we haven't already, save a snapshot after optimization, unless
+  // overridden by setting snapshot_after_train := false
+  if (param_.snapshot_after_train()
+      && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
+    Snapshot();
+  }
+  // After the optimization is done, run an additional train and test pass to
+  // display the train and test loss/outputs if appropriate (based on the
+  // display and test_interval settings, respectively).  Unlike in the rest of
+  // training, for the train net we only run a forward pass as we've already
+  // updated the parameters "max_iter" times -- this final pass is only done to
+  // display the loss, which is computed in the forward pass.
+  if (param_.display() && iter_ % param_.display() == 0) {
+    Dtype loss;
+    net_->ForwardPrefilled(&loss);
+    LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
+  }
+  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
+    TestAll();
+  }
+  LOG(INFO) << "Optimization Done.";
 }
 
 template <typename Dtype>
 void Solver<Dtype>::TestAll() {
-	for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) {
-		Test(test_net_id);
-	}
+  for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) {
+    Test(test_net_id);
+  }
 }
 
 template <typename Dtype>
 void Solver<Dtype>::Test(const int test_net_id) {
-	LOG(INFO) << "Iteration " << iter_
-			<< ", Testing net (#" << test_net_id << ")";
-	CHECK_NOTNULL(test_nets_[test_net_id].get())->
-			ShareTrainedLayersWith(net_.get());
-	vector < Dtype > test_score;
-	vector<int> test_score_output_id;
-	vector<Blob<Dtype>*> bottom_vec;
-	const shared_ptr<Net<Dtype> >& test_net = test_nets_[test_net_id];
-	Dtype loss = 0;
-	for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
-		Dtype iter_loss;
-		const vector<Blob<Dtype>*>& result =
-				test_net->Forward(bottom_vec, &iter_loss);
-		if (param_.test_compute_loss()) {
-			loss += iter_loss;
-		}
-		if (i == 0) {
-			for (int j = 0; j < result.size(); ++j) {
-				const Dtype* result_vec = result[j]->cpu_data();
-				for (int k = 0; k < result[j]->count(); ++k) {
-					test_score.push_back(result_vec[k]);
-					test_score_output_id.push_back(j);
-				}
-			}
-		} else {
-			int idx = 0;
-			for (int j = 0; j < result.size(); ++j) {
-				const Dtype* result_vec = result[j]->cpu_data();
-				for (int k = 0; k < result[j]->count(); ++k) {
-					test_score[idx++] += result_vec[k];
-				}
-			}
-		}
-	}
-	if (param_.test_compute_loss()) {
-		loss /= param_.test_iter(test_net_id);
-		LOG(INFO) << "Test loss: " << loss;
-	}
-	for (int i = 0; i < test_score.size(); ++i) {
-		const int output_blob_index =
-				test_net->output_blob_indices()[test_score_output_id[i]];
-		const string& output_name = test_net->blob_names()[output_blob_index];
-		const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
-		ostringstream loss_msg_stream;
-		const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
-		if (loss_weight) {
-			loss_msg_stream << " (* " << loss_weight
-					<< " = " << loss_weight * mean_score << " loss)";
-		}
-		LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
-				<< mean_score << loss_msg_stream.str();
-	}
+  LOG(INFO) << "Iteration " << iter_ << ", Testing net (#" << test_net_id
+      << ")";
+  CHECK_NOTNULL(test_nets_[test_net_id].get())->ShareTrainedLayersWith(
+      net_.get());
+  vector < Dtype > test_score;
+  vector<int> test_score_output_id;
+  vector<Blob<Dtype>*> bottom_vec;
+  const shared_ptr<Net<Dtype> >& test_net = test_nets_[test_net_id];
+  Dtype loss = 0;
+  for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
+    Dtype iter_loss;
+    const vector<Blob<Dtype>*>& result = test_net->Forward(bottom_vec,
+        &iter_loss);
+    if (param_.test_compute_loss()) {
+      loss += iter_loss;
+    }
+    if (i == 0) {
+      for (int j = 0; j < result.size(); ++j) {
+        const Dtype* result_vec = result[j]->cpu_data();
+        for (int k = 0; k < result[j]->count(); ++k) {
+          test_score.push_back(result_vec[k]);
+          test_score_output_id.push_back(j);
+        }
+      }
+    } else {
+      int idx = 0;
+      for (int j = 0; j < result.size(); ++j) {
+        const Dtype* result_vec = result[j]->cpu_data();
+        for (int k = 0; k < result[j]->count(); ++k) {
+          test_score[idx++] += result_vec[k];
+        }
+      }
+    }
+  }
+  if (param_.test_compute_loss()) {
+    loss /= param_.test_iter(test_net_id);
+    LOG(INFO) << "Test loss: " << loss;
+  }
+  for (int i = 0; i < test_score.size(); ++i) {
+    const int output_blob_index =
+        test_net->output_blob_indices()[test_score_output_id[i]];
+    const string& output_name = test_net->blob_names()[output_blob_index];
+    const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
+    ostringstream loss_msg_stream;
+    const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
+    if (loss_weight) {
+      loss_msg_stream << " (* " << loss_weight << " = "
+          << loss_weight * mean_score << " loss)";
+    }
+    LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
+        << mean_score << loss_msg_stream.str();
+  }
 }
 
 template <typename Dtype>
 void Solver<Dtype>::Snapshot() {
-	NetParameter net_param;
-	// For intermediate results, we will also dump the gradient values.
-	net_->ToProto(&net_param, param_.snapshot_diff());
-	string filename(param_.snapshot_prefix());
-	string model_filename, snapshot_filename;
-	const int kBufferSize = 20;
-	char iter_str_buffer[kBufferSize];
-	snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_);
-	filename += iter_str_buffer;
-	model_filename = filename + ".caffemodel";
-	LOG(INFO) << "Snapshotting to " << model_filename;
-	WriteProtoToBinaryFile(net_param, model_filename.c_str());
-	SolverState state;
-	SnapshotSolverState(&state);
-	state.set_iter(iter_);
-	state.set_learned_net(model_filename);
-	state.set_current_step(current_step_);
-	snapshot_filename = filename + ".solverstate";
-	LOG(INFO) << "Snapshotting solver state to " << snapshot_filename;
-	WriteProtoToBinaryFile(state, snapshot_filename.c_str());
+  NetParameter net_param;
+  // For intermediate results, we will also dump the gradient values.
+  net_->ToProto(&net_param, param_.snapshot_diff());
+  string filename(param_.snapshot_prefix());
+  string model_filename, snapshot_filename;
+  const int kBufferSize = 20;
+  char iter_str_buffer[kBufferSize];
+  snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_);
+  filename += iter_str_buffer;
+  model_filename = filename + ".caffemodel";
+  LOG(INFO) << "Snapshotting to " << model_filename;
+  WriteProtoToBinaryFile(net_param, model_filename.c_str());
+  SolverState state;
+  SnapshotSolverState(&state);
+  state.set_iter(iter_);
+  state.set_learned_net(model_filename);
+  state.set_current_step(current_step_);
+  snapshot_filename = filename + ".solverstate";
+  LOG(INFO) << "Snapshotting solver state to " << snapshot_filename;
+  WriteProtoToBinaryFile(state, snapshot_filename.c_str());
 }
 
 template <typename Dtype>
 void Solver<Dtype>::Restore(const char* state_file) {
-	SolverState state;
-	NetParameter net_param;
-	ReadProtoFromBinaryFile(state_file, &state);
-	if (state.has_learned_net()) {
-		ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param);
-		net_->CopyTrainedLayersFrom(net_param);
-	}
-	iter_ = state.iter();
-	current_step_ = state.current_step();
-	RestoreSolverState(state);
+  SolverState state;
+  NetParameter net_param;
+  ReadProtoFromBinaryFile(state_file, &state);
+  if (state.has_learned_net()) {
+    ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param);
+    net_->CopyTrainedLayersFrom(net_param);
+  }
+  iter_ = state.iter();
+  current_step_ = state.current_step();
+  RestoreSolverState(state);
 }
 
 // Return the current learning rate. The currently implemented learning rate
@@ -419,382 +415,379 @@ void Solver<Dtype>::Restore(const char* state_file) {
 // in the solver parameter protocol buffer, and iter is the current iteration.
 template <typename Dtype>
 Dtype SGDSolver<Dtype>::GetLearningRate() {
-	Dtype rate;
-	const string& lr_policy = this->param_.lr_policy();
-	if (lr_policy == "fixed") {
-		rate = this->param_.base_lr();
-	} else if (lr_policy == "step") {
-		this->current_step_ = this->iter_ / this->param_.stepsize();
-		rate = this->param_.base_lr() *
-				pow(this->param_.gamma(), this->current_step_);
-	} else if (lr_policy == "exp") {
-		rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
-	} else if (lr_policy == "inv") {
-		rate = this->param_.base_lr() *
-				pow(Dtype(1) + this->param_.gamma() * this->iter_,
-						-this->param_.power());
-	} else if (lr_policy == "multistep") {
-		if (this->current_step_ < this->param_.stepvalue_size() &&
-				this->iter_ >= this->param_.stepvalue(this->current_step_)) {
-			this->current_step_++;
-			LOG(INFO) << "MultiStep Status: Iteration " <<
-					this->iter_ << ", step = " << this->current_step_;
-		}
-		rate = this->param_.base_lr() *
-				pow(this->param_.gamma(), this->current_step_);
-	} else if (lr_policy == "poly") {
-		rate = this->param_.base_lr() * pow(Dtype(1.) -
-				(Dtype(this->iter_) / Dtype(this->param_.max_iter())),
-				this->param_.power());
-	} else if (lr_policy == "sigmoid") {
-		rate = this->param_.base_lr() * (Dtype(1.) /
-				(Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) -
-						Dtype(this->param_.stepsize())))));
-	} else {
-		LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
-	}
-	return rate;
+  Dtype rate;
+  const string& lr_policy = this->param_.lr_policy();
+  if (lr_policy == "fixed") {
+    rate = this->param_.base_lr();
+  } else if (lr_policy == "step") {
+    this->current_step_ = this->iter_ / this->param_.stepsize();
+    rate = this->param_.base_lr()
+        * pow(this->param_.gamma(), this->current_step_);
+  } else if (lr_policy == "exp") {
+    rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
+  } else if (lr_policy == "inv") {
+    rate = this->param_.base_lr()
+        * pow(Dtype(1) + this->param_.gamma() * this->iter_,
+            -this->param_.power());
+  } else if (lr_policy == "multistep") {
+    if (this->current_step_ < this->param_.stepvalue_size()
+        && this->iter_ >= this->param_.stepvalue(this->current_step_)) {
+      this->current_step_++;
+      LOG(INFO) << "MultiStep Status: Iteration " << this->iter_ << ", step = "
+          << this->current_step_;
+    }
+    rate = this->param_.base_lr()
+        * pow(this->param_.gamma(), this->current_step_);
+  } else if (lr_policy == "poly") {
+    rate = this->param_.base_lr()
+        * pow(Dtype(1.) - (Dtype(this->iter_) / Dtype(this->param_.max_iter())),
+            this->param_.power());
+  } else if (lr_policy == "sigmoid") {
+    rate =
+        this->param_.base_lr()
+            * (Dtype(1.)
+                / (Dtype(1.)
+                    + exp(
+                        -this->param_.gamma()
+                            * (Dtype(this->iter_)
+                                - Dtype(this->param_.stepsize())))));
+  } else {
+    LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
+  }
+  return rate;
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::PreSolve() {
-	// Initialize the history
-	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-	history_.clear();
-	update_.clear();
-	temp_.clear();
-	for (int i = 0; i < net_params.size(); ++i) {
-		const vector<int>& shape = net_params[i]->shape();
-		history_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
-		update_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
-		temp_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
-	}
+  // Initialize the history
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  history_.clear();
+  update_.clear();
+  temp_.clear();
+  for (int i = 0; i < net_params.size(); ++i) {
+    const vector<int>& shape = net_params[i]->shape();
+    history_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+    update_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+    temp_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+  }
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::ClipGradients() {
-	const Dtype clip_gradients = this->param_.clip_gradients();
-	if (clip_gradients < 0) {
-		return;
-	}
-	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-	Dtype sumsq_diff = 0;
-	for (int i = 0; i < net_params.size(); ++i) {
-		if (this->net_->param_owners()[i] < 0) {
-			sumsq_diff += net_params[i]->sumsq_diff();
-		}
-	}
-	const Dtype l2norm_diff = std::sqrt(sumsq_diff);
-	if (l2norm_diff > clip_gradients) {
-		Dtype scale_factor = clip_gradients / l2norm_diff;
-		LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
-				<< l2norm_diff << " > " << clip_gradients << ") "
-				<< "by scale factor " << scale_factor;
-		for (int i = 0; i < net_params.size(); ++i) {
-			if (this->net_->param_owners()[i] < 0) {
-				net_params[i]->scale_diff(scale_factor);
-			}
-		}
-	}
+  const Dtype clip_gradients = this->param_.clip_gradients();
+  if (clip_gradients < 0) {
+    return;
+  }
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  Dtype sumsq_diff = 0;
+  for (int i = 0; i < net_params.size(); ++i) {
+    if (this->net_->param_owners()[i] < 0) {
+      sumsq_diff += net_params[i]->sumsq_diff();
+    }
+  }
+  const Dtype l2norm_diff = std::sqrt(sumsq_diff);
+  if (l2norm_diff > clip_gradients) {
+    Dtype scale_factor = clip_gradients / l2norm_diff;
+    LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
+        << l2norm_diff << " > " << clip_gradients << ") " << "by scale factor "
+        << scale_factor;
+    for (int i = 0; i < net_params.size(); ++i) {
+      if (this->net_->param_owners()[i] < 0) {
+        net_params[i]->scale_diff(scale_factor);
+      }
+    }
+  }
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::ApplyUpdate() {
-	Dtype rate = GetLearningRate();
-	if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
-		LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
-	}
-	ClipGradients();
-	for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
-		Normalize(param_id);
-		Regularize(param_id);
-		ComputeUpdateValue(param_id, rate);
-	}
-	this->net_->Update();
+  Dtype rate = GetLearningRate();
+  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
+    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
+  }
+  ClipGradients();
+  for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
+    Normalize(param_id);
+    Regularize(param_id);
+    ComputeUpdateValue(param_id, rate);
+  }
+  this->net_->Update();
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::Normalize(int param_id) {
-	if (this->param_.iter_size() == 1) {
-		return;
-	}
-	// Scale gradient to counterbalance accumulation.
-	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-	const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
-	switch (Caffe::mode()) {
-		case Caffe::CPU: {
-			caffe_scal(net_params[param_id]->count(), accum_normalization,
-					net_params[param_id]->mutable_cpu_diff());
-			break;
-		}
-		case Caffe::GPU: {
+  if (this->param_.iter_size() == 1) {
+    return;
+  }
+  // Scale gradient to counterbalance accumulation.
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    caffe_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
 #ifndef CPU_ONLY
-			caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
-					net_params[param_id]->mutable_gpu_diff());
+    caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_gpu_diff());
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-			break;
-		}
-		default:
-			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-	}
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::Regularize(int param_id) {
-	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-	const vector<float>& net_params_weight_decay =
-			this->net_->params_weight_decay();
-	Dtype weight_decay = this->param_.weight_decay();
-	string regularization_type = this->param_.regularization_type();
-	Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-	switch (Caffe::mode()) {
-		case Caffe::CPU: {
-			if (local_decay) {
-				if (regularization_type == "L2") {
-					// add weight decay
-					caffe_axpy(net_params[param_id]->count(),
-							local_decay,
-							net_params[param_id]->cpu_data(),
-							net_params[param_id]->mutable_cpu_diff());
-				} else if (regularization_type == "L1") {
-					caffe_cpu_sign(net_params[param_id]->count(),
-							net_params[param_id]->cpu_data(),
-							temp_[param_id]->mutable_cpu_data());
-					caffe_axpy(net_params[param_id]->count(),
-							local_decay,
-							temp_[param_id]->cpu_data(),
-							net_params[param_id]->mutable_cpu_diff());
-				} else {
-					LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-				}
-			}
-			break;
-		}
-		case Caffe::GPU: {
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<float>& net_params_weight_decay =
+      this->net_->params_weight_decay();
+  Dtype weight_decay = this->param_.weight_decay();
+  string regularization_type = this->param_.regularization_type();
+  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    if (local_decay) {
+      if (regularization_type == "L2") {
+        // add weight decay
+        caffe_axpy(net_params[param_id]->count(), local_decay,
+            net_params[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      } else if (regularization_type == "L1") {
+        caffe_cpu_sign(net_params[param_id]->count(),
+            net_params[param_id]->cpu_data(),
+            temp_[param_id]->mutable_cpu_data());
+        caffe_axpy(net_params[param_id]->count(), local_decay,
+            temp_[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      } else {
+        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+      }
+    }
+    break;
+  }
+  case Caffe::GPU: {
 #ifndef CPU_ONLY
-			if (local_decay) {
-				if (regularization_type == "L2") {
-					// add weight decay
-					caffe_gpu_axpy(net_params[param_id]->count(),
-							local_decay,
-							net_params[param_id]->gpu_data(),
-							net_params[param_id]->mutable_gpu_diff());
-				} else if (regularization_type == "L1") {
-					caffe_gpu_sign(net_params[param_id]->count(),
-							net_params[param_id]->gpu_data(),
-							temp_[param_id]->mutable_gpu_data());
-					caffe_gpu_axpy(net_params[param_id]->count(),
-							local_decay,
-							temp_[param_id]->gpu_data(),
-							net_params[param_id]->mutable_gpu_diff());
-				} else {
-					LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-				}
-			}
+    if (local_decay) {
+      if (regularization_type == "L2") {
+        // add weight decay
+        caffe_gpu_axpy(net_params[param_id]->count(), local_decay,
+            net_params[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      } else if (regularization_type == "L1") {
+        caffe_gpu_sign(net_params[param_id]->count(),
+            net_params[param_id]->gpu_data(),
+            temp_[param_id]->mutable_gpu_data());
+        caffe_gpu_axpy(net_params[param_id]->count(), local_decay,
+            temp_[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      } else {
+        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
+      }
+    }
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-			break;
-		}
-		default:
-			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-	}
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-	const vector<float>& net_params_lr = this->net_->params_lr();
-	Dtype momentum = this->param_.momentum();
-	Dtype local_rate = rate * net_params_lr[param_id];
-	// Compute the update to history, then copy it to the parameter diff.
-	switch (Caffe::mode()) {
-		case Caffe::CPU: {
-			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-					net_params[param_id]->cpu_diff(), momentum,
-					history_[param_id]->mutable_cpu_data());
-			caffe_copy(net_params[param_id]->count(),
-					history_[param_id]->cpu_data(),
-					net_params[param_id]->mutable_cpu_diff());
-			break;
-		}
-		case Caffe::GPU: {
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype momentum = this->param_.momentum();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  // Compute the update to history, then copy it to the parameter diff.
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->cpu_diff(), momentum,
+        history_[param_id]->mutable_cpu_data());
+    caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
 #ifndef CPU_ONLY
-			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-					net_params[param_id]->gpu_diff(), momentum,
-					history_[param_id]->mutable_gpu_data());
-			caffe_gpu_copy(net_params[param_id]->count(),
-					history_[param_id]->gpu_data(),
-					net_params[param_id]->mutable_gpu_diff());
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->gpu_diff(), momentum,
+        history_[param_id]->mutable_gpu_data());
+    caffe_gpu_copy(net_params[param_id]->count(),
+        history_[param_id]->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-			break;
-		}
-		default:
-			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-	}
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::SnapshotSolverState(SolverState* state) {
-	state->clear_history();
-	for (int i = 0; i < history_.size(); ++i) {
-		// Add history
-		BlobProto* history_blob = state->add_history();
-		history_[i]->ToProto(history_blob);
-	}
+  state->clear_history();
+  for (int i = 0; i < history_.size(); ++i) {
+    // Add history
+    BlobProto* history_blob = state->add_history();
+    history_[i]->ToProto(history_blob);
+  }
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::RestoreSolverState(const SolverState& state) {
-	CHECK_EQ(state.history_size(), history_.size())
-			<< "Incorrect length of history blobs.";
-	LOG(INFO) << "SGDSolver: restoring history";
-	for (int i = 0; i < history_.size(); ++i) {
-		history_[i]->FromProto(state.history(i));
-	}
+  CHECK_EQ(state.history_size(), history_.size())
+      << "Incorrect length of history blobs.";
+  LOG(INFO) << "SGDSolver: restoring history";
+  for (int i = 0; i < history_.size(); ++i) {
+    history_[i]->FromProto(state.history(i));
+  }
 }
 
 template <typename Dtype>
 void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-	const vector<float>& net_params_lr = this->net_->params_lr();
-	Dtype momentum = this->param_.momentum();
-	Dtype local_rate = rate * net_params_lr[param_id];
-	switch (Caffe::mode()) {
-		case Caffe::CPU: {
-			// save history momentum for stepping back
-			caffe_copy(net_params[param_id]->count(),
-					this->history_[param_id]->cpu_data(),
-					this->update_[param_id]->mutable_cpu_data());
-
-			// update history
-			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-					net_params[param_id]->cpu_diff(), momentum,
-					this->history_[param_id]->mutable_cpu_data());
-
-			// compute update: step back then over step
-			caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-					this->history_[param_id]->cpu_data(), -momentum,
-					this->update_[param_id]->mutable_cpu_data());
-
-			// copy
-			caffe_copy(net_params[param_id]->count(),
-					this->update_[param_id]->cpu_data(),
-					net_params[param_id]->mutable_cpu_diff());
-			break;
-		}
-		case Caffe::GPU: {
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype momentum = this->param_.momentum();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    // save history momentum for stepping back
+    caffe_copy(net_params[param_id]->count(),
+        this->history_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->cpu_diff(), momentum,
+        this->history_[param_id]->mutable_cpu_data());
+
+    // compute update: step back then over step
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
+        this->history_[param_id]->cpu_data(), -momentum,
+        this->update_[param_id]->mutable_cpu_data());
+
+    // copy
+    caffe_copy(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
 #ifndef CPU_ONLY
-			// save history momentum for stepping back
-			caffe_copy(net_params[param_id]->count(),
-					this->history_[param_id]->gpu_data(),
-					this->update_[param_id]->mutable_gpu_data());
-
-			// update history
-			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-					net_params[param_id]->gpu_diff(), momentum,
-					this->history_[param_id]->mutable_gpu_data());
-
-			// compute update: step back then over step
-			caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-					this->history_[param_id]->gpu_data(), -momentum,
-					this->update_[param_id]->mutable_gpu_data());
-
-			// copy
-			caffe_gpu_copy(net_params[param_id]->count(),
-					this->update_[param_id]->gpu_data(),
-					net_params[param_id]->mutable_gpu_diff());
+    // save history momentum for stepping back
+    caffe_copy(net_params[param_id]->count(),
+        this->history_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->gpu_diff(), momentum,
+        this->history_[param_id]->mutable_gpu_data());
+
+    // compute update: step back then over step
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
+        this->history_[param_id]->gpu_data(), -momentum,
+        this->update_[param_id]->mutable_gpu_data());
+
+    // copy
+    caffe_gpu_copy(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-			break;
-		}
-		default:
-			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-	}
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
 }
 
 template <typename Dtype>
 void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-	const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-	const vector<float>& net_params_lr = this->net_->params_lr();
-	Dtype delta = this->param_.delta();
-	Dtype local_rate = rate * net_params_lr[param_id];
-	switch (Caffe::mode()) {
-		case Caffe::CPU: {
-			// compute square of gradient in update
-			caffe_powx(net_params[param_id]->count(),
-					net_params[param_id]->cpu_diff(), Dtype(2),
-					this->update_[param_id]->mutable_cpu_data());
-
-			// update history
-			caffe_add(net_params[param_id]->count(),
-					this->update_[param_id]->cpu_data(),
-					this->history_[param_id]->cpu_data(),
-					this->history_[param_id]->mutable_cpu_data());
-
-			// prepare update
-			caffe_powx(net_params[param_id]->count(),
-					this->history_[param_id]->cpu_data(), Dtype(0.5),
-					this->update_[param_id]->mutable_cpu_data());
-
-			caffe_add_scalar(net_params[param_id]->count(),
-					delta, this->update_[param_id]->mutable_cpu_data());
-
-			caffe_div(net_params[param_id]->count(),
-					net_params[param_id]->cpu_diff(),
-					this->update_[param_id]->cpu_data(),
-					this->update_[param_id]->mutable_cpu_data());
-
-			// scale and copy
-			caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-					this->update_[param_id]->cpu_data(), Dtype(0),
-					net_params[param_id]->mutable_cpu_diff());
-			break;
-		}
-		case Caffe::GPU: {
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype delta = this->param_.delta();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(),
+        Dtype(2), this->update_[param_id]->mutable_cpu_data());
+
+    // update history
+    caffe_add(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(),
+        this->history_[param_id]->cpu_data(),
+        this->history_[param_id]->mutable_cpu_data());
+
+    // prepare update
+    caffe_powx(net_params[param_id]->count(),
+        this->history_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
+
+    caffe_add_scalar(net_params[param_id]->count(), delta,
+        this->update_[param_id]->mutable_cpu_data());
+
+    caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(),
+        this->update_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // scale and copy
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->cpu_data(), Dtype(0),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
 #ifndef CPU_ONLY
-			// compute square of gradient in update
-			caffe_gpu_powx(net_params[param_id]->count(),
-					net_params[param_id]->gpu_diff(), Dtype(2),
-					this->update_[param_id]->mutable_gpu_data());
-
-			// update history
-			caffe_gpu_add(net_params[param_id]->count(),
-					this->update_[param_id]->gpu_data(),
-					this->history_[param_id]->gpu_data(),
-					this->history_[param_id]->mutable_gpu_data());
-
-			// prepare update
-			caffe_gpu_powx(net_params[param_id]->count(),
-					this->history_[param_id]->gpu_data(), Dtype(0.5),
-					this->update_[param_id]->mutable_gpu_data());
-
-			caffe_gpu_add_scalar < Dtype > (net_params[param_id]->count(),
-					delta, this->update_[param_id]->mutable_gpu_data());
-
-			caffe_gpu_div(net_params[param_id]->count(),
-					net_params[param_id]->gpu_diff(),
-					this->update_[param_id]->gpu_data(),
-					this->update_[param_id]->mutable_gpu_data());
-
-			// scale and copy
-			caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-					this->update_[param_id]->gpu_data(), Dtype(0),
-					net_params[param_id]->mutable_gpu_diff());
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(),
+        this->history_[param_id]->gpu_data(),
+        this->history_[param_id]->mutable_gpu_data());
+
+    // prepare update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->history_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add_scalar < Dtype
+        > (net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_div(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // scale and copy
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->gpu_data(), Dtype(0),
+        net_params[param_id]->mutable_gpu_diff());
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-			break;
-		}
-		default:
-			LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-	}
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
 }
 
 INSTANTIATE_CLASS (Solver);
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 976130bf..db470434 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -36,150 +36,149 @@
 namespace caffe {
 
 SyncedMemory::~SyncedMemory() {
-	if (cpu_ptr_ && own_cpu_data_) {
-		OCL_CHECK(
-				clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
-						cpu_ptr_, 0, NULL, NULL));
-		clFinish(amdDevice.CommandQueue);
-	}
-	if (gpu_cache_ptr_ && own_cpu_data_) {
-		OCL_CHECK(clReleaseMemObject((cl_mem) gpu_cache_ptr_));
-	}
-	if (gpu_ptr_) {
-		OCL_CHECK(clReleaseMemObject((cl_mem) gpu_ptr_));
-	}
-
-	clReleaseKernel (oclmem_kernel);
+  if (cpu_ptr_ && own_cpu_data_) {
+    OCL_CHECK(
+        clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+            cpu_ptr_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
+  }
+  if (gpu_cache_ptr_ && own_cpu_data_) {
+    OCL_CHECK(clReleaseMemObject((cl_mem) gpu_cache_ptr_));
+  }
+  if (gpu_ptr_) {
+    OCL_CHECK(clReleaseMemObject((cl_mem) gpu_ptr_));
+  }
+
+  clReleaseKernel (oclmem_kernel);
 }
 
 //begin: code written/modified by AMD.
 void SyncedMemory::ocl_setup() {
-	cl_int err = 0;
-	oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
-	OCL_CHECK(err);
+  cl_int err = 0;
+  oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
+  OCL_CHECK(err);
 }
 
 inline void SyncedMemory::to_cpu() {
-	switch (head_) {
-		case UNINITIALIZED:
-			gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
-					size_, NULL, NULL);
-			cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
-					(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
-					size_,
-					0, NULL, NULL, NULL);
-			memset(cpu_ptr_, 0, size_);
-			head_ = HEAD_AT_CPU;
-			own_cpu_data_ = true;
-			break;
-		case HEAD_AT_GPU: {
+  switch (head_) {
+  case UNINITIALIZED:
+    gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+        size_, NULL, NULL);
+    cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
+        (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_,
+        0, NULL, NULL, NULL);
+    memset(cpu_ptr_, 0, size_);
+    head_ = HEAD_AT_CPU;
+    own_cpu_data_ = true;
+    break;
+  case HEAD_AT_GPU: {
 #ifndef CPU_ONLY
-			if (cpu_ptr_ == NULL) {
-				gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context,
-						CL_MEM_ALLOC_HOST_PTR, size_, NULL, NULL);
-				cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
-						(cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
-						size_, 0, NULL, NULL, NULL);
-				own_cpu_data_ = true;
-			}
-			OCL_CHECK(
-					clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_,
-							(cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
-			clFinish(amdDevice.CommandQueue);
-			head_ = SYNCED;
+    if (cpu_ptr_ == NULL) {
+      gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+          size_, NULL, NULL);
+      cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
+          (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+          size_, 0, NULL, NULL, NULL);
+      own_cpu_data_ = true;
+    }
+    OCL_CHECK(
+        clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_,
+            (cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
+    head_ = SYNCED;
 #else
-			NO_GPU;
+    NO_GPU;
 #endif
-			break;
-		}
-		case HEAD_AT_CPU:
-			case SYNCED:
-			break;
-	}
+    break;
+  }
+  case HEAD_AT_CPU:
+  case SYNCED:
+    break;
+  }
 }
 
 inline void SyncedMemory::to_gpu() {
 #ifndef CPU_ONLY
-	switch (head_) {
-		case UNINITIALIZED: {
-			cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-					size_, NULL, NULL);
-			if (NULL == tmpMem) {
-				fprintf(stderr, "Failed to create memory object\n");
-				break;
-			}
-			ocl_memset(oclmem_kernel, tmpMem, (int) 0, (int) (size_ / sizeof(int)));
-			gpu_ptr_ = (void*) tmpMem;
-			head_ = HEAD_AT_GPU;
-			break;
-		}
-		case HEAD_AT_CPU: {
-			if (gpu_ptr_ == NULL) {
-				cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-						size_, NULL, NULL);
-				if (NULL == tmpMem) {
-					fprintf(stderr, "Failed to create memory object\n");
-				}
-				gpu_ptr_ = (void*) tmpMem;
-			}
-			OCL_CHECK(
-					clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
-							(cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
-			clFinish(amdDevice.CommandQueue);
-			head_ = SYNCED;
-			break;
-		}
-		case HEAD_AT_GPU:
-			case SYNCED:
-			break;
-	}
+  switch (head_) {
+  case UNINITIALIZED: {
+    cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_,
+        NULL, NULL);
+    if (NULL == tmpMem) {
+      fprintf(stderr, "Failed to create memory object\n");
+      break;
+    }
+    ocl_memset(oclmem_kernel, tmpMem, (int) 0, (int) (size_ / sizeof(int)));
+    gpu_ptr_ = (void*) tmpMem;
+    head_ = HEAD_AT_GPU;
+    break;
+  }
+  case HEAD_AT_CPU: {
+    if (gpu_ptr_ == NULL) {
+      cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+          size_, NULL, NULL);
+      if (NULL == tmpMem) {
+        fprintf(stderr, "Failed to create memory object\n");
+      }
+      gpu_ptr_ = (void*) tmpMem;
+    }
+    OCL_CHECK(
+        clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+            (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
+    head_ = SYNCED;
+    break;
+  }
+  case HEAD_AT_GPU:
+  case SYNCED:
+    break;
+  }
 #else
-	NO_GPU;
+  NO_GPU;
 #endif
 }
 
 const void* SyncedMemory::cpu_data() {
-	to_cpu();
-	return (const void*) cpu_ptr_;
+  to_cpu();
+  return (const void*) cpu_ptr_;
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
-	CHECK(data);
-	if (own_cpu_data_) {
-		CaffeFreeHost (cpu_ptr_);
-	}
-	cpu_ptr_ = data;
-	head_ = HEAD_AT_CPU;
-	own_cpu_data_ = false;
+  CHECK(data);
+  if (own_cpu_data_) {
+    CaffeFreeHost (cpu_ptr_);
+  }
+  cpu_ptr_ = data;
+  head_ = HEAD_AT_CPU;
+  own_cpu_data_ = false;
 }
 
 const void* SyncedMemory::gpu_data() {
 #ifndef CPU_ONLY
-	to_gpu();
-	return (const void*) gpu_ptr_;
+  to_gpu();
+  return (const void*) gpu_ptr_;
 #else
-	NO_GPU;
+  NO_GPU;
 #endif
 }
 
 void* SyncedMemory::mutable_cpu_data() {
-	to_cpu();
-	head_ = HEAD_AT_CPU;
-	return cpu_ptr_;
+  to_cpu();
+  head_ = HEAD_AT_CPU;
+  return cpu_ptr_;
 }
 
 void* SyncedMemory::mutable_gpu_data() {
 #ifndef CPU_ONLY
-	to_gpu();
-	head_ = HEAD_AT_GPU;
-	return gpu_ptr_;
+  to_gpu();
+  head_ = HEAD_AT_GPU;
+  return gpu_ptr_;
 #else
-	NO_GPU;
+  NO_GPU;
 #endif
 }
 
 const void *SyncedMemory::gpu_cache_data() {
-	return 0;
+  return 0;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp
index 4c0ce04e..2dcf0e5a 100644
--- a/src/caffe/util/benchmark.cpp
+++ b/src/caffe/util/benchmark.cpp
@@ -6,114 +6,111 @@
 namespace caffe {
 
 Timer::Timer()
-:
-		initted_(false),
-				running_(false),
-				has_run_at_least_once_(false) {
-	Init();
+    : initted_(false), running_(false), has_run_at_least_once_(false) {
+  Init();
 }
 
 Timer::~Timer() {
 }
 
 void Timer::Start() {
-	if (!running()) {
-		start_cpu_ = boost::posix_time::microsec_clock::local_time();
-		running_ = true;
-		has_run_at_least_once_ = true;
-	}
+  if (!running()) {
+    start_cpu_ = boost::posix_time::microsec_clock::local_time();
+    running_ = true;
+    has_run_at_least_once_ = true;
+  }
 }
 
 void Timer::Stop() {
-	if (running()) {
-		stop_cpu_ = boost::posix_time::microsec_clock::local_time();
-		running_ = false;
-	}
+  if (running()) {
+    stop_cpu_ = boost::posix_time::microsec_clock::local_time();
+    running_ = false;
+  }
 }
 
 float Timer::MicroSeconds() {
-	if (!has_run_at_least_once()) {
-		LOG(WARNING) << "Timer has never been run before reading time.";
-		return 0;
-	}
-	if (running()) {
-		Stop();
-	}
-
-	elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
-	return elapsed_microseconds_;
+  if (!has_run_at_least_once()) {
+    LOG(WARNING) << "Timer has never been run before reading time.";
+    return 0;
+  }
+  if (running()) {
+    Stop();
+  }
+
+  elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
+  return elapsed_microseconds_;
 }
 
 float Timer::MilliSeconds() {
-	if (!has_run_at_least_once()) {
-		LOG(WARNING) << "Timer has never been run before reading time.";
-		return 0;
-	}
-	if (running()) {
-		Stop();
-	}
-
-	elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
-	return elapsed_milliseconds_;
+  if (!has_run_at_least_once()) {
+    LOG(WARNING) << "Timer has never been run before reading time.";
+    return 0;
+  }
+  if (running()) {
+    Stop();
+  }
+
+  elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
+  return elapsed_milliseconds_;
 }
 
 float Timer::Seconds() {
-	return MilliSeconds() / 1000.;
+  return MilliSeconds() / 1000.;
 }
 
 void Timer::Init() {
-	if (!initted()) {
-		if (Caffe::mode() == Caffe::GPU) {
-		}
-		initted_ = true;
-	}
+  if (!initted()) {
+    if (Caffe::mode() == Caffe::GPU) {
+    }
+    initted_ = true;
+  }
 }
 
 CPUTimer::CPUTimer() {
-	this->initted_ = true;
-	this->running_ = false;
-	this->has_run_at_least_once_ = false;
+  this->initted_ = true;
+  this->running_ = false;
+  this->has_run_at_least_once_ = false;
 }
 
 void CPUTimer::Start() {
-	if (!running()) {
-		this->start_cpu_ = boost::posix_time::microsec_clock::local_time();
-		this->running_ = true;
-		this->has_run_at_least_once_ = true;
-	}
+  if (!running()) {
+    this->start_cpu_ = boost::posix_time::microsec_clock::local_time();
+    this->running_ = true;
+    this->has_run_at_least_once_ = true;
+  }
 }
 
 void CPUTimer::Stop() {
-	if (running()) {
-		this->stop_cpu_ = boost::posix_time::microsec_clock::local_time();
-		this->running_ = false;
-	}
+  if (running()) {
+    this->stop_cpu_ = boost::posix_time::microsec_clock::local_time();
+    this->running_ = false;
+  }
 }
 
 float CPUTimer::MilliSeconds() {
-	if (!has_run_at_least_once()) {
-		LOG(WARNING) << "Timer has never been run before reading time.";
-		return 0;
-	}
-	if (running()) {
-		Stop();
-	}
-	this->elapsed_milliseconds_ = (this->stop_cpu_ -
-			this->start_cpu_).total_milliseconds();
-	return this->elapsed_milliseconds_;
+  if (!has_run_at_least_once()) {
+    LOG(WARNING) << "Timer has never been run before reading time.";
+    return 0;
+  }
+  if (running()) {
+    Stop();
+  }
+  this->elapsed_milliseconds_ =
+      (this->stop_cpu_ - this->start_cpu_).total_milliseconds();
+  return this->elapsed_milliseconds_;
 }
 
 float CPUTimer::MicroSeconds() {
-	if (!has_run_at_least_once()) {
-		LOG(WARNING) << "Timer has never been run before reading time.";
-		return 0;
-	}
-	if (running()) {
-		Stop();
-	}
-	this->elapsed_microseconds_ = (this->stop_cpu_ -
-			this->start_cpu_).total_microseconds();
-	return this->elapsed_microseconds_;
+  if (!has_run_at_least_once()) {
+    LOG(WARNING) << "Timer has never been run before reading time.";
+    return 0;
+  }
+  if (running()) {
+    Stop();
+  }
+  this->elapsed_microseconds_ =
+      (this->stop_cpu_ - this->start_cpu_).total_microseconds();
+  return this->elapsed_microseconds_;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/cudnn.cpp b/src/caffe/util/cudnn.cpp
index 43492ce7..592017c5 100644
--- a/src/caffe/util/cudnn.cpp
+++ b/src/caffe/util/cudnn.cpp
@@ -2,22 +2,22 @@
 #include "caffe/util/cudnn.hpp"
 
 namespace caffe {
-	namespace cudnn {
+  namespace cudnn {
 
-		float dataType<float>::oneval = 1.0;
-		float dataType<float>::zeroval = 0.0;
-		const void* dataType<float>::one =
-		static_cast<void *>(&dataType<float>::oneval);
-		const void* dataType<float>::zero =
-		static_cast<void *>(&dataType<float>::zeroval);
+    float dataType<float>::oneval = 1.0;
+    float dataType<float>::zeroval = 0.0;
+    const void* dataType<float>::one =
+    static_cast<void *>(&dataType<float>::oneval);
+    const void* dataType<float>::zero =
+    static_cast<void *>(&dataType<float>::zeroval);
 
-		double dataType<double>::oneval = 1.0;
-		double dataType<double>::zeroval = 0.0;
-		const void* dataType<double>::one =
-		static_cast<void *>(&dataType<double>::oneval);
-		const void* dataType<double>::zero =
-		static_cast<void *>(&dataType<double>::zeroval);
+    double dataType<double>::oneval = 1.0;
+    double dataType<double>::zeroval = 0.0;
+    const void* dataType<double>::one =
+    static_cast<void *>(&dataType<double>::oneval);
+    const void* dataType<double>::zero =
+    static_cast<void *>(&dataType<double>::zeroval);
 
-	}  // namespace cudnn
+  }  // namespace cudnn
 }  // namespace caffe
 #endif
diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp
index 50d8cbf7..fd4de1bf 100644
--- a/src/caffe/util/db.cpp
+++ b/src/caffe/util/db.cpp
@@ -8,24 +8,24 @@ namespace caffe {
 namespace db {
 
 DB* GetDB(DataParameter::DB backend) {
-	switch (backend) {
-		case DataParameter_DB_LEVELDB:
-			return new LevelDB();
-		case DataParameter_DB_LMDB:
-			return new LMDB();
-		default:
-			LOG(FATAL) << "Unknown database backend";
-	}
+  switch (backend) {
+  case DataParameter_DB_LEVELDB:
+    return new LevelDB();
+  case DataParameter_DB_LMDB:
+    return new LMDB();
+  default:
+    LOG(FATAL) << "Unknown database backend";
+  }
 }
 
 DB* GetDB(const string& backend) {
-	if (backend == "leveldb") {
-		return new LevelDB();
-	} else if (backend == "lmdb") {
-		return new LMDB();
-	} else {
-		LOG(FATAL) << "Unknown database backend";
-	}
+  if (backend == "leveldb") {
+    return new LevelDB();
+  } else if (backend == "lmdb") {
+    return new LMDB();
+  } else {
+    LOG(FATAL) << "Unknown database backend";
+  }
 }
 
 }  // namespace db
diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp
index d8adce8a..d8eac5f7 100644
--- a/src/caffe/util/db_leveldb.cpp
+++ b/src/caffe/util/db_leveldb.cpp
@@ -6,16 +6,16 @@ namespace caffe {
 namespace db {
 
 void LevelDB::Open(const string& source, Mode mode) {
-	leveldb::Options options;
-	options.block_size = 65536;
-	options.write_buffer_size = 268435456;
-	options.max_open_files = 100;
-	options.error_if_exists = mode == NEW;
-	options.create_if_missing = mode != READ;
-	leveldb::Status status = leveldb::DB::Open(options, source, &db_);
-	CHECK(status.ok()) << "Failed to open leveldb " << source
-			<< std::endl << status.ToString();
-	LOG(INFO) << "Opened leveldb " << source;
+  leveldb::Options options;
+  options.block_size = 65536;
+  options.write_buffer_size = 268435456;
+  options.max_open_files = 100;
+  options.error_if_exists = mode == NEW;
+  options.create_if_missing = mode != READ;
+  leveldb::Status status = leveldb::DB::Open(options, source, &db_);
+  CHECK(status.ok()) << "Failed to open leveldb " << source << std::endl
+      << status.ToString();
+  LOG(INFO) << "Opened leveldb " << source;
 }
 
 }  // namespace db
diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp
index bc1a0da1..126b3790 100644
--- a/src/caffe/util/db_lmdb.cpp
+++ b/src/caffe/util/db_lmdb.cpp
@@ -10,42 +10,42 @@ namespace db {
 const size_t LMDB_MAP_SIZE = 1099511627776;  // 1 TB
 
 void LMDB::Open(const string& source, Mode mode) {
-	MDB_CHECK(mdb_env_create(&mdb_env_));
-	MDB_CHECK (mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));
-	if(mode == NEW) {
-		CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed";
-	}
-	int flags = 0;
-	if (mode == READ) {
-		flags = MDB_RDONLY | MDB_NOTLS;
-	}
-	MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
-	LOG(INFO) << "Opened lmdb " << source;
+  MDB_CHECK(mdb_env_create(&mdb_env_));
+  MDB_CHECK (mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));if
+(  mode == NEW) {
+    CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed";
+  }
+  int flags = 0;
+  if (mode == READ) {
+    flags = MDB_RDONLY | MDB_NOTLS;
+  }
+  MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
+  LOG(INFO) << "Opened lmdb " << source;
 }
 
 LMDBCursor* LMDB::NewCursor() {
-	MDB_txn* mdb_txn;
-	MDB_cursor* mdb_cursor;
-	MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn));
-	MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_));
-	MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor));
-	return new LMDBCursor(mdb_txn, mdb_cursor);
+  MDB_txn* mdb_txn;
+  MDB_cursor* mdb_cursor;
+  MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn));
+  MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_));
+  MDB_CHECK(mdb_cursor_open(mdb_txn, mdb_dbi_, &mdb_cursor));
+  return new LMDBCursor(mdb_txn, mdb_cursor);
 }
 
 LMDBTransaction* LMDB::NewTransaction() {
-	MDB_txn* mdb_txn;
-	MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn));
-	MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_));
-	return new LMDBTransaction(&mdb_dbi_, mdb_txn);
+  MDB_txn* mdb_txn;
+  MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn));
+  MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi_));
+  return new LMDBTransaction(&mdb_dbi_, mdb_txn);
 }
 
 void LMDBTransaction::Put(const string& key, const string& value) {
-	MDB_val mdb_key, mdb_value;
-	mdb_key.mv_data = const_cast<char*>(key.data());
-	mdb_key.mv_size = key.size();
-	mdb_value.mv_data = const_cast<char*>(value.data());
-	mdb_value.mv_size = value.size();
-	MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0));
+  MDB_val mdb_key, mdb_value;
+  mdb_key.mv_data = const_cast<char*>(key.data());
+  mdb_key.mv_size = key.size();
+  mdb_value.mv_data = const_cast<char*>(value.data());
+  mdb_value.mv_size = value.size();
+  MDB_CHECK(mdb_put(mdb_txn_, *mdb_dbi_, &mdb_key, &mdb_value, 0));
 }
 
 }  // namespace db
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 886ac85b..25349d26 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -37,350 +37,334 @@ namespace caffe {
 template <typename dtype> extern std::string get_dtype_suffix();
 
 template <typename Dtype>
-void im2col_cpu(const Dtype* data_im, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		Dtype* data_col) {
-	int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-	int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-	int channels_col = channels * kernel_h * kernel_w;
-	for (int c = 0; c < channels_col; ++c) {
-		int w_offset = c % kernel_w;
-		int h_offset = (c / kernel_w) % kernel_h;
-		int c_im = c / kernel_h / kernel_w;
-		for (int h = 0; h < height_col; ++h) {
-			for (int w = 0; w < width_col; ++w) {
-				int h_pad = h * stride_h - pad_h + h_offset;
-				int w_pad = w * stride_w - pad_w + w_offset;
-				if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-					data_col[(c * height_col + h) * width_col + w] =
-							data_im[(c_im * height + h_pad) * width + w_pad];
-				else
-					data_col[(c * height_col + h) * width_col + w] = 0;
-			}
-		}
-	}
+void im2col_cpu(const Dtype* data_im, const int channels, const int height,
+    const int width, const int kernel_h, const int kernel_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) {
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int channels_col = channels * kernel_h * kernel_w;
+  for (int c = 0; c < channels_col; ++c) {
+    int w_offset = c % kernel_w;
+    int h_offset = (c / kernel_w) % kernel_h;
+    int c_im = c / kernel_h / kernel_w;
+    for (int h = 0; h < height_col; ++h) {
+      for (int w = 0; w < width_col; ++w) {
+        int h_pad = h * stride_h - pad_h + h_offset;
+        int w_pad = w * stride_w - pad_w + w_offset;
+        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+          data_col[(c * height_col + h) * width_col + w] = data_im[(c_im
+              * height + h_pad) * width + w_pad];
+        else
+          data_col[(c * height_col + h) * width_col + w] = 0;
+      }
+    }
+  }
 }
 
 template void im2col_cpu<float>(const float* data_im, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, float* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    float* data_col);
 template void im2col_cpu<double>(const double* data_im, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, double* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    double* data_col);
 
 template <typename Dtype>
-void col2im_cpu(const Dtype* data_col, const int channels,
-		const int height, const int width, const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		Dtype* data_im) {
-	caffe_set(height * width * channels, Dtype(0), data_im);
-	int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-	int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-	int channels_col = channels * patch_h * patch_w;
-	for (int c = 0; c < channels_col; ++c) {
-		int w_offset = c % patch_w;
-		int h_offset = (c / patch_w) % patch_h;
-		int c_im = c / patch_h / patch_w;
-		for (int h = 0; h < height_col; ++h) {
-			for (int w = 0; w < width_col; ++w) {
-				int h_pad = h * stride_h - pad_h + h_offset;
-				int w_pad = w * stride_w - pad_w + w_offset;
-				if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-					data_im[(c_im * height + h_pad) * width + w_pad] +=
-							data_col[(c * height_col + h) * width_col + w];
-			}
-		}
-	}
+void col2im_cpu(const Dtype* data_col, const int channels, const int height,
+    const int width, const int patch_h, const int patch_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) {
+  caffe_set(height * width * channels, Dtype(0), data_im);
+  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int channels_col = channels * patch_h * patch_w;
+  for (int c = 0; c < channels_col; ++c) {
+    int w_offset = c % patch_w;
+    int h_offset = (c / patch_w) % patch_h;
+    int c_im = c / patch_h / patch_w;
+    for (int h = 0; h < height_col; ++h) {
+      for (int w = 0; w < width_col; ++w) {
+        int h_pad = h * stride_h - pad_h + h_offset;
+        int w_pad = w * stride_w - pad_w + w_offset;
+        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+          data_im[(c_im * height + h_pad) * width + w_pad] += data_col[(c
+              * height_col + h) * width_col + w];
+      }
+    }
+  }
 }
 
 template void col2im_cpu<float>(const float* data_col, const int channels,
-		const int height, const int width, const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, float* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    float* data_im);
 template void col2im_cpu<double>(const double* data_col, const int channels,
-		const int height, const int width, const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, double* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    double* data_im);
 
 template <typename Dtype>
 void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_im, const int img_offset, int optnum) {
-	std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	int height_col = (height + 2 * pad - ksize) / stride + 1;
-	int width_col = (width + 2 * pad - ksize) / stride + 1;
-	int num_kernels = channels * height * width;
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, Dtype* data_im, const int img_offset,
+    int optnum) {
+  std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  int height_col = (height + 2 * pad - ksize) / stride + 1;
+  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int num_kernels = channels * height * width;
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void col2im_gpu_opt<float>(const float* data_col, const int col_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, float* data_im, const int img_offset, int optnum);
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, float* data_im, const int img_offset,
+    int optnum);
 template void col2im_gpu_opt<double>(const double* data_col,
-		const int col_offset, const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, double* data_im, const int img_offset, int optnum);
+    const int col_offset, const int channels, const int height, const int width,
+    const int ksize, const int pad, const int stride, double* data_im,
+    const int img_offset, int optnum);
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		Dtype* data_col, const int col_offset)
-		{
-	std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-	int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-	int num_kernels = channels * height_col * width_col;
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &kernel_h);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_w);
-
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad_h);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_w);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_h);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_w);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &height_col);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &width_col);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &data_col);
-	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &col_offset);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    Dtype* data_col, const int col_offset) {
+  std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int num_kernels = channels * height_col * width_col;
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &kernel_h);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_w);
+
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad_h);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_w);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_h);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_w);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &height_col);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &width_col);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &data_col);
+  ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &col_offset);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 
 }
 
 template void im2col_gpu<float>(const float* data_im, const int img_offset,
-		const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-		float* data_col, const int col_offset);
+    const int channels, const int height, const int width, const int kernel_h,
+    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_col, const int col_offset);
 template void im2col_gpu<double>(const double* data_im, const int img_offset,
-		const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-		double* data_col, const int col_offset);
+    const int channels, const int height, const int width, const int kernel_h,
+    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_col, const int col_offset);
 
 template <typename Dtype>
-void col2im_gpu(const Dtype* data_col, const int col_offset,
-		const int height, const int width, const int channels,
-		const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		Dtype* data_im, const int img_offset)
-		{
-	std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-	int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-	int num_kernels = channels * height * width;
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &patch_h);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &patch_w);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col);
-	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im);
-	ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+void col2im_gpu(const Dtype* data_col, const int col_offset, const int height,
+    const int width, const int channels, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    Dtype* data_im, const int img_offset) {
+  std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int num_kernels = channels * height * width;
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &patch_h);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &patch_w);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col);
+  ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im);
+  ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void col2im_gpu<float>(const float* data_col, const int col_offset,
-		const int height, const int width, const int channels,
-		const int patch_h, const int patch_w, const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w, float* data_im,
-		const int img_offset);
+    const int height, const int width, const int channels, const int patch_h,
+    const int patch_w, const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, float* data_im, const int img_offset);
 template void col2im_gpu<double>(const double* data_col, const int col_offset,
-		const int height, const int width, const int channels,
-		const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-		double* data_im, const int img_offset);
+    const int height, const int width, const int channels, const int patch_h,
+    const int patch_w, const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, double* data_im, const int img_offset);
 
 template <typename Dtype>
 void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_col, const int col_offset) {
-
-	int height_col = (height + 2 * pad - ksize) / stride + 1;
-	int width_col = (width + 2 * pad - ksize) / stride + 1;
-	int num_kernels = channels * height_col * width_col;
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &ksize);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pad);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &stride);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &height_col);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &width_col);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &data_col);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &col_offset);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-	clFinish(amdDevice.CommandQueue);
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, Dtype* data_col, const int col_offset) {
+
+  int height_col = (height + 2 * pad - ksize) / stride + 1;
+  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int num_kernels = channels * height_col * width_col;
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &ksize);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pad);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &stride);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &height_col);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &width_col);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &data_col);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &col_offset);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+  clFinish(amdDevice.CommandQueue);
 }
 
 template void im2col_gpu<float>(cl_kernel Kernel, const float* data_im,
-		const int img_offset, const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, float* data_col, const int col_offset);
+    const int img_offset, const int channels, const int height, const int width,
+    const int ksize, const int pad, const int stride, float* data_col,
+    const int col_offset);
 template void im2col_gpu<double>(cl_kernel Kernel, const double* data_im,
-		const int img_offset, const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, double* data_col, const int col_offset);
+    const int img_offset, const int channels, const int height, const int width,
+    const int ksize, const int pad, const int stride, double* data_col,
+    const int col_offset);
 
 template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_col, const int col_offset, int optnum) {
-
-	std::string kernel_name = "im2col_opt" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	int height_col = (height + 2 * pad - ksize) / stride + 1;
-	int width_col = (width + 2 * pad - ksize) / stride + 1;
-	int num_kernels = optnum * channels * height_col * width_col;
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_col);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &col_offset);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-	size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, Dtype* data_col, const int col_offset,
+    int optnum) {
+
+  std::string kernel_name = "im2col_opt" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  int height_col = (height + 2 * pad - ksize) / stride + 1;
+  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int num_kernels = optnum * channels * height_col * width_col;
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_col);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &col_offset);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+  size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void im2col_gpu_opt<float>(const float* data_im, const int img_offset,
-		const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, float* data_col, const int col_offset, int optnum);
+    const int channels, const int height, const int width, const int ksize,
+    const int pad, const int stride, float* data_col, const int col_offset,
+    int optnum);
 template void im2col_gpu_opt<double>(const double* data_im,
-		const int img_offset, const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, double* data_col, const int col_offset, int optnum);
+    const int img_offset, const int channels, const int height, const int width,
+    const int ksize, const int pad, const int stride, double* data_col,
+    const int col_offset, int optnum);
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
-		const int height, const int width, const int ksize, const int pad,
-		const int stride, Dtype* data_im, const int img_offset) {
-	std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	int height_col = (height + 2 * pad - ksize) / stride + 1;
-	int width_col = (width + 2 * pad - ksize) / stride + 1;
-	int num_kernels = channels * height * width;
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int height, const int width, const int ksize, const int pad,
+    const int stride, Dtype* data_im, const int img_offset) {
+  std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  int height_col = (height + 2 * pad - ksize) / stride + 1;
+  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int num_kernels = channels * height * width;
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void col2im_gpu<float>(const float* data_col, const int col_offset,
-		const int channels,
-		const int height, const int width, const int psize, const int pad,
-		const int stride, float* data_im, const int img_offset);
+    const int channels, const int height, const int width, const int psize,
+    const int pad, const int stride, float* data_im, const int img_offset);
 template void col2im_gpu<double>(const double* data_col, const int col_offset,
-		const int channels,
-		const int height, const int width, const int psize, const int pad,
-		const int stride, double* data_im, const int img_offset);
+    const int channels, const int height, const int width, const int psize,
+    const int pad, const int stride, double* data_im, const int img_offset);
 
 }  // namespace caffe
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index 6435427e..0848017a 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -10,124 +10,121 @@ namespace caffe {
 
 template <typename Dtype>
 __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		const int height_col, const int width_col,
-		Dtype* data_col) {
-	CUDA_KERNEL_LOOP(index, n) {
-		int w_out = index % width_col;
-		int h_index = index / width_col;
-		int h_out = h_index % height_col;
-		int channel_in = h_index / height_col;
-		int channel_out = channel_in * kernel_h * kernel_w;
-		int h_in = h_out * stride_h - pad_h;
-		int w_in = w_out * stride_w - pad_w;
-		Dtype* data_col_ptr = data_col;
-		data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
-		const Dtype* data_im_ptr = data_im;
-		data_im_ptr += (channel_in * height + h_in) * width + w_in;
-		for (int i = 0; i < kernel_h; ++i) {
-			for (int j = 0; j < kernel_w; ++j) {
-				int h = h_in + i;
-				int w = w_in + j;
-				*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
-				data_im_ptr[i * width + j] : 0;
-				data_col_ptr += height_col * width_col;
-			}
-		}
-	}
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    Dtype* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    int w_out = index % width_col;
+    int h_index = index / width_col;
+    int h_out = h_index % height_col;
+    int channel_in = h_index / height_col;
+    int channel_out = channel_in * kernel_h * kernel_w;
+    int h_in = h_out * stride_h - pad_h;
+    int w_in = w_out * stride_w - pad_w;
+    Dtype* data_col_ptr = data_col;
+    data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+    const Dtype* data_im_ptr = data_im;
+    data_im_ptr += (channel_in * height + h_in) * width + w_in;
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        int h = h_in + i;
+        int w = w_in + j;
+        *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+        data_im_ptr[i * width + j] : 0;
+        data_col_ptr += height_col * width_col;
+      }
+    }
+  }
 }
 
 template <typename Dtype>
-void im2col_gpu(const Dtype* data_im, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		Dtype* data_col) {
-	// We are going to launch channels * height_col * width_col kernels, each
-	// kernel responsible for copying a single-channel grid.
-	int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-	int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-	int num_kernels = channels * height_col * width_col;
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
-	CAFFE_CUDA_NUM_THREADS>>>(
-			num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
-			pad_w, stride_h, stride_w, height_col,
-			width_col, data_col);
-	CUDA_POST_KERNEL_CHECK;
+void im2col_gpu(const Dtype* data_im, const int channels, const int height,
+    const int width, const int kernel_h, const int kernel_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) {
+  // We are going to launch channels * height_col * width_col kernels, each
+  // kernel responsible for copying a single-channel grid.
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int num_kernels = channels * height_col * width_col;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
+  CAFFE_CUDA_NUM_THREADS>>>(
+      num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
+      pad_w, stride_h, stride_w, height_col,
+      width_col, data_col);
+  CUDA_POST_KERNEL_CHECK;
 }
 
 // Explicit instantiation
 template void im2col_gpu<float>(const float* data_im, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-		float* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    float* data_col);
 template void im2col_gpu<double>(const double* data_im, const int channels,
-		const int height, const int width, const int kernel_h, const int kernel_w,
-		const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-		double* data_col);
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    double* data_col);
 
 template <typename Dtype>
 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
-		const int height, const int width, const int channels,
-		const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w,
-		const int stride_h, const int stride_w,
-		const int height_col, const int width_col,
-		Dtype* data_im) {
-	CUDA_KERNEL_LOOP(index, n) {
-		Dtype val = 0;
-		int w = index % width + pad_w;
-		int h = (index / width) % height + pad_h;
-		int c = index / (width * height);
-		// compute the start and end of the output
-		int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-		int w_col_end = min(w / stride_w + 1, width_col);
-		int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-		int h_col_end = min(h / stride_h + 1, height_col);
-		// equivalent implementation
-		int offset =
-		(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
-		int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
-		int coeff_w_col = (1 - stride_w * height_col * width_col);
-		for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-			for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-				val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-			}
-		}
-		data_im[index] = val;
-	}
+    const int height, const int width, const int channels,
+    const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    Dtype* data_im) {
+  CUDA_KERNEL_LOOP(index, n) {
+    Dtype val = 0;
+    int w = index % width + pad_w;
+    int h = (index / width) % height + pad_h;
+    int c = index / (width * height);
+    // compute the start and end of the output
+    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+    int w_col_end = min(w / stride_w + 1, width_col);
+    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+    int h_col_end = min(h / stride_h + 1, height_col);
+    // equivalent implementation
+    int offset =
+    (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
+    int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+    int coeff_w_col = (1 - stride_w * height_col * width_col);
+    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+      }
+    }
+    data_im[index] = val;
+  }
 }
 
 template <typename Dtype>
-void col2im_gpu(const Dtype* data_col, const int channels,
-		const int height, const int width, const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, Dtype* data_im) {
-	int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-	int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-	int num_kernels = channels * height * width;
-	// To avoid involving atomic operations, we will launch one kernel per
-	// bottom dimension, and then in the kernel add up the top dimensions.
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
-	CAFFE_CUDA_NUM_THREADS>>>(
-			num_kernels, data_col, height, width, channels, patch_h, patch_w,
-			pad_h, pad_w, stride_h, stride_w,
-			height_col, width_col, data_im);
-	CUDA_POST_KERNEL_CHECK;
+void col2im_gpu(const Dtype* data_col, const int channels, const int height,
+    const int width, const int patch_h, const int patch_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) {
+  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int num_kernels = channels * height * width;
+  // To avoid involving atomic operations, we will launch one kernel per
+  // bottom dimension, and then in the kernel add up the top dimensions.
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
+  CAFFE_CUDA_NUM_THREADS>>>(
+      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      pad_h, pad_w, stride_h, stride_w,
+      height_col, width_col, data_im);
+  CUDA_POST_KERNEL_CHECK;
 }
 
 // Explicit instantiation
 template void col2im_gpu<float>(const float* data_col, const int channels,
-		const int height, const int width, const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, float* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    float* data_im);
 template void col2im_gpu<double>(const double* data_col, const int channels,
-		const int height, const int width, const int patch_h, const int patch_w,
-		const int pad_h, const int pad_w, const int stride_h,
-		const int stride_w, double* data_im);
+    const int height, const int width, const int patch_h, const int patch_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    double* data_im);
 
 }  // namespace caffe
diff --git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp
index 299d1fd0..7974b0ea 100644
--- a/src/caffe/util/insert_splits.cpp
+++ b/src/caffe/util/insert_splits.cpp
@@ -10,135 +10,136 @@
 namespace caffe {
 
 void InsertSplits(const NetParameter& param, NetParameter* param_split) {
-	// Initialize by copying from the input NetParameter.
-	param_split->CopyFrom(param);
-	param_split->clear_layer();
-	map<string, pair<int, int> > blob_name_to_last_top_idx;
-	map<pair<int, int>, pair<int, int> > bottom_idx_to_source_top_idx;
-	map<pair<int, int>, int> top_idx_to_bottom_count;
-	map<pair<int, int>, float> top_idx_to_loss_weight;
-	map<pair<int, int>, int> top_idx_to_bottom_split_idx;
-	map<int, string> layer_idx_to_layer_name;
-	layer_idx_to_layer_name[-1] = "input";
-	// Determine the number of times each blob is used as an input (bottom) blob.
-	for (int i = 0; i < param.input_size(); ++i) {
-		const string& blob_name = param.input(i);
-		blob_name_to_last_top_idx[blob_name] = make_pair(-1, i);
-	}
-	for (int i = 0; i < param.layer_size(); ++i) {
-		const LayerParameter& layer_param = param.layer(i);
-		layer_idx_to_layer_name[i] = layer_param.name();
-		for (int j = 0; j < layer_param.bottom_size(); ++j) {
-			const string& blob_name = layer_param.bottom(j);
-			if (blob_name_to_last_top_idx.find(blob_name) ==
-					blob_name_to_last_top_idx.end()) {
-				LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
-			}
-			const pair<int, int>& bottom_idx = make_pair(i, j);
-			const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
-			bottom_idx_to_source_top_idx[bottom_idx] = top_idx;
-			++top_idx_to_bottom_count[top_idx];
-		}
-		for (int j = 0; j < layer_param.top_size(); ++j) {
-			const string& blob_name = layer_param.top(j);
-			blob_name_to_last_top_idx[blob_name] = make_pair(i, j);
-		}
-		// A use of a top blob as a loss should be handled similarly to the use of
-		// a top blob as an input (bottom) blob to another layer.
-		const int last_loss =
-				std::min(layer_param.loss_weight_size(), layer_param.top_size());
-		for (int j = 0; j < last_loss; ++j) {
-			const string& blob_name = layer_param.top(j);
-			const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
-			top_idx_to_loss_weight[top_idx] = layer_param.loss_weight(j);
-			if (top_idx_to_loss_weight[top_idx]) {
-				++top_idx_to_bottom_count[top_idx];
-			}
-		}
-	}
-	// Create split layer for any input blobs used by other layer as bottom
-	// blobs more than once.
-	for (int i = 0; i < param.input_size(); ++i) {
-		const int split_count = top_idx_to_bottom_count[make_pair(-1, i)];
-		if (split_count > 1) {
-			const string& layer_name = layer_idx_to_layer_name[-1];
-			const string& blob_name = param.input(i);
-			LayerParameter* split_layer_param = param_split->add_layer();
-			const float kZeroLossWeight = 0;
-			ConfigureSplitLayer(layer_name, blob_name, i, split_count,
-					kZeroLossWeight, split_layer_param);
-		}
-	}
-	for (int i = 0; i < param.layer_size(); ++i) {
-		LayerParameter* layer_param = param_split->add_layer();
-		layer_param->CopyFrom(param.layer(i));
-		// Replace any shared bottom blobs with split layer outputs.
-		for (int j = 0; j < layer_param->bottom_size(); ++j) {
-			const pair<int, int>& top_idx =
-					bottom_idx_to_source_top_idx[make_pair(i, j)];
-			const int split_count = top_idx_to_bottom_count[top_idx];
-			if (split_count > 1) {
-				const string& layer_name = layer_idx_to_layer_name[top_idx.first];
-				const string& blob_name = layer_param->bottom(j);
-				layer_param->set_bottom(j, SplitBlobName(layer_name,
-						blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++));
-			}
-		}
-		// Create split layer for any top blobs used by other layer as bottom
-		// blobs more than once.
-		for (int j = 0; j < layer_param->top_size(); ++j) {
-			const pair<int, int>& top_idx = make_pair(i, j);
-			const int split_count = top_idx_to_bottom_count[top_idx];
-			if (split_count > 1) {
-				const string& layer_name = layer_idx_to_layer_name[i];
-				const string& blob_name = layer_param->top(j);
-				LayerParameter* split_layer_param = param_split->add_layer();
-				const float loss_weight = top_idx_to_loss_weight[top_idx];
-				ConfigureSplitLayer(layer_name, blob_name, j, split_count,
-						loss_weight, split_layer_param);
-				if (loss_weight) {
-					layer_param->clear_loss_weight();
-					top_idx_to_bottom_split_idx[top_idx]++;
-				}
-			}
-		}
-	}
+  // Initialize by copying from the input NetParameter.
+  param_split->CopyFrom(param);
+  param_split->clear_layer();
+  map<string, pair<int, int> > blob_name_to_last_top_idx;
+  map<pair<int, int>, pair<int, int> > bottom_idx_to_source_top_idx;
+  map<pair<int, int>, int> top_idx_to_bottom_count;
+  map<pair<int, int>, float> top_idx_to_loss_weight;
+  map<pair<int, int>, int> top_idx_to_bottom_split_idx;
+  map<int, string> layer_idx_to_layer_name;
+  layer_idx_to_layer_name[-1] = "input";
+  // Determine the number of times each blob is used as an input (bottom) blob.
+  for (int i = 0; i < param.input_size(); ++i) {
+    const string& blob_name = param.input(i);
+    blob_name_to_last_top_idx[blob_name] = make_pair(-1, i);
+  }
+  for (int i = 0; i < param.layer_size(); ++i) {
+    const LayerParameter& layer_param = param.layer(i);
+    layer_idx_to_layer_name[i] = layer_param.name();
+    for (int j = 0; j < layer_param.bottom_size(); ++j) {
+      const string& blob_name = layer_param.bottom(j);
+      if (blob_name_to_last_top_idx.find(blob_name)
+          == blob_name_to_last_top_idx.end()) {
+        LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
+      }
+      const pair<int, int>& bottom_idx = make_pair(i, j);
+      const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
+      bottom_idx_to_source_top_idx[bottom_idx] = top_idx;
+      ++top_idx_to_bottom_count[top_idx];
+    }
+    for (int j = 0; j < layer_param.top_size(); ++j) {
+      const string& blob_name = layer_param.top(j);
+      blob_name_to_last_top_idx[blob_name] = make_pair(i, j);
+    }
+    // A use of a top blob as a loss should be handled similarly to the use of
+    // a top blob as an input (bottom) blob to another layer.
+    const int last_loss = std::min(layer_param.loss_weight_size(),
+        layer_param.top_size());
+    for (int j = 0; j < last_loss; ++j) {
+      const string& blob_name = layer_param.top(j);
+      const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
+      top_idx_to_loss_weight[top_idx] = layer_param.loss_weight(j);
+      if (top_idx_to_loss_weight[top_idx]) {
+        ++top_idx_to_bottom_count[top_idx];
+      }
+    }
+  }
+  // Create split layer for any input blobs used by other layer as bottom
+  // blobs more than once.
+  for (int i = 0; i < param.input_size(); ++i) {
+    const int split_count = top_idx_to_bottom_count[make_pair(-1, i)];
+    if (split_count > 1) {
+      const string& layer_name = layer_idx_to_layer_name[-1];
+      const string& blob_name = param.input(i);
+      LayerParameter* split_layer_param = param_split->add_layer();
+      const float kZeroLossWeight = 0;
+      ConfigureSplitLayer(layer_name, blob_name, i, split_count,
+          kZeroLossWeight, split_layer_param);
+    }
+  }
+  for (int i = 0; i < param.layer_size(); ++i) {
+    LayerParameter* layer_param = param_split->add_layer();
+    layer_param->CopyFrom(param.layer(i));
+    // Replace any shared bottom blobs with split layer outputs.
+    for (int j = 0; j < layer_param->bottom_size(); ++j) {
+      const pair<int, int>& top_idx = bottom_idx_to_source_top_idx[make_pair(i,
+          j)];
+      const int split_count = top_idx_to_bottom_count[top_idx];
+      if (split_count > 1) {
+        const string& layer_name = layer_idx_to_layer_name[top_idx.first];
+        const string& blob_name = layer_param->bottom(j);
+        layer_param->set_bottom(j,
+            SplitBlobName(layer_name, blob_name, top_idx.second,
+                top_idx_to_bottom_split_idx[top_idx]++));
+      }
+    }
+    // Create split layer for any top blobs used by other layer as bottom
+    // blobs more than once.
+    for (int j = 0; j < layer_param->top_size(); ++j) {
+      const pair<int, int>& top_idx = make_pair(i, j);
+      const int split_count = top_idx_to_bottom_count[top_idx];
+      if (split_count > 1) {
+        const string& layer_name = layer_idx_to_layer_name[i];
+        const string& blob_name = layer_param->top(j);
+        LayerParameter* split_layer_param = param_split->add_layer();
+        const float loss_weight = top_idx_to_loss_weight[top_idx];
+        ConfigureSplitLayer(layer_name, blob_name, j, split_count, loss_weight,
+            split_layer_param);
+        if (loss_weight) {
+          layer_param->clear_loss_weight();
+          top_idx_to_bottom_split_idx[top_idx]++;
+        }
+      }
+    }
+  }
 }
 
 void ConfigureSplitLayer(const string& layer_name, const string& blob_name,
-		const int blob_idx, const int split_count, const float loss_weight,
-		LayerParameter* split_layer_param) {
-	split_layer_param->Clear();
-	split_layer_param->add_bottom(blob_name);
-	split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx));
-	split_layer_param->set_type("Split");
-	for (int k = 0; k < split_count; ++k) {
-		split_layer_param->add_top(
-				SplitBlobName(layer_name, blob_name, blob_idx, k));
-		if (loss_weight) {
-			if (k == 0) {
-				split_layer_param->add_loss_weight(loss_weight);
-			} else {
-				split_layer_param->add_loss_weight(0);
-			}
-		}
-	}
+    const int blob_idx, const int split_count, const float loss_weight,
+    LayerParameter* split_layer_param) {
+  split_layer_param->Clear();
+  split_layer_param->add_bottom(blob_name);
+  split_layer_param->set_name(SplitLayerName(layer_name, blob_name, blob_idx));
+  split_layer_param->set_type("Split");
+  for (int k = 0; k < split_count; ++k) {
+    split_layer_param->add_top(
+        SplitBlobName(layer_name, blob_name, blob_idx, k));
+    if (loss_weight) {
+      if (k == 0) {
+        split_layer_param->add_loss_weight(loss_weight);
+      } else {
+        split_layer_param->add_loss_weight(0);
+      }
+    }
+  }
 }
 
 string SplitLayerName(const string& layer_name, const string& blob_name,
-		const int blob_idx) {
-	ostringstream split_layer_name;
-	split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx
-			<< "_split";
-	return split_layer_name.str();
+    const int blob_idx) {
+  ostringstream split_layer_name;
+  split_layer_name << blob_name << "_" << layer_name << "_" << blob_idx
+      << "_split";
+  return split_layer_name.str();
 }
 
 string SplitBlobName(const string& layer_name, const string& blob_name,
-		const int blob_idx, const int split_idx) {
-	ostringstream split_blob_name;
-	split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx
-			<< "_split_" << split_idx;
-	return split_blob_name.str();
+    const int blob_idx, const int split_idx) {
+  ostringstream split_blob_name;
+  split_blob_name << blob_name << "_" << layer_name << "_" << blob_idx
+      << "_split_" << split_idx;
+  return split_blob_name.str();
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index 63dcf312..09824880 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -30,277 +30,271 @@ using google::protobuf::io::CodedOutputStream;
 using google::protobuf::Message;
 
 bool ReadProtoFromTextFile(const char* filename, Message* proto) {
-	int fd = open(filename, O_RDONLY);
-	CHECK_NE(fd, -1) << "File not found: " << filename;
-	FileInputStream* input = new FileInputStream(fd);
-	bool success = google::protobuf::TextFormat::Parse(input, proto);
-	delete input;
-	close(fd);
-	return success;
+  int fd = open(filename, O_RDONLY);
+  CHECK_NE(fd, -1) << "File not found: " << filename;
+  FileInputStream* input = new FileInputStream(fd);
+  bool success = google::protobuf::TextFormat::Parse(input, proto);
+  delete input;
+  close(fd);
+  return success;
 }
 
 void WriteProtoToTextFile(const Message& proto, const char* filename) {
-	int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-	FileOutputStream* output = new FileOutputStream(fd);
-	CHECK(google::protobuf::TextFormat::Print(proto, output));
-	delete output;
-	close(fd);
+  int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+  FileOutputStream* output = new FileOutputStream(fd);
+  CHECK(google::protobuf::TextFormat::Print(proto, output));
+  delete output;
+  close(fd);
 }
 
 bool ReadProtoFromBinaryFile(const char* filename, Message* proto) {
-	int fd = open(filename, O_RDONLY);
-	CHECK_NE(fd, -1) << "File not found: " << filename;
-	ZeroCopyInputStream* raw_input = new FileInputStream(fd);
-	CodedInputStream* coded_input = new CodedInputStream(raw_input);
-	coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912);
+  int fd = open(filename, O_RDONLY);
+  CHECK_NE(fd, -1) << "File not found: " << filename;
+  ZeroCopyInputStream* raw_input = new FileInputStream(fd);
+  CodedInputStream* coded_input = new CodedInputStream(raw_input);
+  coded_input->SetTotalBytesLimit(kProtoReadBytesLimit, 536870912);
 
-	bool success = proto->ParseFromCodedStream(coded_input);
+  bool success = proto->ParseFromCodedStream(coded_input);
 
-	delete coded_input;
-	delete raw_input;
-	close(fd);
-	return success;
+  delete coded_input;
+  delete raw_input;
+  close(fd);
+  return success;
 }
 
 void WriteProtoToBinaryFile(const Message& proto, const char* filename) {
-	fstream output(filename, ios::out | ios::trunc | ios::binary);
-	CHECK(proto.SerializeToOstream(&output));
+  fstream output(filename, ios::out | ios::trunc | ios::binary);
+  CHECK(proto.SerializeToOstream(&output));
 }
 
-cv::Mat ReadImageToCVMat(const string& filename,
-		const int height, const int width, const bool is_color) {
-	cv::Mat cv_img;
-	int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
-																	CV_LOAD_IMAGE_GRAYSCALE);
-	cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag);
-	if (!cv_img_origin.data) {
-		LOG(ERROR) << "Could not open or find file " << filename;
-		return cv_img_origin;
-	}
-	if (height > 0 && width > 0) {
-		cv::resize(cv_img_origin, cv_img, cv::Size(width, height));
-	} else {
-		cv_img = cv_img_origin;
-	}
-	return cv_img;
+cv::Mat ReadImageToCVMat(const string& filename, const int height,
+    const int width, const bool is_color) {
+  cv::Mat cv_img;
+  int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
+  cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag);
+  if (!cv_img_origin.data) {
+    LOG(ERROR) << "Could not open or find file " << filename;
+    return cv_img_origin;
+  }
+  if (height > 0 && width > 0) {
+    cv::resize(cv_img_origin, cv_img, cv::Size(width, height));
+  } else {
+    cv_img = cv_img_origin;
+  }
+  return cv_img;
 }
 
-cv::Mat ReadImageToCVMat(const string& filename,
-		const int height, const int width) {
-	return ReadImageToCVMat(filename, height, width, true);
+cv::Mat ReadImageToCVMat(const string& filename, const int height,
+    const int width) {
+  return ReadImageToCVMat(filename, height, width, true);
 }
 
-cv::Mat ReadImageToCVMat(const string& filename,
-		const bool is_color) {
-	return ReadImageToCVMat(filename, 0, 0, is_color);
+cv::Mat ReadImageToCVMat(const string& filename, const bool is_color) {
+  return ReadImageToCVMat(filename, 0, 0, is_color);
 }
 
 cv::Mat ReadImageToCVMat(const string& filename) {
-	return ReadImageToCVMat(filename, 0, 0, true);
+  return ReadImageToCVMat(filename, 0, 0, true);
 }
 // Do the file extension and encoding match?
-static bool matchExt(const std::string & fn,
-		std::string en) {
-	size_t p = fn.rfind('.');
-	std::string ext = p != fn.npos ? fn.substr(p) : fn;
-	std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
-	std::transform(en.begin(), en.end(), en.begin(), ::tolower);
-	if (ext == en)
-		return true;
-	if (en == "jpg" && ext == "jpeg")
-		return true;
-	return false;
+static bool matchExt(const std::string & fn, std::string en) {
+  size_t p = fn.rfind('.');
+  std::string ext = p != fn.npos ? fn.substr(p) : fn;
+  std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+  std::transform(en.begin(), en.end(), en.begin(), ::tolower);
+  if (ext == en)
+    return true;
+  if (en == "jpg" && ext == "jpeg")
+    return true;
+  return false;
 }
-bool ReadImageToDatum(const string& filename, const int label,
-		const int height, const int width, const bool is_color,
-		const std::string & encoding, Datum* datum) {
-	cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color);
-	if (cv_img.data) {
-		if (encoding.size()) {
-			if ((cv_img.channels() == 3) == is_color && !height && !width &&
-					matchExt(filename, encoding))
-				return ReadFileToDatum(filename, label, datum);
-			std::vector < uchar > buf;
-			cv::imencode("." + encoding, cv_img, buf);
-			datum->set_data(std::string(reinterpret_cast<char*>(&buf[0]),
-					buf.size()));
-			datum->set_label(label);
-			datum->set_encoded(true);
-			return true;
-		}
-		CVMatToDatum(cv_img, datum);
-		datum->set_label(label);
-		return true;
-	} else {
-		return false;
-	}
+bool ReadImageToDatum(const string& filename, const int label, const int height,
+    const int width, const bool is_color, const std::string & encoding,
+    Datum* datum) {
+  cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color);
+  if (cv_img.data) {
+    if (encoding.size()) {
+      if ((cv_img.channels() == 3) == is_color && !height && !width
+          && matchExt(filename, encoding))
+        return ReadFileToDatum(filename, label, datum);
+      std::vector < uchar > buf;
+      cv::imencode("." + encoding, cv_img, buf);
+      datum->set_data(
+          std::string(reinterpret_cast<char*>(&buf[0]), buf.size()));
+      datum->set_label(label);
+      datum->set_encoded(true);
+      return true;
+    }
+    CVMatToDatum(cv_img, datum);
+    datum->set_label(label);
+    return true;
+  } else {
+    return false;
+  }
 }
 
-bool ReadFileToDatum(const string& filename, const int label,
-		Datum* datum) {
-	std::streampos size;
+bool ReadFileToDatum(const string& filename, const int label, Datum* datum) {
+  std::streampos size;
 
-	fstream file(filename.c_str(), ios::in | ios::binary | ios::ate);
-	if (file.is_open()) {
-		size = file.tellg();
-		std::string buffer(size, ' ');
-		file.seekg(0, ios::beg);
-		file.read(&buffer[0], size);
-		file.close();
-		datum->set_data(buffer);
-		datum->set_label(label);
-		datum->set_encoded(true);
-		return true;
-	} else {
-		return false;
-	}
+  fstream file(filename.c_str(), ios::in | ios::binary | ios::ate);
+  if (file.is_open()) {
+    size = file.tellg();
+    std::string buffer(size, ' ');
+    file.seekg(0, ios::beg);
+    file.read(&buffer[0], size);
+    file.close();
+    datum->set_data(buffer);
+    datum->set_label(label);
+    datum->set_encoded(true);
+    return true;
+  } else {
+    return false;
+  }
 }
 
 cv::Mat DecodeDatumToCVMatNative(const Datum& datum) {
-	cv::Mat cv_img;
-	CHECK(datum.encoded()) << "Datum not encoded";
-	const string& data = datum.data();
-	std::vector<char> vec_data(data.c_str(), data.c_str() + data.size());
-	cv_img = cv::imdecode(vec_data, -1);
-	if (!cv_img.data) {
-		LOG(ERROR) << "Could not decode datum ";
-	}
-	return cv_img;
+  cv::Mat cv_img;
+  CHECK(datum.encoded()) << "Datum not encoded";
+  const string& data = datum.data();
+  std::vector<char> vec_data(data.c_str(), data.c_str() + data.size());
+  cv_img = cv::imdecode(vec_data, -1);
+  if (!cv_img.data) {
+    LOG(ERROR) << "Could not decode datum ";
+  }
+  return cv_img;
 }
 cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color) {
-	cv::Mat cv_img;
-	CHECK(datum.encoded()) << "Datum not encoded";
-	const string& data = datum.data();
-	std::vector<char> vec_data(data.c_str(), data.c_str() + data.size());
-	int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR :
-																	CV_LOAD_IMAGE_GRAYSCALE);
-	cv_img = cv::imdecode(vec_data, cv_read_flag);
-	if (!cv_img.data) {
-		LOG(ERROR) << "Could not decode datum ";
-	}
-	return cv_img;
+  cv::Mat cv_img;
+  CHECK(datum.encoded()) << "Datum not encoded";
+  const string& data = datum.data();
+  std::vector<char> vec_data(data.c_str(), data.c_str() + data.size());
+  int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
+  cv_img = cv::imdecode(vec_data, cv_read_flag);
+  if (!cv_img.data) {
+    LOG(ERROR) << "Could not decode datum ";
+  }
+  return cv_img;
 }
 
 // If Datum is encoded will decoded using DecodeDatumToCVMat and CVMatToDatum
 // If Datum is not encoded will do nothing
 bool DecodeDatumNative(Datum* datum) {
-	if (datum->encoded()) {
-		cv::Mat cv_img = DecodeDatumToCVMatNative((*datum));
-		CVMatToDatum(cv_img, datum);
-		return true;
-	} else {
-		return false;
-	}
+  if (datum->encoded()) {
+    cv::Mat cv_img = DecodeDatumToCVMatNative((*datum));
+    CVMatToDatum(cv_img, datum);
+    return true;
+  } else {
+    return false;
+  }
 }
 bool DecodeDatum(Datum* datum, bool is_color) {
-	if (datum->encoded()) {
-		cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color);
-		CVMatToDatum(cv_img, datum);
-		return true;
-	} else {
-		return false;
-	}
+  if (datum->encoded()) {
+    cv::Mat cv_img = DecodeDatumToCVMat((*datum), is_color);
+    CVMatToDatum(cv_img, datum);
+    return true;
+  } else {
+    return false;
+  }
 }
 
 void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) {
-	CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";
-	datum->set_channels(cv_img.channels());
-	datum->set_height(cv_img.rows);
-	datum->set_width(cv_img.cols);
-	datum->clear_data();
-	datum->clear_float_data();
-	datum->set_encoded(false);
-	int datum_channels = datum->channels();
-	int datum_height = datum->height();
-	int datum_width = datum->width();
-	int datum_size = datum_channels * datum_height * datum_width;
-	std::string buffer(datum_size, ' ');
-	for (int h = 0; h < datum_height; ++h) {
-		const uchar* ptr = cv_img.ptr < uchar > (h);
-		int img_index = 0;
-		for (int w = 0; w < datum_width; ++w) {
-			for (int c = 0; c < datum_channels; ++c) {
-				int datum_index = (c * datum_height + h) * datum_width + w;
-				buffer[datum_index] = static_cast<char>(ptr[img_index++]);
-			}
-		}
-	}
-	datum->set_data(buffer);
+  CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";
+  datum->set_channels(cv_img.channels());
+  datum->set_height(cv_img.rows);
+  datum->set_width(cv_img.cols);
+  datum->clear_data();
+  datum->clear_float_data();
+  datum->set_encoded(false);
+  int datum_channels = datum->channels();
+  int datum_height = datum->height();
+  int datum_width = datum->width();
+  int datum_size = datum_channels * datum_height * datum_width;
+  std::string buffer(datum_size, ' ');
+  for (int h = 0; h < datum_height; ++h) {
+    const uchar* ptr = cv_img.ptr < uchar > (h);
+    int img_index = 0;
+    for (int w = 0; w < datum_width; ++w) {
+      for (int c = 0; c < datum_channels; ++c) {
+        int datum_index = (c * datum_height + h) * datum_width + w;
+        buffer[datum_index] = static_cast<char>(ptr[img_index++]);
+      }
+    }
+  }
+  datum->set_data(buffer);
 }
 
 // Verifies format of data stored in HDF5 file and reshapes blob accordingly.
 template <typename Dtype>
-void hdf5_load_nd_dataset_helper(
-		hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-		Blob<Dtype>* blob) {
-	// Verify that the dataset exists.
-	CHECK(H5LTfind_dataset(file_id, dataset_name_))
-			<< "Failed to find HDF5 dataset " << dataset_name_;
-	// Verify that the number of dimensions is in the accepted range.
-	herr_t status;
-	int ndims;
-	status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims);
-	CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_;
-	CHECK_GE(ndims, min_dim);
-	CHECK_LE(ndims, max_dim);
+void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_,
+    int min_dim, int max_dim, Blob<Dtype>* blob) {
+  // Verify that the dataset exists.
+  CHECK(H5LTfind_dataset(file_id, dataset_name_))
+      << "Failed to find HDF5 dataset " << dataset_name_;
+  // Verify that the number of dimensions is in the accepted range.
+  herr_t status;
+  int ndims;
+  status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims);
+  CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_;
+  CHECK_GE(ndims, min_dim);
+  CHECK_LE(ndims, max_dim);
 
-	// Verify that the data format is what we expect: float or double.
-	std::vector < hsize_t > dims(ndims);
-	H5T_class_t class_;
-	status = H5LTget_dataset_info(
-			file_id, dataset_name_, dims.data(), &class_, NULL);
-	CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
-	CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data";
+  // Verify that the data format is what we expect: float or double.
+  std::vector < hsize_t > dims(ndims);
+  H5T_class_t class_;
+  status = H5LTget_dataset_info(file_id, dataset_name_, dims.data(), &class_,
+      NULL);
+  CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
+  CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data";
 
-	vector<int> blob_dims(dims.size());
-	for (int i = 0; i < dims.size(); ++i) {
-		blob_dims[i] = dims[i];
-	}
-	blob->Reshape(blob_dims);
+  vector<int> blob_dims(dims.size());
+  for (int i = 0; i < dims.size(); ++i) {
+    blob_dims[i] = dims[i];
+  }
+  blob->Reshape(blob_dims);
 }
 
 template <>
 void hdf5_load_nd_dataset<float>(hid_t file_id, const char* dataset_name_,
-		int min_dim, int max_dim, Blob<float>* blob) {
-	hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
-	herr_t status = H5LTread_dataset_float(
-			file_id, dataset_name_, blob->mutable_cpu_data());
-	CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_;
+    int min_dim, int max_dim, Blob<float>* blob) {
+  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
+  herr_t status = H5LTread_dataset_float(file_id, dataset_name_,
+      blob->mutable_cpu_data());
+  CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_;
 }
 
 template <>
 void hdf5_load_nd_dataset<double>(hid_t file_id, const char* dataset_name_,
-		int min_dim, int max_dim, Blob<double>* blob) {
-	hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
-	herr_t status = H5LTread_dataset_double(
-			file_id, dataset_name_, blob->mutable_cpu_data());
-	CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_;
+    int min_dim, int max_dim, Blob<double>* blob) {
+  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
+  herr_t status = H5LTread_dataset_double(file_id, dataset_name_,
+      blob->mutable_cpu_data());
+  CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_;
 }
 
 template <>
-void hdf5_save_nd_dataset<float>(
-		const hid_t file_id, const string& dataset_name, const Blob<float>& blob) {
-	hsize_t dims[HDF5_NUM_DIMS];
-	dims[0] = blob.num();
-	dims[1] = blob.channels();
-	dims[2] = blob.height();
-	dims[3] = blob.width();
-	herr_t status = H5LTmake_dataset_float(
-			file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
-	CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
+void hdf5_save_nd_dataset<float>(const hid_t file_id,
+    const string& dataset_name, const Blob<float>& blob) {
+  hsize_t dims[HDF5_NUM_DIMS];
+  dims[0] = blob.num();
+  dims[1] = blob.channels();
+  dims[2] = blob.height();
+  dims[3] = blob.width();
+  herr_t status = H5LTmake_dataset_float(file_id, dataset_name.c_str(),
+      HDF5_NUM_DIMS, dims, blob.cpu_data());
+  CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
 }
 
 template <>
-void hdf5_save_nd_dataset<double>(
-		const hid_t file_id, const string& dataset_name, const Blob<double>& blob) {
-	hsize_t dims[HDF5_NUM_DIMS];
-	dims[0] = blob.num();
-	dims[1] = blob.channels();
-	dims[2] = blob.height();
-	dims[3] = blob.width();
-	herr_t status = H5LTmake_dataset_double(
-			file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
-	CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
+void hdf5_save_nd_dataset<double>(const hid_t file_id,
+    const string& dataset_name, const Blob<double>& blob) {
+  hsize_t dims[HDF5_NUM_DIMS];
+  dims[0] = blob.num();
+  dims[1] = blob.channels();
+  dims[2] = blob.height();
+  dims[3] = blob.width();
+  herr_t status = H5LTmake_dataset_double(file_id, dataset_name.c_str(),
+      HDF5_NUM_DIMS, dims, blob.cpu_data());
+  CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 3275d75c..0dfb1107 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -42,319 +42,308 @@ namespace caffe {
 
 template <>
 void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const float alpha, const float* A, const float* B, const float beta,
-		float* C) {
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
-			ldb, beta, C, N);
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
+      beta, C, N);
 }
 
 template <>
 void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const double alpha, const double* A, const double* B, const double beta,
-		double* C) {
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
-			ldb, beta, C, N);
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
+      beta, C, N);
 }
 
 template <>
 void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const float alpha, const float* A, const float* B, const float beta,
-		float* C) {
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	clblasTranspose transB =
-			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	int ldc = N;
-	//AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
-	CLBLAS_CHECK(
-			clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-					(cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C,
-					0,
-					ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  clblasTranspose transB =
+      (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
+  CLBLAS_CHECK(
+      clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+          (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C,
+          0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template <>
 void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const double alpha, const double* A, const double* B, const double beta,
-		double* C) {
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	clblasTranspose transB =
-			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	int ldc = N;
-	CLBLAS_CHECK(
-			clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-					(cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C,
-					0,
-					ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  clblasTranspose transB =
+      (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  CLBLAS_CHECK(
+      clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+          (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C,
+          0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template <>
 cl_event caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const float alpha, const float* A, const int offA, const float* B,
-		const int offB, const float beta, float* C, const int offC) {
-	cl_event event;
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	clblasTranspose transB =
-			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	int ldc = N;
-	CLBLAS_CHECK(
-			clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-					(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
-					(cl_mem) C,
-					offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event));
-	return event;
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A, const int offA, const float* B,
+    const int offB, const float beta, float* C, const int offC) {
+  cl_event event;
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  clblasTranspose transB =
+      (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  CLBLAS_CHECK(
+      clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+          (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+          (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL,
+          &event));
+  return event;
 }
 
 template <>
 cl_event caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const double alpha, const double* A, const int offA, const double* B,
-		const int offB, const double beta, double* C, const int offC) {
-	cl_event event;
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	clblasTranspose transB =
-			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	int ldc = N;
-	CLBLAS_CHECK(
-			clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-					(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
-					(cl_mem) C,
-					offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, &event));
-	return event;
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const double alpha, const double* A, const int offA, const double* B,
+    const int offB, const double beta, double* C, const int offC) {
+  cl_event event;
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  clblasTranspose transB =
+      (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  CLBLAS_CHECK(
+      clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+          (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+          (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL,
+          &event));
+  return event;
 }
 
 template <>
 cl_event caffe_gpu_gemm<float>(cl_command_queue *queue,
-		const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const float alpha, const float* A, const int offA, const float* B,
-		const int offB, const float beta, float* C, const int offC) {
-	cl_event event;
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	clblasTranspose transB =
-			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	int ldc = N;
-	CLBLAS_CHECK(
-			clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-					(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
-					(cl_mem) C,
-					offC, ldc, 1, queue, 0, NULL, &event));
-	return event;
+    const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M,
+    const int N, const int K, const float alpha, const float* A, const int offA,
+    const float* B, const int offB, const float beta, float* C,
+    const int offC) {
+  cl_event event;
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  clblasTranspose transB =
+      (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  CLBLAS_CHECK(
+      clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+          (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+          (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event));
+  return event;
 }
 
 template <>
 cl_event caffe_gpu_gemm<double>(cl_command_queue *queue,
-		const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const double alpha, const double* A, const int offA, const double* B,
-		const int offB, const double beta, double* C, const int offC) {
-	cl_event event;
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	clblasTranspose transB =
-			(TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	int ldc = N;
-	CLBLAS_CHECK(
-			clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-					(cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
-					(cl_mem) C,
-					offC, ldc, 1, queue, 0, NULL, &event));
-	return event;
+    const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M,
+    const int N, const int K, const double alpha, const double* A,
+    const int offA, const double* B, const int offB, const double beta,
+    double* C, const int offC) {
+  cl_event event;
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  clblasTranspose transB =
+      (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  CLBLAS_CHECK(
+      clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
+          (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+          (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event));
+  return event;
 }
 
 template <>
 void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const float alpha, const float* A, const float* x,
-		const float beta, float* y) {
-	cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+    const int N, const float alpha, const float* A, const float* x,
+    const float beta, float* y) {
+  cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
 template <>
 void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const double alpha, const double* A, const double* x,
-		const double beta, double* y) {
-	cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+    const int N, const double alpha, const double* A, const double* x,
+    const double beta, double* y) {
+  cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
 template <>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const float alpha, const float* A, size_t offA, int lda,
-		const float* x, size_t offx, const float beta, int incx,
-		float* y, size_t offy, int incy) {
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA,
-			M, N, (cl_float) alpha, (cl_mem) A, offA, lda,
-			(cl_mem) x, offx, incx, (cl_float) beta,
-			(cl_mem) y, offy, incy,
-			1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+    const int N, const float alpha, const float* A, size_t offA, int lda,
+    const float* x, size_t offx, const float beta, int incx, float* y,
+    size_t offy, int incy) {
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  CLBLAS_CHECK(
+      clblasSgemv(amdDevice.row, transA, M, N, (cl_float) alpha, (cl_mem) A,
+          offA, lda, (cl_mem) x, offx, incx, (cl_float) beta, (cl_mem) y, offy,
+          incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template <>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const double alpha, const double* A, size_t offA, int lda,
-		const double* x, size_t offx, const double beta, int incx,
-		double* y, size_t offy, int incy) {
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	CLBLAS_CHECK(
-			clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A,
-					offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy,
-					incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+    const int N, const double alpha, const double* A, size_t offA, int lda,
+    const double* x, size_t offx, const double beta, int incx, double* y,
+    size_t offy, int incy) {
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  CLBLAS_CHECK(
+      clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A,
+          offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy,
+          incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 
 }
 
 template <>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const float alpha, const float* A, const float* x,
-		const float beta, float* y) {
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	CLBLAS_CHECK(clblasSgemv(amdDevice.row, transA,
-			M, N, (cl_float) alpha, (cl_mem) A, 0, N,
-			(cl_mem) x, 0, 1, (cl_float) beta,
-			(cl_mem) y, 0, 1,
-			1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+    const int N, const float alpha, const float* A, const float* x,
+    const float beta, float* y) {
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  CLBLAS_CHECK(
+      clblasSgemv(amdDevice.row, transA, M, N, (cl_float) alpha, (cl_mem) A, 0,
+          N, (cl_mem) x, 0, 1, (cl_float) beta, (cl_mem) y, 0, 1, 1,
+          &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template <>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const double alpha, const double* A, const double* x,
-		const double beta, double* y) {
-	clblasTranspose transA =
-			(TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
-	CLBLAS_CHECK(
-			clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0,
-					N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1,
-					&(amdDevice.CommandQueue), 0, NULL, NULL));
+    const int N, const double alpha, const double* A, const double* x,
+    const double beta, double* y) {
+  clblasTranspose transA =
+      (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans;
+  CLBLAS_CHECK(
+      clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0,
+          N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1,
+          &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template <>
 void caffe_axpy<float>(const int N, const float alpha, const float* X,
-		float* Y) {
-	cblas_saxpy(N, alpha, X, 1, Y, 1);
+    float* Y) {
+  cblas_saxpy(N, alpha, X, 1, Y, 1);
 }
 
 template <>
 void caffe_axpy<double>(const int N, const double alpha, const double* X,
-		double* Y) {
-	cblas_daxpy(N, alpha, X, 1, Y, 1);
+    double* Y) {
+  cblas_daxpy(N, alpha, X, 1, Y, 1);
 }
 
 template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
-		float* Y) {
-	CLBLAS_CHECK(
-			clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-					&(amdDevice.CommandQueue), 0, NULL, NULL));
+    float* Y) {
+  CLBLAS_CHECK(
+      clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+          &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template <>
 void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
-		double* Y) {
-	CLBLAS_CHECK(
-			clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-					&(amdDevice.CommandQueue), 0, NULL, NULL));
+    double* Y) {
+  CLBLAS_CHECK(
+      clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+          &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
 template <>
-void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y)
-		{
+void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y) {
 }
 
 template <>
-void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y)
-		{
+void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y) {
 }
 
 template <>
-void caffe_gpu_abs<float>(const int n, const float* x, float* y)
-		{
-	caffe_gpu_abs_ocl(n, x, y);
+void caffe_gpu_abs<float>(const int n, const float* x, float* y) {
+  caffe_gpu_abs_ocl(n, x, y);
 }
 
 template <>
-void caffe_gpu_abs<double>(const int n, const double* x, double* y)
-		{
-	caffe_gpu_abs_ocl(n, x, y);
+void caffe_gpu_abs<double>(const int n, const double* x, double* y) {
+  caffe_gpu_abs_ocl(n, x, y);
 }
 
 template <>
 void caffe_set(const int N, const float alpha, float* Y) {
-	if (alpha == 0) {
-		memset(Y, 0, sizeof(float) * N);
-		return;
-	}
-	for (int i = 0; i < N; ++i) {
-		Y[i] = alpha;
-	}
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(float) * N);
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
 }
 
 template <>
 void caffe_set(const int N, const double alpha, double* Y) {
-	if (alpha == 0) {
-		memset(Y, 0, sizeof(double) * N);
-		return;
-	}
-	for (int i = 0; i < N; ++i) {
-		Y[i] = alpha;
-	}
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(double) * N);
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
 }
 
 template <>
 void caffe_add_scalar(const int N, const float alpha, float* Y) {
-	for (int i = 0; i < N; ++i) {
-		Y[i] += alpha;
-	}
+  for (int i = 0; i < N; ++i) {
+    Y[i] += alpha;
+  }
 }
 
 template <>
 void caffe_add_scalar(const int N, const double alpha, double* Y) {
-	for (int i = 0; i < N; ++i) {
-		Y[i] += alpha;
-	}
+  for (int i = 0; i < N; ++i) {
+    Y[i] += alpha;
+  }
 }
 
 template <>
 void caffe_copy<float>(const int N, const float* X, float* Y) {
-	cblas_scopy(N, X, 1, Y, 1);
+  cblas_scopy(N, X, 1, Y, 1);
 }
 
 template <>
 void caffe_copy<double>(const int N, const double* X, double* Y) {
-	cblas_dcopy(N, X, 1, Y, 1);
+  cblas_dcopy(N, X, 1, Y, 1);
 }
 
 //template <typename Dtype>
-void caffe_gpu_memcpy(const size_t N, const void *X, void *Y)
-		{
-	clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0,
-			NULL, NULL);
+void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) {
+  clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0,
+      NULL, NULL);
 // OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
 }
 /*
@@ -364,179 +353,170 @@ void caffe_gpu_memcpy(const size_t N, const void *X, void *Y)
  template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
  */
 template <>
-void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y)
-		{
-	OCL_CHECK(
-			clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
-					N,
-					0, NULL, NULL));
+void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y) {
+  OCL_CHECK(
+      clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
+          N, 0, NULL, NULL));
 }
 
 template <>
-void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y)
-		{
-	OCL_CHECK(
-			clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
-					N,
-					0, NULL, NULL));
+void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y) {
+  OCL_CHECK(
+      clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
+          N, 0, NULL, NULL));
 }
 
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
-	if (X != Y) {
-		CLBLAS_CHECK(
-				clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-						&(amdDevice.CommandQueue), 0, NULL, NULL));
-	}
+  if (X != Y) {
+    CLBLAS_CHECK(
+        clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+            &(amdDevice.CommandQueue), 0, NULL, NULL));
+  }
 }
 
 template <>
 void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
-	if (X != Y) {
-		CLBLAS_CHECK(
-				clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-						&(amdDevice.CommandQueue), 0, NULL, NULL));
-	}
+  if (X != Y) {
+    CLBLAS_CHECK(
+        clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+            &(amdDevice.CommandQueue), 0, NULL, NULL));
+  }
 }
 
 template <>
 void caffe_scal<float>(const int N, const float alpha, float *X) {
-	cblas_sscal(N, alpha, X, 1);
+  cblas_sscal(N, alpha, X, 1);
 }
 
 template <>
 void caffe_scal<double>(const int N, const double alpha, double *X) {
-	cblas_dscal(N, alpha, X, 1);
+  cblas_dscal(N, alpha, X, 1);
 }
 
 template <>
 void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
-	CLBLAS_CHECK(
-			clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
-					NULL, NULL));
+  CLBLAS_CHECK(
+      clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+          NULL, NULL));
 }
 
 template <>
 void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
-	CLBLAS_CHECK(
-			clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
-					NULL, NULL));
+  CLBLAS_CHECK(
+      clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+          NULL, NULL));
 }
 
 template <>
 void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
-		const float beta, float* Y) {
-	caffe_gpu_scal<float>(N, beta, Y);
-	caffe_gpu_axpy<float>(N, alpha, X, Y);
+    const float beta, float* Y) {
+  caffe_gpu_scal<float>(N, beta, Y);
+  caffe_gpu_axpy<float>(N, alpha, X, Y);
 }
 
 template <>
 void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
-		const double beta, double* Y) {
-	caffe_gpu_scal<double>(N, beta, Y);
-	caffe_gpu_axpy<double>(N, alpha, X, Y);
+    const double beta, double* Y) {
+  caffe_gpu_scal<double>(N, beta, Y);
+  caffe_gpu_axpy<double>(N, alpha, X, Y);
 }
 
 template <>
 void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
-		const float beta, float* Y) {
-	cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+    const float beta, float* Y) {
+  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
 void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
-		const double beta, double* Y) {
-	cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+    const double beta, double* Y) {
+  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
 }
 
 template <>
-void caffe_add<float>(const int n, const float* a, const float* b,
-		float* y) {
-	vsAdd(n, a, b, y);
+void caffe_add<float>(const int n, const float* a, const float* b, float* y) {
+  vsAdd(n, a, b, y);
 }
 
 template <>
 void caffe_add<double>(const int n, const double* a, const double* b,
-		double* y) {
-	vdAdd(n, a, b, y);
+    double* y) {
+  vdAdd(n, a, b, y);
 }
 
 template <>
-void caffe_sub<float>(const int n, const float* a, const float* b,
-		float* y) {
-	vsSub(n, a, b, y);
+void caffe_sub<float>(const int n, const float* a, const float* b, float* y) {
+  vsSub(n, a, b, y);
 }
 
 template <>
 void caffe_sub<double>(const int n, const double* a, const double* b,
-		double* y) {
-	vdSub(n, a, b, y);
+    double* y) {
+  vdSub(n, a, b, y);
 }
 
 template <>
-void caffe_mul<float>(const int n, const float* a, const float* b,
-		float* y) {
-	vsMul(n, a, b, y);
+void caffe_mul<float>(const int n, const float* a, const float* b, float* y) {
+  vsMul(n, a, b, y);
 }
 
 template <>
 void caffe_mul<double>(const int n, const double* a, const double* b,
-		double* y) {
-	vdMul(n, a, b, y);
+    double* y) {
+  vdMul(n, a, b, y);
 }
 
 template <>
-void caffe_div<float>(const int n, const float* a, const float* b,
-		float* y) {
-	vsDiv(n, a, b, y);
+void caffe_div<float>(const int n, const float* a, const float* b, float* y) {
+  vsDiv(n, a, b, y);
 }
 
 template <>
 void caffe_div<double>(const int n, const double* a, const double* b,
-		double* y) {
-	vdDiv(n, a, b, y);
+    double* y) {
+  vdDiv(n, a, b, y);
 }
 
 template <>
-void caffe_powx<float>(const int n, const float* a, const float b,
-		float* y) {
-	vsPowx(n, a, b, y);
+void caffe_powx<float>(const int n, const float* a, const float b, float* y) {
+  vsPowx(n, a, b, y);
 }
 
 template <>
 void caffe_powx<double>(const int n, const double* a, const double b,
-		double* y) {
-	vdPowx(n, a, b, y);
+    double* y) {
+  vdPowx(n, a, b, y);
 }
 
 template <>
 void caffe_sqr<float>(const int n, const float* a, float* y) {
-	vsSqr(n, a, y);
+  vsSqr(n, a, y);
 }
 
 template <>
 void caffe_sqr<double>(const int n, const double* a, double* y) {
-	vdSqr(n, a, y);
+  vdSqr(n, a, y);
 }
 
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
-	vsExp(n, a, y);
+  vsExp(n, a, y);
 }
 
 template <>
 void caffe_exp<double>(const int n, const double* a, double* y) {
-	vdExp(n, a, y);
+  vdExp(n, a, y);
 }
 
 unsigned int caffe_rng_rand() {
-	return (*caffe_rng())();
+  return (*caffe_rng())();
 }
 
 template <typename Dtype>
 Dtype caffe_nextafter(const Dtype b) {
-	return boost::math::nextafter < Dtype > (
-			b, std::numeric_limits < Dtype > ::max());
+  return boost::math::nextafter < Dtype
+      > (b, std::numeric_limits < Dtype > ::max());
 }
 
 template
@@ -547,62 +527,62 @@ double caffe_nextafter(const double b);
 
 template <typename Dtype>
 void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
-	CHECK_GE(n, 0);
-	CHECK(r);
-	CHECK_LE(a, b);
-	boost::uniform_real < Dtype
-			> random_distribution(a, caffe_nextafter<Dtype>(b));
-	boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> >
-	variate_generator(caffe_rng(), random_distribution);
-	for (int i = 0; i < n; ++i) {
-		r[i] = variate_generator();
-	}
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_LE(a, b);
+  boost::uniform_real < Dtype
+      > random_distribution(a, caffe_nextafter<Dtype>(b));
+  boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> > variate_generator(
+      caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
 
-	//LOG(INFO) << "caffe_rng_uniform";
+  //LOG(INFO) << "caffe_rng_uniform";
 }
 
 template
 void caffe_rng_uniform<float>(const int n, const float a, const float b,
-		float* r);
+    float* r);
 
 template
 void caffe_rng_uniform<double>(const int n, const double a, const double b,
-		double* r);
+    double* r);
 
 template <typename Dtype>
-void caffe_rng_gaussian(const int n, const Dtype a,
-		const Dtype sigma, Dtype* r) {
-	CHECK_GE(n, 0);
-	CHECK(r);
-	CHECK_GT(sigma, 0);
-	boost::normal_distribution < Dtype > random_distribution(a, sigma);
-	boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> >
-	variate_generator(caffe_rng(), random_distribution);
-	for (int i = 0; i < n; ++i) {
-		r[i] = variate_generator();
-	}
+void caffe_rng_gaussian(const int n, const Dtype a, const Dtype sigma,
+    Dtype* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GT(sigma, 0);
+  boost::normal_distribution < Dtype > random_distribution(a, sigma);
+  boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> > variate_generator(
+      caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
 }
 
 template
-void caffe_rng_gaussian<float>(const int n, const float mu,
-		const float sigma, float* r);
+void caffe_rng_gaussian<float>(const int n, const float mu, const float sigma,
+    float* r);
 
 template
 void caffe_rng_gaussian<double>(const int n, const double mu,
-		const double sigma, double* r);
+    const double sigma, double* r);
 
 template <typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
-	CHECK_GE(n, 0);
-	CHECK(r);
-	CHECK_GE(p, 0);
-	CHECK_LE(p, 1);
-	boost::bernoulli_distribution < Dtype > random_distribution(p);
-	boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
-	variate_generator(caffe_rng(), random_distribution);
-	for (int i = 0; i < n; ++i) {
-		r[i] = variate_generator();
-	}
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
+  boost::bernoulli_distribution < Dtype > random_distribution(p);
+  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> > variate_generator(
+      caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
 }
 
 template
@@ -613,16 +593,16 @@ void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
 
 template <typename Dtype>
 void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
-	CHECK_GE(n, 0);
-	CHECK(r);
-	CHECK_GE(p, 0);
-	CHECK_LE(p, 1);
-	boost::bernoulli_distribution < Dtype > random_distribution(p);
-	boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
-	variate_generator(caffe_rng(), random_distribution);
-	for (int i = 0; i < n; ++i) {
-		r[i] = static_cast<unsigned int>(variate_generator());
-	}
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
+  boost::bernoulli_distribution < Dtype > random_distribution(p);
+  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> > variate_generator(
+      caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = static_cast<unsigned int>(variate_generator());
+  }
 }
 
 template
@@ -633,104 +613,104 @@ void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
 //
 template <>
 float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
-	return cblas_sdot(n, x, 1, y, 1);
+  return cblas_sdot(n, x, 1, y, 1);
 }
 
 template <>
 double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
-	return cblas_ddot(n, x, 1, y, 1);
+  return cblas_ddot(n, x, 1, y, 1);
 }
 
 template <>
 void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
-		float* out) {
-	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			(n * sizeof(float)), NULL, NULL);
-	cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			(1 * sizeof(float)), NULL, NULL);
-	clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1,
-			&(amdDevice.CommandQueue), 0, NULL, NULL);
-	clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float),
-			out, 0, NULL, NULL);
-	clReleaseMemObject(scratchBuff);
-	clReleaseMemObject(d_out);
+    float* out) {
+  cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (n * sizeof(float)), NULL, NULL);
+  cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (1 * sizeof(float)), NULL, NULL);
+  clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1,
+      &(amdDevice.CommandQueue), 0, NULL, NULL);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float),
+      out, 0, NULL, NULL);
+  clReleaseMemObject(scratchBuff);
+  clReleaseMemObject(d_out);
 }
 
 template <>
 void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
-		double * out) {
-	//need to pass in scratchBuff
-	//AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
-	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			(n * sizeof(double)), NULL, NULL);
-	cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			(1 * sizeof(double)), NULL, NULL);
-	clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1,
-			&(amdDevice.CommandQueue), 0, NULL, NULL);
-	clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double),
-			out, 0, NULL, NULL);
-	clReleaseMemObject(scratchBuff);
-	clReleaseMemObject(d_out);
+    double * out) {
+  //need to pass in scratchBuff
+  //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+  cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (n * sizeof(double)), NULL, NULL);
+  cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (1 * sizeof(double)), NULL, NULL);
+  clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1,
+      &(amdDevice.CommandQueue), 0, NULL, NULL);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double),
+      out, 0, NULL, NULL);
+  clReleaseMemObject(scratchBuff);
+  clReleaseMemObject(d_out);
 }
 
 template <>
 int caffe_cpu_hamming_distance<float>(const int n, const float* x,
-		const float* y) {
-	int dist = 0;
-	for (int i = 0; i < n; ++i) {
-		dist += __builtin_popcount(static_cast<uint32_t>(x[i]) ^
-				static_cast<uint32_t>(y[i]));
-	}
-	return dist;
+    const float* y) {
+  int dist = 0;
+  for (int i = 0; i < n; ++i) {
+    dist += __builtin_popcount(
+        static_cast<uint32_t>(x[i]) ^ static_cast<uint32_t>(y[i]));
+  }
+  return dist;
 }
 
 template <>
 int caffe_cpu_hamming_distance<double>(const int n, const double* x,
-		const double* y) {
-	int dist = 0;
-	for (int i = 0; i < n; ++i) {
-		dist += __builtin_popcountl(static_cast<uint64_t>(x[i]) ^
-				static_cast<uint64_t>(y[i]));
-	}
-	return dist;
+    const double* y) {
+  int dist = 0;
+  for (int i = 0; i < n; ++i) {
+    dist += __builtin_popcountl(
+        static_cast<uint64_t>(x[i]) ^ static_cast<uint64_t>(y[i]));
+  }
+  return dist;
 }
 
 template <>
 float caffe_cpu_asum<float>(const int n, const float* x) {
-	return cblas_sasum(n, x, 1);
+  return cblas_sasum(n, x, 1);
 }
 
 template <>
 double caffe_cpu_asum<double>(const int n, const double* x) {
-	return cblas_dasum(n, x, 1);
+  return cblas_dasum(n, x, 1);
 }
 
 template <>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
-	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			(n * sizeof(cl_float)), NULL, NULL);
-	cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			(1 * sizeof(cl_float)), NULL, NULL);
-	clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1,
-			&(amdDevice.CommandQueue), 0, NULL, NULL);
-	clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,
-			0, NULL, NULL);
-	clReleaseMemObject(scratchBuff);
-	clReleaseMemObject(d_y);
+  cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (n * sizeof(cl_float)), NULL, NULL);
+  cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (1 * sizeof(cl_float)), NULL, NULL);
+  clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1,
+      &(amdDevice.CommandQueue), 0, NULL, NULL);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,
+      0, NULL, NULL);
+  clReleaseMemObject(scratchBuff);
+  clReleaseMemObject(d_y);
 }
 
 template <>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
-	cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			(n * sizeof(cl_double)), NULL, NULL);
-	cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			(1 * sizeof(cl_double)), NULL, NULL);
-	clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1,
-			&(amdDevice.CommandQueue), 0, NULL, NULL);
-	clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double),
-			y, 0, NULL, NULL);
-	clReleaseMemObject(scratchBuff);
-	clReleaseMemObject(d_y);
+  cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (n * sizeof(cl_double)), NULL, NULL);
+  cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (1 * sizeof(cl_double)), NULL, NULL);
+  clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1,
+      &(amdDevice.CommandQueue), 0, NULL, NULL);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double),
+      y, 0, NULL, NULL);
+  clReleaseMemObject(scratchBuff);
+  clReleaseMemObject(d_y);
 }
 
 //DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
@@ -743,30 +723,30 @@ INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs);
 
 template <>
 void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
-		float* y) {
-	cblas_scopy(n, x, 1, y, 1);
-	cblas_sscal(n, alpha, y, 1);
+    float* y) {
+  cblas_scopy(n, x, 1, y, 1);
+  cblas_sscal(n, alpha, y, 1);
 }
 
 template <>
 void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
-		double* y) {
-	cblas_dcopy(n, x, 1, y, 1);
-	cblas_dscal(n, alpha, y, 1);
+    double* y) {
+  cblas_dcopy(n, x, 1, y, 1);
+  cblas_dscal(n, alpha, y, 1);
 }
 
 template <>
 void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
-		float* y) {
-	caffe_gpu_copy(n, x, y);
-	caffe_gpu_scal(n, alpha, y);
+    float* y) {
+  caffe_gpu_copy(n, x, y);
+  caffe_gpu_scal(n, alpha, y);
 }
 
 template <>
 void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
-		double* y) {
-	caffe_gpu_copy(n, x, y);
-	caffe_gpu_scal(n, alpha, y);
+    double* y) {
+  caffe_gpu_copy(n, x, y);
+  caffe_gpu_scal(n, alpha, y);
 }
 
 template <typename Dtype>
@@ -775,114 +755,112 @@ void set_kernel(const int n, const Dtype alpha, Dtype* y) {
 
 template <>
 void caffe_gpu_set<float>(const int N, const float alpha, float* Y) {
-	ocl_memset(Y, alpha, N);
+  ocl_memset(Y, alpha, N);
 }
 
 template <>
 void caffe_gpu_set<double>(const int N, const double alpha, double* Y) {
-	ocl_memset(Y, alpha, N);
+  ocl_memset(Y, alpha, N);
 }
 
 template <>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
-	kernel_add_scalar(N, alpha, Y);
+  kernel_add_scalar(N, alpha, Y);
 }
 
 template <>
 void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
-	kernel_add_scalar(N, alpha, Y);
+  kernel_add_scalar(N, alpha, Y);
 }
 
 template <>
 void caffe_gpu_exp<float>(const int N, const float* a, float* y) {
-	kernel_exp(N, a, y);
+  kernel_exp(N, a, y);
 }
 
 template <>
 void caffe_gpu_exp<double>(const int N, const double* a, double* y) {
-	kernel_exp(N, a, y);
+  kernel_exp(N, a, y);
 }
 
 template <>
 void caffe_gpu_sign<float>(const int N, const float *X, float *Y) {
-	caffe_gpu_sign_ocl(N, X, Y);
+  caffe_gpu_sign_ocl(N, X, Y);
 }
 
 template <>
 void caffe_gpu_sign<double>(const int N, const double *X, double *Y) {
-	caffe_gpu_sign_ocl(N, X, Y);
+  caffe_gpu_sign_ocl(N, X, Y);
 }
 
 template <>
 void caffe_gpu_sub<float>(const int N, const float* a, const float* b,
-		float* y) {
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_sub(N, a, b, y);
+    float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_sub(N, a, b, y);
 }
 
 template <>
 void caffe_gpu_sub<double>(const int N, const double* a, const double* b,
-		double* y) {
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_sub(N, a, b, y);
+    double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_sub(N, a, b, y);
 }
 
 template <>
-void caffe_gpu_mul<float>(const int N, const float* a,
-		const float* b, float* y) {
-	kernel_mul(N, a, b, y);
+void caffe_gpu_mul<float>(const int N, const float* a, const float* b,
+    float* y) {
+  kernel_mul(N, a, b, y);
 }
 
 template <>
-void caffe_gpu_mul<double>(const int N, const double* a,
-		const double* b, double* y) {
-	kernel_mul(N, a, b, y);
+void caffe_gpu_mul<double>(const int N, const double* a, const double* b,
+    double* y) {
+  kernel_mul(N, a, b, y);
 }
 
 template <>
-void caffe_gpu_div<float>(const int N, const float* a,
-		const float* b, float* y) {
-	kernel_div(N, a, b, y);
+void caffe_gpu_div<float>(const int N, const float* a, const float* b,
+    float* y) {
+  kernel_div(N, a, b, y);
 }
 
 template <>
-void caffe_gpu_div<double>(const int N, const double* a,
-		const double* b, double* y) {
-	kernel_div(N, a, b, y);
+void caffe_gpu_div<double>(const int N, const double* a, const double* b,
+    double* y) {
+  kernel_div(N, a, b, y);
 }
 
 template <>
-void caffe_gpu_powx<float>(const int N, const float* a,
-		const float alpha, float* y) {
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_powx(N, a, alpha, y);
+void caffe_gpu_powx<float>(const int N, const float* a, const float alpha,
+    float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_powx(N, a, alpha, y);
 }
 
 template <>
-void caffe_gpu_powx<double>(const int N, const double* a,
-		const double alpha, double* y) {
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_powx(N, a, alpha, y);
+void caffe_gpu_powx<double>(const int N, const double* a, const double alpha,
+    double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_powx(N, a, alpha, y);
 }
 
-void popc_kernel(const int n, const float* a,
-		const float* b, uint8_t* y) {
+void popc_kernel(const int n, const float* a, const float* b, uint8_t* y) {
 }
 
-void popcll_kernel(const int n, const double* a,
-		const double* b, uint8_t* y) {
+void popcll_kernel(const int n, const double* a, const double* b, uint8_t* y) {
 }
 
 template <>
 uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
-		const float* y) {
-	return 0;
+    const float* y) {
+  return 0;
 }
 
 template <>
 uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
-		const double* y) {
-	return 0;
+    const double* y) {
+  return 0;
 }
 
 void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
@@ -890,116 +868,116 @@ void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
 
 template <>
 void caffe_gpu_rng_uniform<float>(const int n, const float a, const float b,
-                                  float* r) {
-	caffe_gpu_uniform(r, n, a, b);	// r is a cl_mem object
+    float* r) {
+  caffe_gpu_uniform(r, n, a, b);	// r is a cl_mem object
 }
 template <>
 void caffe_gpu_rng_uniform<double>(const int n, const double a, const double b,
-                                   double* r) {
-	caffe_gpu_uniform(r, n, a, b);  // r is a cl_mem object
+    double* r) {
+  caffe_gpu_uniform(r, n, a, b);  // r is a cl_mem object
 }
 
 template <>
-void caffe_gpu_rng_gaussian<float>(const int n, const float mu, const float sigma,
-                                  float* r) {
-	caffe_gpu_gaussian(r, n, mu, sigma);  // r is a cl_mem object
+void caffe_gpu_rng_gaussian<float>(const int n, const float mu,
+    const float sigma, float* r) {
+  caffe_gpu_gaussian(r, n, mu, sigma);  // r is a cl_mem object
 }
 
 template <>
-void caffe_gpu_rng_gaussian<double>(const int n, const double mu, const double sigma,
-                            double* r) {
-	caffe_gpu_gaussian(r, n, mu, sigma);  // r is a cl_mem object
+void caffe_gpu_rng_gaussian<double>(const int n, const double mu,
+    const double sigma, double* r) {
+  caffe_gpu_gaussian(r, n, mu, sigma);  // r is a cl_mem object
 }
 
 template <>
 void caffe_gpu_log<float>(const int N, const float* a, float* y) {
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_log(N, a, y);
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_log(N, a, y);
 }
 
 template <>
 void caffe_gpu_log<double>(const int N, const double* a, double* y) {
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_log(N, a, y);
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_log(N, a, y);
 }
 
 template <>
 void caffe_log<float>(const int n, const float* a, float* y) {
-	vsLn(n, a, y);
+  vsLn(n, a, y);
 }
 
 template <>
 void caffe_log<double>(const int n, const double* a, double* y) {
-	vdLn(n, a, y);
+  vdLn(n, a, y);
 }
 
 template <typename Dtype>
 void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
-	if (X != Y) {
-		if (Caffe::mode() == Caffe::GPU) {
+  if (X != Y) {
+    if (Caffe::mode() == Caffe::GPU) {
 #ifndef CPU_ONLY
-			// NOLINT_NEXT_LINE(caffe/alt_fn)
-			//CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
+      // NOLINT_NEXT_LINE(caffe/alt_fn)
+      //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
 #else
-			NO_GPU;
+      NO_GPU;
 #endif
-		} else {
-			memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
-		}
-	}
+    } else {
+      memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    }
+  }
 }
 
 template void caffe_copy<int>(const int N, const int* X, int* Y);
 template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
-		unsigned int* Y);
+    unsigned int* Y);
 template void caffe_copy<float>(const int N, const float* X, float* Y);
 template void caffe_copy<double>(const int N, const double* X, double* Y);
 
 template <>
 void caffe_abs<float>(const int n, const float* a, float* y) {
-	vsAbs(n, a, y);
+  vsAbs(n, a, y);
 }
 
 template <>
 void caffe_abs<double>(const int n, const double* a, double* y) {
-	vdAbs(n, a, y);
+  vdAbs(n, a, y);
 }
 
 template <>
 void caffe_gpu_add<float>(const int N, const float* a, const float* b,
-		float* y) {
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_add(N, a, b, y);
+    float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_add(N, a, b, y);
 }
 
 template <>
 void caffe_gpu_add<double>(const int N, const double* a, const double* b,
-		double* y) {
-	// NOLINT_NEXT_LINE(whitespace/operators)
-	kernel_add(N, a, b, y);
+    double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  kernel_add(N, a, b, y);
 }
 
 template <>
 float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
-		const float* y, const int incy) {
-	return cblas_sdot(n, x, incx, y, incy);
+    const float* y, const int incy) {
+  return cblas_sdot(n, x, incx, y, incy);
 }
 
 template <>
 double caffe_cpu_strided_dot<double>(const int n, const double* x,
-		const int incx, const double* y, const int incy) {
-	return cblas_ddot(n, x, incx, y, incy);
+    const int incx, const double* y, const int incy) {
+  return cblas_ddot(n, x, incx, y, incy);
 }
 
 template <typename Dtype>
 void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
-	if (alpha == 0) {
-		memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
-		return;
-	}
-	for (int i = 0; i < N; ++i) {
-		Y[i] = alpha;
-	}
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
 }
 
 template void caffe_set<int>(const int N, const int alpha, int* Y);
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index 64245bea..ae71de0f 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -14,150 +14,154 @@ namespace caffe {
 
 template <>
 void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const float alpha, const float* A, const float* B, const float beta,
-		float* C) {
-	// Note that cublas follows fortran order.
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	cublasOperation_t cuTransA =
-			(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-	cublasOperation_t cuTransB =
-			(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-	CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
-			N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
+  // Note that cublas follows fortran order.
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA =
+      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  CUBLAS_CHECK(
+      cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
+          B, ldb, A, lda, &beta, C, N));
 }
 
 template <>
 void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
-		const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
-		const double alpha, const double* A, const double* B, const double beta,
-		double* C) {
-	// Note that cublas follows fortran order.
-	int lda = (TransA == CblasNoTrans) ? K : M;
-	int ldb = (TransB == CblasNoTrans) ? N : K;
-	cublasOperation_t cuTransA =
-			(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-	cublasOperation_t cuTransB =
-			(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-	CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
-			N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
+    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
+  // Note that cublas follows fortran order.
+  int lda = (TransA == CblasNoTrans) ? K : M;
+  int ldb = (TransB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA =
+      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  CUBLAS_CHECK(
+      cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
+          B, ldb, A, lda, &beta, C, N));
 }
 
 template <>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const float alpha, const float* A, const float* x,
-		const float beta, float* y) {
-	cublasOperation_t cuTransA =
-			(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
-	CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
-			A, N, x, 1, &beta, y, 1));
+    const int N, const float alpha, const float* A, const float* x,
+    const float beta, float* y) {
+  cublasOperation_t cuTransA =
+      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  CUBLAS_CHECK(
+      cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1,
+          &beta, y, 1));
 }
 
 template <>
 void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-		const int N, const double alpha, const double* A, const double* x,
-		const double beta, double* y) {
-	cublasOperation_t cuTransA =
-			(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
-	CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
-			A, N, x, 1, &beta, y, 1));
+    const int N, const double alpha, const double* A, const double* x,
+    const double beta, double* y) {
+  cublasOperation_t cuTransA =
+      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  CUBLAS_CHECK(
+      cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1,
+          &beta, y, 1));
 }
 
 template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
-		float* Y) {
-	CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
+    float* Y) {
+  CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
 template <>
 void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
-		double* Y) {
-	CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
+    double* Y) {
+  CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
 void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) {
-	if (X != Y) {
-		CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault));  // NOLINT(caffe/alt_fn)
-	}
+  if (X != Y) {
+    CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault));  // NOLINT(caffe/alt_fn)
+  }
 }
 
 template <>
 void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
-	CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1));
+  CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1));
 }
 
 template <>
 void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
-	CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1));
+  CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1));
 }
 
 template <>
 void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
-		const float beta, float* Y) {
-	caffe_gpu_scal<float>(N, beta, Y);
-	caffe_gpu_axpy<float>(N, alpha, X, Y);
+    const float beta, float* Y) {
+  caffe_gpu_scal<float>(N, beta, Y);
+  caffe_gpu_axpy<float>(N, alpha, X, Y);
 }
 
 template <>
 void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
-		const double beta, double* Y) {
-	caffe_gpu_scal<double>(N, beta, Y);
-	caffe_gpu_axpy<double>(N, alpha, X, Y);
+    const double beta, double* Y) {
+  caffe_gpu_scal<double>(N, beta, Y);
+  caffe_gpu_axpy<double>(N, alpha, X, Y);
 }
 
 template <>
 void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
-		float* out) {
-	CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
+    float* out) {
+  CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
 }
 
 template <>
 void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
-		double * out) {
-	CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
+    double * out) {
+  CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
 }
 
 template <>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
-	CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y));
+  CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y));
 }
 
 template <>
 void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
-	CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
+  CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
 }
 
 template <>
 void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
-		float* y) {
-	CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1));
-	CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1));
+    float* y) {
+  CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1));
+  CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1));
 }
 
 template <>
 void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
-		double* y) {
-	CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1));
-	CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1));
+    double* y) {
+  CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1));
+  CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1));
 }
 
 template <typename Dtype>
 __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) {
-	CUDA_KERNEL_LOOP(index, n) {
-		y[index] = alpha;
-	}
+  CUDA_KERNEL_LOOP(index, n) {
+    y[index] = alpha;
+  }
 }
 
 template <typename Dtype>
 void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) {
-	if (alpha == 0) {
-		CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N));  // NOLINT(caffe/alt_fn)
-		return;
-	}
-	// NOLINT_NEXT_LINE(whitespace/operators)
+  if (alpha == 0) {
+    CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N));  // NOLINT(caffe/alt_fn)
+    return;
+  }
+  // NOLINT_NEXT_LINE(whitespace/operators)
 set_kernel<Dtype><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-		N, alpha, Y);
+    N, alpha, Y);
 }
 
 template void caffe_gpu_set<int>(const int N, const int alpha, int* Y);
@@ -167,7 +171,7 @@ template void caffe_gpu_set<double>(const int N, const double alpha, double* Y);
 template <typename Dtype>
 __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) {
 CUDA_KERNEL_LOOP(index, n) {
-	y[index] += alpha;
+  y[index] += alpha;
 }
 }
 
@@ -175,7 +179,7 @@ template <>
 void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
 // NOLINT_NEXT_LINE(whitespace/operators)
 add_scalar_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
-	N, alpha, Y);
+  N, alpha, Y);
 }
 
 template <>
@@ -242,16 +246,16 @@ y[index] = a[index] * b[index];
 }
 
 template <>
-void caffe_gpu_mul<float>(const int N, const float* a,
-const float* b, float* y) {
+void caffe_gpu_mul<float>(const int N, const float* a, const float* b,
+float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 mul_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, b, y);
 }
 
 template <>
-void caffe_gpu_mul<double>(const int N, const double* a,
-const double* b, double* y) {
+void caffe_gpu_mul<double>(const int N, const double* a, const double* b,
+double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 mul_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, b, y);
@@ -266,16 +270,16 @@ y[index] = a[index] / b[index];
 }
 
 template <>
-void caffe_gpu_div<float>(const int N, const float* a,
-const float* b, float* y) {
+void caffe_gpu_div<float>(const int N, const float* a, const float* b,
+float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 div_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, b, y);
 }
 
 template <>
-void caffe_gpu_div<double>(const int N, const double* a,
-const double* b, double* y) {
+void caffe_gpu_div<double>(const int N, const double* a, const double* b,
+double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 div_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, b, y);
@@ -353,16 +357,16 @@ y[index] = pow(a[index], alpha);
 }
 
 template <>
-void caffe_gpu_powx<float>(const int N, const float* a,
-const float alpha, float* y) {
+void caffe_gpu_powx<float>(const int N, const float* a, const float alpha,
+float* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 powx_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, alpha, y);
 }
 
 template <>
-void caffe_gpu_powx<double>(const int N, const double* a,
-const double alpha, double* y) {
+void caffe_gpu_powx<double>(const int N, const double* a, const double alpha,
+double* y) {
   // NOLINT_NEXT_LINE(whitespace/operators)
 powx_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
 N, a, alpha, y);
@@ -372,21 +376,21 @@ DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
 - (x[index] < Dtype(0)));
 DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
 
-__global__ void popc_kernel(const int n, const float* a,
-const float* b, uint8_t* y) {
+__global__ void popc_kernel(const int n, const float* a, const float* b,
+uint8_t* y) {
 CUDA_KERNEL_LOOP(index, n)
 {
-y[index] = __popc(static_cast<uint32_t>(a[index]) ^
-static_cast<uint32_t>(b[index]));
+y[index] = __popc(
+static_cast<uint32_t>(a[index]) ^ static_cast<uint32_t>(b[index]));
 }
 }
 
-__global__ void popcll_kernel(const int n, const double* a,
-const double* b, uint8_t* y) {
+__global__ void popcll_kernel(const int n, const double* a, const double* b,
+uint8_t* y) {
 CUDA_KERNEL_LOOP(index, n)
 {
-y[index] = __popcll(static_cast<uint64_t>(a[index]) ^
-static_cast<uint64_t>(b[index]));
+y[index] = __popcll(
+static_cast<uint64_t>(a[index]) ^ static_cast<uint64_t>(b[index]));
 }
 }
 
@@ -394,24 +398,24 @@ template <>
 uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
 const float* y) {
   // TODO: Fix caffe_gpu_hamming_distance (see failing unit test
-	// TestHammingDistanceGPU in test_math_functions.cpp).
+  // TestHammingDistanceGPU in test_math_functions.cpp).
 NOT_IMPLEMENTED;
 thrust::device_vector < uint8_t > popcounts(n);
-	// NOLINT_NEXT_LINE(whitespace/operators)
+  // NOLINT_NEXT_LINE(whitespace/operators)
 popc_kernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
 n, x, y, thrust::raw_pointer_cast(popcounts.data()));
-return thrust::reduce(popcounts.begin(), popcounts.end(),
-(uint32_t) 0, thrust::plus<uint32_t>());
+return thrust::reduce(popcounts.begin(), popcounts.end(), (uint32_t) 0,
+thrust::plus<uint32_t>());
 }
 
 template <>
 uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
 const double* y) {
-	// TODO: Fix caffe_gpu_hamming_distance (see failing unit test
-	// TestHammingDistanceGPU in test_math_functions.cpp).
+  // TODO: Fix caffe_gpu_hamming_distance (see failing unit test
+  // TestHammingDistanceGPU in test_math_functions.cpp).
 NOT_IMPLEMENTED;
 thrust::device_vector < uint8_t > popcounts(n);
-	// NOLINT_NEXT_LINE(whitespace/operators)
+  // NOLINT_NEXT_LINE(whitespace/operators)
 popcll_kernel<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(
 n, x, y, thrust::raw_pointer_cast(popcounts.data()));
 return thrust::reduce(popcounts.begin(), popcounts.end(),
@@ -452,8 +456,7 @@ caffe_gpu_add_scalar(n, a, r);
 template <>
 void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma,
 float* r) {
-CURAND_CHECK(
-curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma));
+CURAND_CHECK(curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma));
 }
 
 template <>
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 8f44a106..6b5045d8 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -36,56 +36,56 @@ template <typename dtype> extern std::string get_dtype_suffix();
 
 template <typename Dtype>
 void ocl_memset(Dtype* buffer, const Dtype value, const int count) {
-	std::string kernel_name = std::string("oclmem") + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int err = 0;
-	err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
-	err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value);
-	err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count);
-	OCL_CHECK(err);
+  std::string kernel_name = std::string("oclmem") + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int err = 0;
+  err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
+  err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value);
+  err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count);
+  OCL_CHECK(err);
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 
 template void ocl_memset<int>(int* buffer, const int value, const int count);
 template void ocl_memset<float>(float* buffer, const float value,
-		const int count);
+    const int count);
 template void ocl_memset<double>(double* buffer, const double value,
-		const int count);
+    const int count);
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
-		const int count) {
-	cl_int err;
-	err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
-	err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value);
-	err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count);
-	OCL_CHECK(err);
+    const int count) {
+  cl_int err;
+  err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
+  err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value);
+  err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count);
+  OCL_CHECK(err);
 
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 
 void eventCallback(cl_event event, cl_int event_status, void* user_data) {
-	cl_ulong ev_start_time = (cl_ulong) 0;
-	cl_ulong ev_end_time = (cl_ulong) 0;
-	double run_time;
-	OCL_CHECK(
-			clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
-					sizeof(cl_ulong), &ev_start_time, NULL));
-	OCL_CHECK(
-			clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
-					&ev_end_time, NULL));
-	run_time = (double) (ev_end_time - ev_start_time);
-	printf("The kernel's running time is %f s\n", run_time * 1.0e-9);
+  cl_ulong ev_start_time = (cl_ulong) 0;
+  cl_ulong ev_end_time = (cl_ulong) 0;
+  double run_time;
+  OCL_CHECK(
+      clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
+          sizeof(cl_ulong), &ev_start_time, NULL));
+  OCL_CHECK(
+      clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
+          &ev_end_time, NULL));
+  run_time = (double) (ev_end_time - ev_start_time);
+  printf("The kernel's running time is %f s\n", run_time * 1.0e-9);
 }
 
 }  // namespace caffe
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 75b69215..5844fb84 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -35,1899 +35,1898 @@
 namespace caffe {
 typedef unsigned int uint32_t;
 struct array4x32 {
-		uint32_t v[4];
+    uint32_t v[4];
 };
 template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
-		Dtype threshold) {
-	std::string kernel_name = "RNGBernoulli" + get_dtype_suffix<Dtype>();
-	cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
-
-	static unsigned c = 0;
-	unsigned nrounds = 20;
-	array4x32 rndctr4;
-	rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
-	cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
-
-	cl_int ret;
-	ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a);
-	ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4);
-	ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf);
-	ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup);
-	ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*) &threshold);
-	ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &nrounds);
-	ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*) &size);
-	OCL_CHECK(ret);
-
-	size_t globalws[1] = { size };
-	size_t localws[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL,
-					globalws,
-					localws, 0, NULL, NULL));
+    Dtype threshold) {
+  std::string kernel_name = "RNGBernoulli" + get_dtype_suffix<Dtype>();
+  cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+  static unsigned c = 0;
+  unsigned nrounds = 20;
+  array4x32 rndctr4;
+  rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+  cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+
+  cl_int ret;
+  ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a);
+  ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4);
+  ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf);
+  ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup);
+  ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*) &threshold);
+  ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &nrounds);
+  ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*) &size);
+  OCL_CHECK(ret);
+
+  size_t globalws[1] = { size };
+  size_t localws[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL,
+          globalws, localws, 0, NULL, NULL));
 }
 template void caffe_gpu_bernoulli<float>(int* a, const unsigned int n,
-		float inf, float sup, float threshold);
+    float inf, float sup, float threshold);
 template void caffe_gpu_bernoulli<double>(int* a, const unsigned int n,
-		double inf, double sup, double threshold);
+    double inf, double sup, double threshold);
 
 template <typename Dtype>
 void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_,
-		const int M_, const int packing_num) {
-	std::string kernel_name = "transform" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &src);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &dst);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &top_offset);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &N_);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &M_);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &packing_num);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) };
-	size_t uiLocal_Work_Size2[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
+    const int M_, const int packing_num) {
+  std::string kernel_name = "transform" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &src);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &dst);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &top_offset);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &N_);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &M_);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &packing_num);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) };
+  size_t uiLocal_Work_Size2[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
 }
 
 template void transform_gpu<float>(float* src, float* dst, const int top_offset,
-		const int N_, const int M_, const int packing_num);
+    const int N_, const int M_, const int packing_num);
 template void transform_gpu<double>(double* src, double* dst,
-		const int top_offset, const int N_, const int M_, const int packing_num);
+    const int top_offset, const int N_, const int M_, const int packing_num);
 
 template <typename Dtype>
 void get_max_gpu(cl_kernel Kernel, const int num, const int dim,
-		const Dtype* bottom_data, Dtype* scale_data) {
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &scale_data));
+    const Dtype* bottom_data, Dtype* scale_data) {
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &scale_data));
 
-	size_t Global_Work_Size[1] = { (size_t) num };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) num };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void get_max_gpu<float>(cl_kernel Kernel, const int num, const int dim,
-		const float* bottom_data, float* scale_data);
+    const float* bottom_data, float* scale_data);
 template void get_max_gpu<double>(cl_kernel Kernel, const int num,
-		const int dim, const double* bottom_data, double* scale_data);
-template <typename Dtype>
-void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup)
-{
-        std::string kernel_name = "RNGUniform" + get_dtype_suffix<Dtype>();
-        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
-
-        static unsigned c = 0;
-        unsigned nrounds = 20;
-        array4x32  rndctr4;
-        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
-        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
-
-        cl_int ret;
-        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
-        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype),   (void*)&inf);
-        ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype),   (void*)&sup);
-        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint),    (void*)&nrounds);
-        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&size);
-        OCL_CHECK(ret);
-
-        size_t globalws[1] = {size};
-        size_t localws[1] = {256};
-        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
-}
-template void caffe_gpu_uniform<float>(float* a, const unsigned int n, float inf, float sup);
-template void caffe_gpu_uniform<double>(double* a, const unsigned int n, double inf, double sup);
-
-void caffe_gpu_uniform(const unsigned int n, unsigned int *r)
-{
-        std::string kernel_name = "PRNG_threefry4x32_uint_uniform";
-        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
-
-        static unsigned c = 0;
-        unsigned nrounds = 20;
-        array4x32  rndctr4;
-        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
-        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
-        
-        cl_uint inf = 0;
-        cl_uint sup = UINT_MAX;
-        cl_int ret;
-        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&r);
-        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_uint),   (void*)&inf);
-        ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_uint),   (void*)&sup);
-        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint),    (void*)&nrounds);
-        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&size);
-        OCL_CHECK(ret);
-
-        size_t globalws[1] = {size};
-        size_t localws[1] = {256};
-        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
-}
-
-template <typename Dtype>
-void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V)
-{
-        std::string kernel_name = "RNGGaussian" + get_dtype_suffix<Dtype>();
-        cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
-
-        static unsigned c = 0;
-        unsigned nrounds = 20;
-        array4x32  rndctr4;
-        rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
-        cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
-
-        cl_int ret;
-        ret  = clSetKernelArg(ker_rand, 0, sizeof(cl_mem),     (void*)&a);
-        ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32),  (void*)&rndctr4);
-        ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype),   (void*)&E);
-        ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype),   (void*)&V);
-        ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint),    (void*)&nrounds);
-        ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint),    (void*)&size);
-        OCL_CHECK(ret);
-
-        size_t globalws[1] = {size};
-        size_t localws[1] = {256};
-        OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
-}
-template void caffe_gpu_gaussian<float>(float* a, const unsigned int n, float E, float V);
-template void caffe_gpu_gaussian<double>(double* a, const unsigned int n, double E, double V);
+    const int dim, const double* bottom_data, double* scale_data);
+template <typename Dtype>
+void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup) {
+  std::string kernel_name = "RNGUniform" + get_dtype_suffix<Dtype>();
+  cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+  static unsigned c = 0;
+  unsigned nrounds = 20;
+  array4x32 rndctr4;
+  rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+  cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+
+  cl_int ret;
+  ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a);
+  ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4);
+  ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf);
+  ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup);
+  ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*) &nrounds);
+  ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &size);
+  OCL_CHECK(ret);
+
+  size_t globalws[1] = { size };
+  size_t localws[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL,
+          globalws, localws, 0, NULL, NULL));
+}
+template void caffe_gpu_uniform<float>(float* a, const unsigned int n,
+    float inf, float sup);
+template void caffe_gpu_uniform<double>(double* a, const unsigned int n,
+    double inf, double sup);
+
+void caffe_gpu_uniform(const unsigned int n, unsigned int *r) {
+  std::string kernel_name = "PRNG_threefry4x32_uint_uniform";
+  cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+  static unsigned c = 0;
+  unsigned nrounds = 20;
+  array4x32 rndctr4;
+  rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+  cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+
+  cl_uint inf = 0;
+  cl_uint sup = UINT_MAX;
+  cl_int ret;
+  ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &r);
+  ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4);
+  ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_uint), (void*) &inf);
+  ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_uint), (void*) &sup);
+  ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*) &nrounds);
+  ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &size);
+  OCL_CHECK(ret);
+
+  size_t globalws[1] = { size };
+  size_t localws[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL,
+          globalws, localws, 0, NULL, NULL));
+}
+
+template <typename Dtype>
+void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V) {
+  std::string kernel_name = "RNGGaussian" + get_dtype_suffix<Dtype>();
+  cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
+
+  static unsigned c = 0;
+  unsigned nrounds = 20;
+  array4x32 rndctr4;
+  rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
+  cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4
+
+  cl_int ret;
+  ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a);
+  ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4);
+  ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &E);
+  ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &V);
+  ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*) &nrounds);
+  ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &size);
+  OCL_CHECK(ret);
+
+  size_t globalws[1] = { size };
+  size_t localws[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL,
+          globalws, localws, 0, NULL, NULL));
+}
+template void caffe_gpu_gaussian<float>(float* a, const unsigned int n, float E,
+    float V);
+template void caffe_gpu_gaussian<double>(double* a, const unsigned int n,
+    double E, double V);
 
 template <typename Dtype>
 void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) {
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) num };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) num };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void exp_gpu<float>(cl_kernel Kernel, const int num, const float* data,
-		float* out);
+    float* out);
 template void exp_gpu<double>(cl_kernel Kernel, const int num,
-		const double* data, double* out);
+    const double* data, double* out);
 
 template <typename Dtype>
 void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim,
-		const Dtype* scale, Dtype* data) {
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
+    const Dtype* scale, Dtype* data) {
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
 
-	size_t Global_Work_Size[1] = { (size_t)(num * dim) };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t)(num * dim) };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void softmax_div_gpu<float>(cl_kernel Kernel, const int num,
-		const int dim, const float* scale, float* data);
+    const int dim, const float* scale, float* data);
 template void softmax_div_gpu<double>(cl_kernel Kernel, const int num,
-		const int dim, const double* scale, double* data);
+    const int dim, const double* scale, double* data);
 
 template <typename Dtype>
 Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim,
-		const Dtype* prob_data, const Dtype* label, cl_mem d_loss) {
+    const Dtype* prob_data, const Dtype* label, cl_mem d_loss) {
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL));
 
-	size_t globalws[1] = { 256 };
-	size_t localws[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws,
-					localws, 0, NULL, NULL));
-	void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE,
-			CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL);
-	Dtype loss = *(Dtype*) h_loss;
-	clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL,
-			NULL);
+  size_t globalws[1] = { 256 };
+  size_t localws[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws,
+          localws, 0, NULL, NULL));
+  void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE,
+      CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL);
+  Dtype loss = *(Dtype*) h_loss;
+  clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL,
+      NULL);
 
-	return loss;
+  return loss;
 }
 
 template float softmax_gpu<float>(cl_kernel Kernel, const int num,
-		const int dim, const float* prob_data, const float* label, cl_mem d_loss);
+    const int dim, const float* prob_data, const float* label, cl_mem d_loss);
 template double softmax_gpu<double>(cl_kernel Kernel, const int num,
-		const int dim, const double* prob_data, const double* label, cl_mem d_loss);
+    const int dim, const double* prob_data, const double* label, cl_mem d_loss);
 
 template <typename Dtype>
 void kernel_channel_max(const int num, const int channels,
-		const int spatial_dim, const Dtype* data, Dtype* out) {
-	std::string kernel_name = "kernel_channel_max" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    const int spatial_dim, const Dtype* data, Dtype* out) {
+  std::string kernel_name = "kernel_channel_max" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
-	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_max<float>(const int num, const int channels,
-		const int spatial_dim, const float* data, float* out);
+    const int spatial_dim, const float* data, float* out);
 template void kernel_channel_max<double>(const int num, const int channels,
-		const int spatial_dim, const double* data, double* out);
+    const int spatial_dim, const double* data, double* out);
 
 template <typename Dtype>
-void kernel_channel_subtract(const int count,
-		const int num, const int channels,
-		const int spatial_dim, const Dtype* channel_max, Dtype* data) {
-	std::string kernel_name = "kernel_channel_subtract"
-			+ get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+void kernel_channel_subtract(const int count, const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_max, Dtype* data) {
+  std::string kernel_name = "kernel_channel_subtract"
+      + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_max));
-	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_max));
+  OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
-template void kernel_channel_subtract<float>(const int count,
-		const int num, const int channels,
-		const int spatial_dim, const float* channel_max, float* data);
-template void kernel_channel_subtract<double>(const int count,
-		const int num, const int channels,
-		const int spatial_dim, const double* channel_max, double* data);
+template void kernel_channel_subtract<float>(const int count, const int num,
+    const int channels, const int spatial_dim, const float* channel_max,
+    float* data);
+template void kernel_channel_subtract<double>(const int count, const int num,
+    const int channels, const int spatial_dim, const double* channel_max,
+    double* data);
 
 template <typename Dtype>
-void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out)
-		{
-	std::string kernel_name = "kernel_mul" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out) {
+  std::string kernel_name = "kernel_mul" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_mul<float>(const int count, const float* a, const float* b,
-		float* out);
+    float* out);
 template void kernel_mul<double>(const int count, const double* a,
-		const double* b, double* out);
+    const double* b, double* out);
 
 template <typename Dtype>
 void kernel_add_scalar(const int count, const Dtype data, Dtype* out) {
-	std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &data));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_add_scalar<float>(const int count, const float data,
-		float* out);
+    float* out);
 template void kernel_add_scalar<double>(const int count, const double data,
-		double* out);
+    double* out);
 
 template <typename Dtype>
 void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
-		Dtype* out) {
-	std::string kernel_name = "kernel_powx" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    Dtype* out) {
+  std::string kernel_name = "kernel_powx" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_powx<float>(const int count, const float* data,
-		const float alpha, float* out);
+    const float alpha, float* out);
 template void kernel_powx<double>(const int count, const double* data,
-		const double alpha, double* out);
+    const double alpha, double* out);
 
 template <typename Dtype>
 void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) {
-	std::string kernel_name = "kernel_div" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  std::string kernel_name = "kernel_div" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_div<float>(const int count, const float* a, const float* b,
-		float* out);
+    float* out);
 template void kernel_div<double>(const int count, const double* a,
-		const double* b, double* out);
+    const double* b, double* out);
 
 template <typename Dtype>
 void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) {
-	std::string kernel_name = "kernel_add" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  std::string kernel_name = "kernel_add" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_add<float>(const int count, const float* a, const float* b,
-		float* out);
+    float* out);
 template void kernel_add<double>(const int count, const double* a,
-		const double* b, double* out);
+    const double* b, double* out);
 
 template <typename Dtype>
 void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) {
-	std::string kernel_name = "kernel_sub" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  std::string kernel_name = "kernel_sub" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_sub<float>(const int count, const float* a, const float* b,
-		float* out);
+    float* out);
 template void kernel_sub<double>(const int count, const double* a,
-		const double* b, double* out);
+    const double* b, double* out);
 
 template <typename Dtype>
 void kernel_log(const int count, const Dtype* data, Dtype* out) {
-	std::string kernel_name = "kernel_log" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  std::string kernel_name = "kernel_log" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_log<float>(const int count, const float* data, float* out);
 template void kernel_log<double>(const int count, const double* data,
-		double* out);
+    double* out);
 
 template <typename Dtype>
 void kernel_exp(const int count, const Dtype* data, Dtype* out) {
-	std::string kernel_name = "kernel_exp" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  std::string kernel_name = "kernel_exp" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_exp<float>(const int count, const float* data, float* out);
 template void kernel_exp<double>(const int count, const double* data,
-		double* out);
+    double* out);
 
 template <typename Dtype>
 void kernel_channel_sum(const int num, const int channels,
-		const int spatial_dim, const Dtype* data, Dtype* channel_sum) {
-	std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    const int spatial_dim, const Dtype* data, Dtype* channel_sum) {
+  std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
-	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum));
 
-	size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_sum<float>(const int num, const int channels,
-		const int spatial_dim, const float* data, float* channel_sum);
+    const int spatial_dim, const float* data, float* channel_sum);
 template void kernel_channel_sum<double>(const int num, const int channels,
-		const int spatial_dim, const double* data, double* channel_sum);
+    const int spatial_dim, const double* data, double* channel_sum);
 
 template <typename Dtype>
 void kernel_channel_div(const int count, const int num, const int channels,
-		const int spatial_dim, const Dtype* channel_sum, Dtype* data) {
-	std::string kernel_name = "kernel_channel_div" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    const int spatial_dim, const Dtype* channel_sum, Dtype* data) {
+  std::string kernel_name = "kernel_channel_div" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum));
-	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum));
+  OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data));
 
-	size_t Global_Work_Size[1] = { (size_t) count };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) count };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_div<float>(const int count, const int num,
-		const int channels,
-		const int spatial_dim, const float* channel_sum, float* data);
+    const int channels, const int spatial_dim, const float* channel_sum,
+    float* data);
 template void kernel_channel_div<double>(const int count, const int num,
-		const int channels,
-		const int spatial_dim, const double* channel_sum, double* data);
+    const int channels, const int spatial_dim, const double* channel_sum,
+    double* data);
 
 template <typename Dtype>
 void kernel_channel_dot(const int num, const int channels,
-		const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
-		Dtype* channel_dot) {
-	std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+    Dtype* channel_dot) {
+  std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data_1));
-	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &data_2));
-	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &channel_dot));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data_1));
+  OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &data_2));
+  OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &channel_dot));
 
-	size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void kernel_channel_dot<float>(const int num, const int channels,
-		const int spatial_dim, const float* data_1, const float* data_2,
-		float* channel_dot);
+    const int spatial_dim, const float* data_1, const float* data_2,
+    float* channel_dot);
 template void kernel_channel_dot<double>(const int num, const int channels,
-		const int spatial_dim, const double* data_1, const double* data_2,
-		double* channel_dot);
-
-template <typename Dtype>
-void SoftmaxLossForwardGPU(const int nthreads,
-		const Dtype* prob_data, const Dtype* label, Dtype* loss,
-		const int num, const int dim, const int spatial_dim,
-		const bool has_ignore_label_, const int ignore_label_,
-		Dtype* counts) {
-	std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-        
-        int int_has_ignore_label = has_ignore_label_ ? 1 : 0;
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &loss));
-	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label));
-	OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
-	OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
-
-	size_t Global_Work_Size[1] = { (size_t) nthreads };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const int spatial_dim, const double* data_1, const double* data_2,
+    double* channel_dot);
+
+template <typename Dtype>
+void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data,
+    const Dtype* label, Dtype* loss, const int num, const int dim,
+    const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, Dtype* counts) {
+  std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  int int_has_ignore_label = has_ignore_label_ ? 1 : 0;
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &loss));
+  OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
+  OCL_CHECK(
+      clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label));
+  OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
+  OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
+
+  size_t Global_Work_Size[1] = { (size_t) nthreads };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void SoftmaxLossForwardGPU<float>(const int nthreads,
-		const float* prob_data, const float* label, float* loss,
-		const int num, const int dim, const int spatial_dim,
-		const bool has_ignore_label_, const int ignore_label_, float* counts);
+    const float* prob_data, const float* label, float* loss, const int num,
+    const int dim, const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, float* counts);
 template void SoftmaxLossForwardGPU<double>(const int nthreads,
-		const double* prob_data, const double* label, double* loss,
-		const int num, const int dim, const int spatial_dim,
-		const bool has_ignore_label_, const int ignore_label_, double* counts);
+    const double* prob_data, const double* label, double* loss, const int num,
+    const int dim, const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, double* counts);
 
 template <typename Dtype>
 void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
-		const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
-		const int spatial_dim, const bool has_ignore_label_,
-		const int ignore_label_, Dtype* counts) {
-	std::string kernel_name = "SoftmaxLossBackwardGPU"
-			+ get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-        int int_has_ignore_label = has_ignore_label_ ? 1 : 0;
-
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff));
-	OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label));
-	OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
-	OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
-
-	size_t Global_Work_Size[1] = { (size_t) nthreads };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+    const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, Dtype* counts) {
+  std::string kernel_name = "SoftmaxLossBackwardGPU"
+      + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  int int_has_ignore_label = has_ignore_label_ ? 1 : 0;
+
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff));
+  OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim));
+  OCL_CHECK(
+      clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label));
+  OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_));
+  OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts));
+
+  size_t Global_Work_Size[1] = { (size_t) nthreads };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void SoftmaxLossBackwardGPU<float>(const int nthreads,
-		const float* top, const float* label, float* bottom_diff,
-		const int num, const int dim, const int spatial_dim,
-		const bool has_ignore_label_, const int ignore_label_, float* counts);
+    const float* top, const float* label, float* bottom_diff, const int num,
+    const int dim, const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, float* counts);
 template void SoftmaxLossBackwardGPU<double>(const int nthreads,
-		const double* top, const double* label, double* bottom_diff,
-		const int num, const int dim, const int spatial_dim,
-		const bool has_ignore_label_, const int ignore_label_, double* counts);
+    const double* top, const double* label, double* bottom_diff, const int num,
+    const int dim, const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, double* counts);
 
 template <typename Dtype>
 void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) {
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data));
 
-	size_t Global_Work_Size[1] = { (size_t) num };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) num };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void scal_gpu<float>(cl_kernel Kernel, const int num,
-		const float alpha, float* data);
+    const float alpha, float* data);
 template void scal_gpu<double>(cl_kernel Kernel, const int num,
-		const double alpha, double* data);
+    const double alpha, double* data);
 
 template <typename Dtype>
 void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data,
-		const Dtype* label) {
-	OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
-	OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
-	OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data));
-	OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &label));
+    const Dtype* label) {
+  OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num));
+  OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim));
+  OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data));
+  OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &label));
 
-	size_t Global_Work_Size[1] = { (size_t) num };
-	size_t Local_Work_Size[1] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[1] = { (size_t) num };
+  size_t Local_Work_Size[1] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void diff_gpu<float>(cl_kernel Kernel, const int num, const int dim,
-		float* data, const float* label);
+    float* data, const float* label);
 template void diff_gpu<double>(cl_kernel Kernel, const int num, const int dim,
-		double* data, const double* label);
+    double* data, const double* label);
 
 template <typename Dtype>
 void max_pool_fp_gpu(cl_kernel Kernel, const int count,
-		const Dtype* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		Dtype* top_data) {
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const Dtype* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    Dtype* top_data) {
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void max_pool_fp_gpu<float>(cl_kernel Kernel, const int count,
-		const float* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		float* top_data);
+    const float* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    float* top_data);
 template void max_pool_fp_gpu<double>(cl_kernel Kernel, const int count,
-		const double* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		double* top_data);
+    const double* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    double* top_data);
 
 template <typename Dtype>
 void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum,
-		const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
-		Dtype* top_mask) {
-	std::string kernel_name = "MaxPoolForward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_);
-	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*) &mask);
-	ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &top_mask);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, Dtype* top_data, int* mask,
+    Dtype* top_mask) {
+  std::string kernel_name = "MaxPoolForward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_);
+  ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*) &mask);
+  ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &top_mask);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void MaxPoolForward<float>(const int count, const float* bottom_data,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		const int pad_h_, const int pad_w_, float* top_data, int* mask,
-		float* top_mask);
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, float* top_data, int* mask,
+    float* top_mask);
 template void MaxPoolForward<double>(const int count, const double* bottom_data,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		const int pad_h_, const int pad_w_, double* top_data, int* mask,
-		double* top_mask);
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, double* top_data, int* mask,
+    double* top_mask);
 
 template <typename Dtype>
 void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		Dtype* idx_data, Dtype* top_data) {
-	std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &idx_data);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    Dtype* idx_data, Dtype* top_data) {
+  std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &idx_data);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void StoPoolForwardTrain<float>(const int count,
-		const float* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_h_, const int kernel_w_,
-		const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
+    const float* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_h_, const int kernel_w_,
+    const int stride_h_, const int stride_w_, float* idx_data, float* top_data);
 template void StoPoolForwardTrain<double>(const int count,
-		const double* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_h_, const int kernel_w_,
-		const int stride_h_, const int stride_w_, double* idx_data,
-		double* top_data);
+    const double* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_h_, const int kernel_w_,
+    const int stride_h_, const int stride_w_, double* idx_data,
+    double* top_data);
 
 template <typename Dtype>
 void StoPoolForwardTest(const int count, const Dtype* bottom_data,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		Dtype* top_data) {
-	std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    Dtype* top_data) {
+  std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 
 }
 template void StoPoolForwardTest<float>(const int count,
-		const float* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_h_, const int kernel_w_,
-		const int stride_h_, const int stride_w_, float* top_data);
+    const float* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_h_, const int kernel_w_,
+    const int stride_h_, const int stride_w_, float* top_data);
 template void StoPoolForwardTest<double>(const int count,
-		const double* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_h_, const int kernel_w_,
-		const int stride_h_, const int stride_w_, double* top_data);
+    const double* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_h_, const int kernel_w_,
+    const int stride_h_, const int stride_w_, double* top_data);
 
 template <typename Dtype>
 void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
-		const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		const int pad_h_, const int pad_w_, Dtype* top_data) {
-	std::string kernel_name = "AvePoolForward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_);
-	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) count };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, Dtype* top_data) {
+  std::string kernel_name = "AvePoolForward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_);
+  ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) count };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void AvePoolForward<float>(const int count, const float* bottom_data,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		const int pad_h_, const int pad_w_, float* top_data);
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, float* top_data);
 template void AvePoolForward<double>(const int count, const double* bottom_data,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_h_,
-		const int kernel_w_, const int stride_h_, const int stride_w_,
-		const int pad_h_, const int pad_w_, double* top_data);
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, double* top_data);
 
 template <typename Dtype>
 void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
-		const Dtype* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		const int pad_, Dtype* top_data) {
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) count };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const Dtype* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    const int pad_, Dtype* top_data) {
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) count };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void ave_pool_fp_gpu<float>(cl_kernel Kernel, const int count,
-		const float* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		const int pad_, float* top_data);
+    const float* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    const int pad_, float* top_data);
 template void ave_pool_fp_gpu<double>(cl_kernel Kernel, const int count,
-		const double* bottom_data, const int clnum, const int channels_,
-		const int height_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		const int pad_, double* top_data);
+    const double* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    const int pad_, double* top_data);
 
 template <typename Dtype>
 void max_pool_bp_gpu(cl_kernel Kernel, const int count,
-		const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_size_,
-		const int stride_, Dtype* bottom_diff) {
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &clnum);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels_);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height_);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width_);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height_);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width_);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_size_);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) count };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_size_,
+    const int stride_, Dtype* bottom_diff) {
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &clnum);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels_);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height_);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width_);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height_);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width_);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_size_);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) count };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void max_pool_bp_gpu<float>(cl_kernel Kernel, const int count,
-		const float* bottom_data, const float* top_data, const float* top_diff,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_size_,
-		const int stride_, float* bottom_diff);
+    const float* bottom_data, const float* top_data, const float* top_diff,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_size_,
+    const int stride_, float* bottom_diff);
 template void max_pool_bp_gpu<double>(cl_kernel Kernel, const int count,
-		const double* bottom_data, const double* top_data, const double* top_diff,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_size_,
-		const int stride_, double* bottom_diff);
+    const double* bottom_data, const double* top_data, const double* top_diff,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_size_,
+    const int stride_, double* bottom_diff);
 
 template <typename Dtype>
 void MaxPoolBackward(const int nthreads, const Dtype* const top_diff,
-		const int* const mask, const Dtype* const top_mask, const int num,
-		const int channels, const int height, const int width,
-		const int pooled_height, const int pooled_width, const int kernel_h,
-		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-		const int pad_w, Dtype* const bottom_diff) {
-	std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &mask);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_mask);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_h);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &kernel_w);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_h);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &stride_w);
-	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &pad_h);
-	ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &pad_w);
-	ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int* const mask, const Dtype* const top_mask, const int num,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, Dtype* const bottom_diff) {
+  std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &mask);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_mask);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_h);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &kernel_w);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_h);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &stride_w);
+  ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &pad_h);
+  ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &pad_w);
+  ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void MaxPoolBackward<float>(const int nthreads,
-		const float* const top_diff, const int* const mask,
-		const float* const top_mask, const int num, const int channels,
-		const int height, const int width, const int pooled_height,
-		const int pooled_width, const int kernel_h, const int kernel_w,
-		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-		float* const bottom_diff);
+    const float* const top_diff, const int* const mask,
+    const float* const top_mask, const int num, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+    float* const bottom_diff);
 template void MaxPoolBackward<double>(const int nthreads,
-		const double* const top_diff, const int* const mask,
-		const double* const top_mask, const int num, const int channels,
-		const int height, const int width, const int pooled_height,
-		const int pooled_width, const int kernel_h, const int kernel_w,
-		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-		double* const bottom_diff);
+    const double* const top_diff, const int* const mask,
+    const double* const top_mask, const int num, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+    double* const bottom_diff);
 
 template <typename Dtype>
 void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
-		const int num, const int channels, const int height, const int width,
-		const int pooled_height, const int pooled_width, const int kernel_h,
-		const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
-		const int pad_w, Dtype* const bottom_diff) {
-	std::string kernel_name = "AvePoolBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &num);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w);
-	ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int num, const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+    const int pad_w, Dtype* const bottom_diff) {
+  std::string kernel_name = "AvePoolBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &num);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w);
+  ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void AvePoolBackward<float>(const int nthreads,
-		const float* const top_diff, const int num, const int channels,
-		const int height, const int width, const int pooled_height,
-		const int pooled_width, const int kernel_h, const int kernel_w,
-		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-		float* const bottom_diff);
+    const float* const top_diff, const int num, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+    float* const bottom_diff);
 template void AvePoolBackward<double>(const int nthreads,
-		const double* const top_diff, const int num, const int channels,
-		const int height, const int width, const int pooled_height,
-		const int pooled_width, const int kernel_h, const int kernel_w,
-		const int stride_h, const int stride_w, const int pad_h, const int pad_w,
-		double* const bottom_diff);
+    const double* const top_diff, const int num, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
+    double* const bottom_diff);
 
 template <typename Dtype>
 void StoPoolBackward(const int nthreads, const Dtype* const rand_idx,
-		const Dtype* const top_diff, const int num, const int channels,
-		const int height, const int width, const int pooled_height,
-		const int pooled_width, const int kernel_h, const int kernel_w,
-		const int stride_h, const int stride_w, Dtype* const bottom_diff) {
-	std::string kernel_name = "StoPoolBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &rand_idx);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_height);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_width);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_h);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_w);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_h);
-	ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_w);
-	ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const Dtype* const top_diff, const int num, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, Dtype* const bottom_diff) {
+  std::string kernel_name = "StoPoolBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &rand_idx);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_height);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_width);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_h);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_w);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_h);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_w);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void StoPoolBackward<float>(const int nthreads,
-		const float* const rand_idx, const float* const top_diff, const int num,
-		const int channels, const int height, const int width,
-		const int pooled_height, const int pooled_width, const int kernel_h,
-		const int kernel_w, const int stride_h, const int stride_w,
-		float* const bottom_diff);
+    const float* const rand_idx, const float* const top_diff, const int num,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w,
+    float* const bottom_diff);
 template void StoPoolBackward<double>(const int nthreads,
-		const double* const rand_idx, const double* const top_diff, const int num,
-		const int channels, const int height, const int width,
-		const int pooled_height, const int pooled_width, const int kernel_h,
-		const int kernel_w, const int stride_h, const int stride_w,
-		double* const bottom_diff);
+    const double* const rand_idx, const double* const top_diff, const int num,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w,
+    double* const bottom_diff);
 
 template <typename Dtype>
 void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
-		const int clnum, const int channels_, const int height_, const int width_,
-		const int pooled_height_, const int pooled_width_, const int kernel_size_,
-		const int stride_, const int pad_, Dtype* bottom_diff) {
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
-	ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
-	ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_);
-	ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) count };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_size_,
+    const int stride_, const int pad_, Dtype* bottom_diff) {
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) count };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void ave_pool_bp_gpu<float>(cl_kernel Kernel, const int count,
-		const float* top_diff, const int clnum, const int channels_,
-		const int intheight_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		const int pad_, float* bottom_diff);
+    const float* top_diff, const int clnum, const int channels_,
+    const int intheight_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    const int pad_, float* bottom_diff);
 template void ave_pool_bp_gpu<double>(cl_kernel Kernel, const int count,
-		const double* top_diff, const int clnum, const int channels_,
-		const int intheight_, const int width_, const int pooled_height_,
-		const int pooled_width_, const int kernel_size_, const int stride_,
-		const int pad_, double* bottom_diff);
+    const double* top_diff, const int clnum, const int channels_,
+    const int intheight_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    const int pad_, double* bottom_diff);
 
 template <typename Dtype>
 void PReLUForward(const int count, const int channels, const int dim,
-		const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
-		const int div_factor) {
-	std::string kernel_name = "PReLUForward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &slope_data);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &div_factor);
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
+    const int div_factor) {
+  std::string kernel_name = "PReLUForward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &slope_data);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &div_factor);
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void PReLUForward<float>(const int count, const int channels,
-		const int dim, const float* bottom_data, float* top_data,
-		const float* slope_data, const int div_factor);
+    const int dim, const float* bottom_data, float* top_data,
+    const float* slope_data, const int div_factor);
 template void PReLUForward<double>(const int count, const int channels,
-		const int dim, const double* bottom_data, double* top_data,
-		const double* slope_data, const int div_factor);
+    const int dim, const double* bottom_data, double* top_data,
+    const double* slope_data, const int div_factor);
 
 template <typename Dtype>
 void PReLUBackward(const int count, const int channels, const int dim,
-		const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
-		const Dtype* slope_data, const int div_factor) {
-	std::string kernel_name = "PReLUBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &slope_data);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &div_factor);
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
+    const Dtype* slope_data, const int div_factor) {
+  std::string kernel_name = "PReLUBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &slope_data);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &div_factor);
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void PReLUBackward<float>(const int count, const int channels,
-		const int dim, const float* top_diff, const float* bottom_data,
-		float* bottom_diff, const float* slope_data, const int div_factor);
+    const int dim, const float* top_diff, const float* bottom_data,
+    float* bottom_diff, const float* slope_data, const int div_factor);
 template void PReLUBackward<double>(const int count, const int channels,
-		const int dim, const double* top_diff, const double* bottom_data,
-		double* bottom_diff, const double* slope_data, const int div_factor);
+    const int dim, const double* top_diff, const double* bottom_data,
+    double* bottom_diff, const double* slope_data, const int div_factor);
 
 template <typename Dtype>
 void PReLUParamBackward(const int count, const Dtype* top_diff,
-		const int offset_out, const Dtype* bottom_data, const int offset_in,
-		Dtype* bottom_diff) {
-	std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offset_out);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data);
-	ret = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offset_in);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const int offset_out, const Dtype* bottom_data, const int offset_in,
+    Dtype* bottom_diff) {
+  std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret = clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offset_out);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data);
+  ret = clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offset_in);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void PReLUParamBackward<float>(const int count, const float* top_diff,
-		const int offset_out, const float* bottom_data, const int offset_in,
-		float* bottom_diff);
+    const int offset_out, const float* bottom_data, const int offset_in,
+    float* bottom_diff);
 template void PReLUParamBackward<double>(const int count,
-		const double* top_diff, const int offset_out, const double* bottom_data,
-		const int offset_in, double* bottom_diff);
+    const double* top_diff, const int offset_out, const double* bottom_data,
+    const int offset_in, double* bottom_diff);
 
 template <typename Dtype>
 void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
-		Dtype negative_slope) {
-	std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*) &negative_slope);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    Dtype negative_slope) {
+  std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*) &negative_slope);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void ReLUForward<float>(const int count, const float* bottom_data,
-		float* top_data, float negative_slope);
+    float* top_data, float negative_slope);
 template void ReLUForward<double>(const int count, const double* bottom_data,
-		double* top_data, double negative_slope);
+    double* top_data, double negative_slope);
 
 template <typename Dtype>
 void ReLUBackward(const int count, const Dtype* top_diff,
-		const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) {
-	std::string kernel_name = "ReLUBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*) &negative_slope);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) count };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) {
+  std::string kernel_name = "ReLUBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*) &negative_slope);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) count };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void ReLUBackward<float>(const int count, const float* top_diff,
-		const float* bottom_data, float* bottom_diff, float negative_slope);
+    const float* bottom_data, float* bottom_diff, float negative_slope);
 template void ReLUBackward<double>(const int count, const double* top_diff,
-		const double* bottom_data, double* bottom_diff, double negative_slope);
+    const double* bottom_data, double* bottom_diff, double negative_slope);
 
 template <typename Dtype>
 void SigmoidForward(const int count, const Dtype* bottom_data,
-		Dtype* top_data) {
-	std::string kernel_name = "SigmoidForward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    Dtype* top_data) {
+  std::string kernel_name = "SigmoidForward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void SigmoidForward<float>(const int count, const float* bottom_data,
-		float* top_data);
+    float* top_data);
 template void SigmoidForward<double>(const int count, const double* bottom_data,
-		double* top_data);
+    double* top_data);
 
 template <typename Dtype>
 void SigmoidBackward(const int count, const Dtype* top_diff,
-		const Dtype* top_data, Dtype* bottom_diff) {
-	std::string kernel_name = "SigmoidBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) count };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const Dtype* top_data, Dtype* bottom_diff) {
+  std::string kernel_name = "SigmoidBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) count };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void SigmoidBackward<float>(const int count, const float* top_diff,
-		const float* top_data, float* bottom_diff);
+    const float* top_data, float* bottom_diff);
 template void SigmoidBackward<double>(const int count, const double* top_diff,
-		const double* top_data, double* bottom_diff);
+    const double* top_data, double* bottom_diff);
 
 template <typename Dtype>
 void ThresholdForward(const int count, const Dtype threshold,
-		const Dtype* bottom_data, Dtype* top_data) {
-	std::string kernel_name = "ThresholdForward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &threshold);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const Dtype* bottom_data, Dtype* top_data) {
+  std::string kernel_name = "ThresholdForward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &threshold);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void ThresholdForward<float>(const int count, const float threshold,
-		const float* bottom_data, float* top_data);
+    const float* bottom_data, float* top_data);
 template void ThresholdForward<double>(const int count, const double threshold,
-		const double* bottom_data, double* top_data);
+    const double* bottom_data, double* top_data);
 
 template <typename Dtype>
 void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) {
-	std::string kernel_name = "TanHForward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  std::string kernel_name = "TanHForward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void TanHForward<float>(const int count, const float* bottom_data,
-		float* top_data);
+    float* top_data);
 template void TanHForward<double>(const int count, const double* bottom_data,
-		double* top_data);
+    double* top_data);
 
 template <typename Dtype>
 void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
-		Dtype* bottom_diff) {
-	std::string kernel_name = "TanHBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) count };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    Dtype* bottom_diff) {
+  std::string kernel_name = "TanHBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) count };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void TanHBackward<float>(const int count, const float* top_diff,
-		const float* top_data, float* bottom_diff);
+    const float* top_data, float* bottom_diff);
 template void TanHBackward<double>(const int count, const double* top_diff,
-		const double* top_data, double* bottom_diff);
+    const double* top_data, double* bottom_diff);
 
 template <typename Dtype>
 void opttrans(const Dtype* data_im, const int im_offset, const int channels,
-		const int height, const int width, Dtype* data_opt, const int opt_offset,
-		const int optnum) {
-	std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	int num_kernels = channels * height * width * optnum;
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt);
-	ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset);
-	ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum);
-	OCL_CHECK(ret);
-
-	size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int height, const int width, Dtype* data_opt, const int opt_offset,
+    const int optnum) {
+  std::string kernel_name = "opttrans" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  int num_kernels = channels * height * width * optnum;
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum);
+  OCL_CHECK(ret);
+
+  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 
 template void opttrans<float>(const float* data_im, const int im_offset,
-		const int channels,
-		const int height, const int width, float* data_opt, const int opt_offset,
-		const int optnum);
+    const int channels, const int height, const int width, float* data_opt,
+    const int opt_offset, const int optnum);
 template void opttrans<double>(const double* data_im, const int im_offset,
-		const int channels,
-		const int height, const int width, double* data_opt, const int opt_offset,
-		const int optnum);
-
-template <typename Dtype>
-void LRNFillScale(const int nthreads, const Dtype* const in,
-		const int num, const int channels, const int height,
-		const int width, const int size, const Dtype alpha_over_size,
-		const Dtype k, Dtype* const scale) {
-	std::string kernel_name = "LRNFillScale" + get_dtype_suffix<Dtype>();
-	cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in);
-	ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num);
-	ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size);
-	ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size);
-	ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k);
-	ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale);
-	OCL_CHECK(ret);
-	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    const int channels, const int height, const int width, double* data_opt,
+    const int opt_offset, const int optnum);
+
+template <typename Dtype>
+void LRNFillScale(const int nthreads, const Dtype* const in, const int num,
+    const int channels, const int height, const int width, const int size,
+    const Dtype alpha_over_size, const Dtype k, Dtype* const scale) {
+  std::string kernel_name = "LRNFillScale" + get_dtype_suffix<Dtype>();
+  cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in);
+  ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num);
+  ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size);
+  ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size);
+  ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k);
+  ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void LRNFillScale<float>(const int nthreads, const float* const in,
-		const int num, const int channels, const int height,
-		const int width, const int size, const float alpha_over_size,
-		const float k, float* const scale);
+    const int num, const int channels, const int height, const int width,
+    const int size, const float alpha_over_size, const float k,
+    float* const scale);
 template void LRNFillScale<double>(const int nthreads, const double* const in,
-		const int num, const int channels, const int height,
-		const int width, const int size, const double alpha_over_size,
-		const double k, double* const scale);
-
-template <typename Dtype>
-void LRNComputeOutput(int nthreads, const Dtype* in,
-		Dtype* scale, Dtype negative_beta, Dtype* out) {
-	std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix<Dtype>();
-	cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in);
-	ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale);
-	ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta);
-	ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out);
-	OCL_CHECK(ret);
-	size_t uiGlobal_Work_Size2[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size2[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,
-					uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
+    const int num, const int channels, const int height, const int width,
+    const int size, const double alpha_over_size, const double k,
+    double* const scale);
+
+template <typename Dtype>
+void LRNComputeOutput(int nthreads, const Dtype* in, Dtype* scale,
+    Dtype negative_beta, Dtype* out) {
+  std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix<Dtype>();
+  cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in);
+  ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale);
+  ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta);
+  ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size2[] = { (size_t) nthreads };
+  size_t uiLocal_Work_Size2[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL,
+          uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL));
 }
 template void LRNComputeOutput<float>(int nthreads, const float* in,
-		float* scale, float negative_beta, float* out);
+    float* scale, float negative_beta, float* out);
 template void LRNComputeOutput<double>(int nthreads, const double* in,
-		double* scale, double negative_beta, double* out);
-
-template <typename Dtype>
-void LRNComputeDiff(const int nthreads,
-		const Dtype* const bottom_data, const Dtype* const top_data,
-		const Dtype* const scale, const Dtype* const top_diff,
-		const int num, const int channels, const int height,
-		const int width, const int size, const Dtype negative_beta,
-		const Dtype cache_ratio, Dtype* const bottom_diff) {
-	std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix<Dtype>();
-	cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale);
-	ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num);
-	ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height);
-	ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width);
-	ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size);
-	ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta);
-	ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio);
-	ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-	size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
-	size_t uiLocal_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,
-					uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
+    double* scale, double negative_beta, double* out);
+
+template <typename Dtype>
+void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data,
+    const Dtype* const top_data, const Dtype* const scale,
+    const Dtype* const top_diff, const int num, const int channels,
+    const int height, const int width, const int size,
+    const Dtype negative_beta, const Dtype cache_ratio,
+    Dtype* const bottom_diff) {
+  std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix<Dtype>();
+  cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale);
+  ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num);
+  ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height);
+  ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width);
+  ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size);
+  ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta);
+  ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio);
+  ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+  size_t uiGlobal_Work_Size[] = { (size_t) nthreads };
+  size_t uiLocal_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL,
+          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
 }
 template void LRNComputeDiff<float>(const int nthreads,
-		const float* const bottom_data, const float* const top_data,
-		const float* const scale, const float* const top_diff,
-		const int num, const int channels, const int height,
-		const int width, const int size, const float negative_beta,
-		const float cache_ratio, float* const bottom_diff);
+    const float* const bottom_data, const float* const top_data,
+    const float* const scale, const float* const top_diff, const int num,
+    const int channels, const int height, const int width, const int size,
+    const float negative_beta, const float cache_ratio,
+    float* const bottom_diff);
 template void LRNComputeDiff<double>(const int nthreads,
-		const double* const bottom_data, const double* const top_data,
-		const double* const scale, const double* const top_diff,
-		const int num, const int channels, const int height,
-		const int width, const int size, const double negative_beta,
-		const double cache_ratio, double* const bottom_diff);
+    const double* const bottom_data, const double* const top_data,
+    const double* const scale, const double* const top_diff, const int num,
+    const int channels, const int height, const int width, const int size,
+    const double negative_beta, const double cache_ratio,
+    double* const bottom_diff);
 
 template <typename Dtype>
 void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) {
-	std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) n };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void caffe_gpu_add<float>(const int n, const float* in1,
-		const float* in2, float* y);
+    const float* in2, float* y);
 template void caffe_gpu_add<double>(const int n, const double* in1,
-		const double* in2, double* y);
+    const double* in2, double* y);
 
 template <typename Dtype>
 void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) {
-	std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) N };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) N };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void caffe_gpu_sign_ocl<float>(const int N, const float* X, float* Y);
 template void caffe_gpu_sign_ocl<double>(const int N, const double* X,
-		double* Y);
+    double* Y);
 
 template <typename Dtype>
 void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) {
-	std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) N };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) N };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void caffe_gpu_abs_ocl<float>(const int N, const float* X, float* Y);
 template void caffe_gpu_abs_ocl<double>(const int N, const double* X,
-		double* Y);
+    double* Y);
 
 template <typename Dtype>
 void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
-	std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  std::string kernel_name = "div" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) n };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void caffe_gpu_div<float>(const int n, const float* a, const float* b,
-		float* y);
+    float* y);
 template void caffe_gpu_div<double>(const int n, const double* a,
-		const double* b, double* y);
+    const double* b, double* y);
 
 template <typename Dtype>
 void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) {
-	std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  std::string kernel_name = "add_scalar" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) n };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void caffe_gpu_add_scalar<float>(const int n, const float alpha,
-		float* top_data);
+    float* top_data);
 template void caffe_gpu_add_scalar<double>(const int n, const double alpha,
-		double* top_data);
+    double* top_data);
 
 template <typename Dtype>
 void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y) {
-	std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  std::string kernel_name = "element_mul" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) n };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void caffe_gpu_mul<float>(const int n, const float* a, const float* b,
-		float* y);
+    float* y);
 template void caffe_gpu_mul<double>(const int n, const double* a,
-		const double* b, double* y);
+    const double* b, double* y);
 
 template <typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) {
-	std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
-	cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-	cl_int ret;
-	ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
-	ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
-	ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha);
-	ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
-	OCL_CHECK(ret);
-	size_t Global_Work_Size[] = { (size_t) n };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  std::string kernel_name = "powx" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) n };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void caffe_gpu_powx<float>(const int n, const float* a,
-		const float alpha, float* y);
+    const float alpha, float* y);
 template void caffe_gpu_powx<double>(const int n, const double* a,
-		const double alpha, double* y);
+    const double alpha, double* y);
 
 template <typename Dtype>
 void DropoutForward(const int count, const Dtype* bottom_data,
-		const int* MaskMem, const Dtype scale_, Dtype* top_data) {
-	std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+    const int* MaskMem, const Dtype scale_, Dtype* top_data) {
+  std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
-	ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_);
-	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
+  ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
 
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void DropoutForward<float>(const int count, const float* bottom_data,
-		const int* MaskMem, const float scale_, float* top_data);
+    const int* MaskMem, const float scale_, float* top_data);
 template void DropoutForward<double>(const int count, const double* bottom_data,
-		const int* MaskMem, const double scale_, double* top_data);
+    const int* MaskMem, const double scale_, double* top_data);
 
 template <typename Dtype>
 void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
-		const float threshold_, const Dtype scale_, Dtype* bottom_diff) {
-	std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_);
-	ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_);
-	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const float threshold_, const Dtype scale_, Dtype* bottom_diff) {
+  std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_);
+  ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void DropoutBackward<float>(const int count, const float* top_diff,
-		const int* MaskMem, const float threshold_, const float scale_,
-		float* bottom_diff);
+    const int* MaskMem, const float threshold_, const float scale_,
+    float* bottom_diff);
 template void DropoutBackward<double>(const int count, const double* top_diff,
-		const int* MaskMem, const float threshold_, const double scale_,
-		double* bottom_diff);
+    const int* MaskMem, const float threshold_, const double scale_,
+    double* bottom_diff);
 
 template <typename Dtype>
 void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) {
-	std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+  std::string kernel_name = "BNLLForward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data);
-	OCL_CHECK(ret);
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data);
+  OCL_CHECK(ret);
 
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void BNLLForward<float>(const int count, const float* bottom_data,
-		float *top_data);
+    float *top_data);
 template void BNLLForward<double>(const int count, const double* bottom_data,
-		double *top_data);
+    double *top_data);
 
 template <typename Dtype>
 void BNLLBackward(const int count, const Dtype* top_diff,
-		const Dtype* bottom_data, Dtype *bottom_diff) {
-	std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const Dtype* bottom_data, Dtype *bottom_diff) {
+  std::string kernel_name = "BNLLBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void BNLLBackward<float>(const int count, const float* top_diff,
-		const float* bottom_data, float *bottom_diff);
+    const float* bottom_data, float *bottom_diff);
 template void BNLLBackward<double>(const int count, const double* top_diff,
-		const double* bottom_data, double *bottom_diff);
+    const double* bottom_data, double *bottom_diff);
 
 template <typename Dtype>
 void Concat(const int nthreads, const Dtype* in_data, const bool forward,
-		const int num_concats, const int concat_size,
-		const int top_concat_axis, const int bottom_concat_axis,
-		const int offset_concat_axis, Dtype *out_data) {
-	std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-	int k_forward = (forward == true) ? 1 : 0;
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats);
-	ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size);
-	ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis);
-	ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis);
-	ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis);
-	ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) nthreads };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const int num_concats, const int concat_size, const int top_concat_axis,
+    const int bottom_concat_axis, const int offset_concat_axis,
+    Dtype *out_data) {
+  std::string kernel_name = "Concat" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+  int k_forward = (forward == true) ? 1 : 0;
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void Concat<float>(const int nthreads, const float* in_data,
-		const bool forward, const int num_concats, const int concat_size,
-		const int top_concat_axis, const int bottom_concat_axis,
-		const int offset_concat_axis, float *out_data);
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, float *out_data);
 template void Concat<double>(const int nthreads, const double* in_data,
-		const bool forward, const int num_concats, const int concat_size,
-		const int top_concat_axis, const int bottom_concat_axis,
-		const int offset_concat_axis, double *out_data);
-
-template <typename Dtype>
-void CLLBackward(const int count, const int channels,
-		const Dtype margin, const bool legacy_version, const Dtype alpha,
-		const Dtype* y, const Dtype* diff, const Dtype* dist_sq,
-		Dtype *bottom_diff) {
-	std::string kernel_name = "CLLBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*) &channels);
-	ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*) &margin);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*) &legacy_version);
-	ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &alpha);
-	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &y);
-	ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*) &diff);
-	ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*) &dist_sq);
-	ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) count };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, double *out_data);
+
+template <typename Dtype>
+void CLLBackward(const int count, const int channels, const Dtype margin,
+    const bool legacy_version, const Dtype alpha, const Dtype* y,
+    const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff) {
+  std::string kernel_name = "CLLBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*) &margin);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*) &legacy_version);
+  ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &alpha);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &y);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*) &diff);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*) &dist_sq);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void CLLBackward<float>(const int count, const int channels,
-		const float margin, const bool legacy_version, const float alpha,
-		const float* y, const float* diff, const float* dist_sq,
-		float *bottom_diff);
+    const float margin, const bool legacy_version, const float alpha,
+    const float* y, const float* diff, const float* dist_sq,
+    float *bottom_diff);
 template void CLLBackward<double>(const int count, const int channels,
-		const double margin, const bool legacy_version, const double alpha,
-		const double* y, const double* diff, const double* dist_sq,
-		double *bottom_diff);
+    const double margin, const bool legacy_version, const double alpha,
+    const double* y, const double* diff, const double* dist_sq,
+    double *bottom_diff);
 
 template <typename Dtype>
 void MaxForward(const int nthreads, const Dtype* bottom_data_a,
-		const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
-		int* mask) {
-	std::string kernel_name = "MaxForward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data_a);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data_b);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &blob_idx);
-	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
-	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &mask);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) nthreads };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+    int* mask) {
+  std::string kernel_name = "MaxForward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data_a);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data_b);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &blob_idx);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &mask);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void MaxForward<float>(const int nthreads, const float* bottom_data_a,
-		const float* bottom_data_b, const int blob_idx, float* top_data,
-		int* mask);
+    const float* bottom_data_b, const int blob_idx, float* top_data, int* mask);
 template void MaxForward<double>(const int nthreads,
-		const double* bottom_data_a,
-		const double* bottom_data_b, const int blob_idx, double* top_data,
-		int* mask);
-
-template <typename Dtype>
-void MaxBackward(const int nthreads, const Dtype* top_diff,
-		const int blob_idx, const int* mask, Dtype* bottom_diff) {
-	std::string kernel_name = "MaxBackward" + get_dtype_suffix<Dtype>();
-	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
-
-	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
-	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
-	ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &blob_idx);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &mask);
-	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &bottom_diff);
-	OCL_CHECK(ret);
-
-	size_t Global_Work_Size[] = { (size_t) nthreads };
-	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
-					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+    const double* bottom_data_a, const double* bottom_data_b,
+    const int blob_idx, double* top_data, int* mask);
+
+template <typename Dtype>
+void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx,
+    const int* mask, Dtype* bottom_diff) {
+  std::string kernel_name = "MaxBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &blob_idx);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &mask);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void MaxBackward<float>(const int nthreads, const float* top_diff,
-		const int blob_idx, const int* mask, float* bottom_diff);
+    const int blob_idx, const int* mask, float* bottom_diff);
 template void MaxBackward<double>(const int nthreads, const double* top_diff,
-		const int blob_idx, const int* mask, double* bottom_diff);
+    const int blob_idx, const int* mask, double* bottom_diff);
 
 template <typename Dtype>
 void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias,
-		int channel_in, int width, int height, int channel_out, int width_out,
-		int height_out, int kernel_w, int kernel_h, int stride, int pad,
-		int batch_sz) {
+    int channel_in, int width, int height, int channel_out, int width_out,
+    int height_out, int kernel_w, int kernel_h, int stride, int pad,
+    int batch_sz) {
 }
 template void ocl_conv<float>(float* bottom_data, float* top_data,
-		float* weights, float* bias, int channel_in, int width, int height,
-		int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
-		int stride, int pad, int batch_sz);
+    float* weights, float* bias, int channel_in, int width, int height,
+    int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+    int stride, int pad, int batch_sz);
 template void ocl_conv<double>(double* bottom_data, double* top_data,
-		double* weights, double* bias, int channel_in, int width, int height,
-		int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
-		int stride, int pad, int batch_sz);
+    double* weights, double* bias, int channel_in, int width, int height,
+    int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+    int stride, int pad, int batch_sz);
 
 }  // namespace caffe
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index da533cd9..028dd884 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -13,564 +13,562 @@
 namespace caffe {
 
 bool NetNeedsUpgrade(const NetParameter& net_param) {
-	return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param);
+  return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param);
 }
 
 bool NetNeedsV0ToV1Upgrade(const NetParameter& net_param) {
-	for (int i = 0; i < net_param.layers_size(); ++i) {
-		if (net_param.layers(i).has_layer()) {
-			return true;
-		}
-	}
-	return false;
+  for (int i = 0; i < net_param.layers_size(); ++i) {
+    if (net_param.layers(i).has_layer()) {
+      return true;
+    }
+  }
+  return false;
 }
 
 bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param) {
-	return net_param.layers_size() > 0;
+  return net_param.layers_size() > 0;
 }
 
 bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
-		NetParameter* net_param) {
-	// First upgrade padding layers to padded conv layers.
-	NetParameter v0_net_param;
-	UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param);
-	// Now upgrade layer parameters.
-	bool is_fully_compatible = true;
-	net_param->Clear();
-	if (v0_net_param.has_name()) {
-		net_param->set_name(v0_net_param.name());
-	}
-	for (int i = 0; i < v0_net_param.layers_size(); ++i) {
-		is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i),
-				net_param->add_layers());
-	}
-	for (int i = 0; i < v0_net_param.input_size(); ++i) {
-		net_param->add_input(v0_net_param.input(i));
-	}
-	for (int i = 0; i < v0_net_param.input_dim_size(); ++i) {
-		net_param->add_input_dim(v0_net_param.input_dim(i));
-	}
-	if (v0_net_param.has_force_backward()) {
-		net_param->set_force_backward(v0_net_param.force_backward());
-	}
-	return is_fully_compatible;
+    NetParameter* net_param) {
+  // First upgrade padding layers to padded conv layers.
+  NetParameter v0_net_param;
+  UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param);
+  // Now upgrade layer parameters.
+  bool is_fully_compatible = true;
+  net_param->Clear();
+  if (v0_net_param.has_name()) {
+    net_param->set_name(v0_net_param.name());
+  }
+  for (int i = 0; i < v0_net_param.layers_size(); ++i) {
+    is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i),
+        net_param->add_layers());
+  }
+  for (int i = 0; i < v0_net_param.input_size(); ++i) {
+    net_param->add_input(v0_net_param.input(i));
+  }
+  for (int i = 0; i < v0_net_param.input_dim_size(); ++i) {
+    net_param->add_input_dim(v0_net_param.input_dim(i));
+  }
+  if (v0_net_param.has_force_backward()) {
+    net_param->set_force_backward(v0_net_param.force_backward());
+  }
+  return is_fully_compatible;
 }
 
 void UpgradeV0PaddingLayers(const NetParameter& param,
-		NetParameter* param_upgraded_pad) {
-	// Copy everything other than the layers from the original param.
-	param_upgraded_pad->Clear();
-	param_upgraded_pad->CopyFrom(param);
-	param_upgraded_pad->clear_layers();
-	// Figure out which layer each bottom blob comes from.
-	map<string, int> blob_name_to_last_top_idx;
-	for (int i = 0; i < param.input_size(); ++i) {
-		const string& blob_name = param.input(i);
-		blob_name_to_last_top_idx[blob_name] = -1;
-	}
-	for (int i = 0; i < param.layers_size(); ++i) {
-		const V1LayerParameter& layer_connection = param.layers(i);
-		const V0LayerParameter& layer_param = layer_connection.layer();
-		// Add the layer to the new net, unless it's a padding layer.
-		if (layer_param.type() != "padding") {
-			param_upgraded_pad->add_layers()->CopyFrom(layer_connection);
-		}
-		for (int j = 0; j < layer_connection.bottom_size(); ++j) {
-			const string& blob_name = layer_connection.bottom(j);
-			if (blob_name_to_last_top_idx.find(blob_name) ==
-					blob_name_to_last_top_idx.end()) {
-				LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
-			}
-			const int top_idx = blob_name_to_last_top_idx[blob_name];
-			if (top_idx == -1) {
-				continue;
-			}
-			const V1LayerParameter& source_layer = param.layers(top_idx);
-			if (source_layer.layer().type() == "padding") {
-				// This layer has a padding layer as input -- check that it is a conv
-				// layer or a pooling layer and takes only one input.  Also check that
-				// the padding layer input has only one input and one output.  Other
-				// cases have undefined behavior in Caffe.
-				CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool"))
-						<< "Padding layer input to "
-								"non-convolutional / non-pooling layer type "
-						<< layer_param.type();
-				CHECK_EQ(layer_connection.bottom_size(), 1)
-						<< "Conv Layer takes a single blob as input.";
-				CHECK_EQ(source_layer.bottom_size(), 1)
-						<< "Padding Layer takes a single blob as input.";
-				CHECK_EQ(source_layer.top_size(), 1)
-						<< "Padding Layer produces a single blob as output.";
-				int layer_index = param_upgraded_pad->layers_size() - 1;
-				param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()
-						->set_pad(source_layer.layer().pad());
-				param_upgraded_pad->mutable_layers(layer_index)
-						->set_bottom(j, source_layer.bottom(0));
-			}
-		}
-		for (int j = 0; j < layer_connection.top_size(); ++j) {
-			const string& blob_name = layer_connection.top(j);
-			blob_name_to_last_top_idx[blob_name] = i;
-		}
-	}
+    NetParameter* param_upgraded_pad) {
+  // Copy everything other than the layers from the original param.
+  param_upgraded_pad->Clear();
+  param_upgraded_pad->CopyFrom(param);
+  param_upgraded_pad->clear_layers();
+  // Figure out which layer each bottom blob comes from.
+  map<string, int> blob_name_to_last_top_idx;
+  for (int i = 0; i < param.input_size(); ++i) {
+    const string& blob_name = param.input(i);
+    blob_name_to_last_top_idx[blob_name] = -1;
+  }
+  for (int i = 0; i < param.layers_size(); ++i) {
+    const V1LayerParameter& layer_connection = param.layers(i);
+    const V0LayerParameter& layer_param = layer_connection.layer();
+    // Add the layer to the new net, unless it's a padding layer.
+    if (layer_param.type() != "padding") {
+      param_upgraded_pad->add_layers()->CopyFrom(layer_connection);
+    }
+    for (int j = 0; j < layer_connection.bottom_size(); ++j) {
+      const string& blob_name = layer_connection.bottom(j);
+      if (blob_name_to_last_top_idx.find(blob_name)
+          == blob_name_to_last_top_idx.end()) {
+        LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
+      }
+      const int top_idx = blob_name_to_last_top_idx[blob_name];
+      if (top_idx == -1) {
+        continue;
+      }
+      const V1LayerParameter& source_layer = param.layers(top_idx);
+      if (source_layer.layer().type() == "padding") {
+        // This layer has a padding layer as input -- check that it is a conv
+        // layer or a pooling layer and takes only one input.  Also check that
+        // the padding layer input has only one input and one output.  Other
+        // cases have undefined behavior in Caffe.
+        CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool"))
+            << "Padding layer input to "
+                "non-convolutional / non-pooling layer type "
+            << layer_param.type();
+        CHECK_EQ(layer_connection.bottom_size(), 1)
+            << "Conv Layer takes a single blob as input.";
+        CHECK_EQ(source_layer.bottom_size(), 1)
+            << "Padding Layer takes a single blob as input.";
+        CHECK_EQ(source_layer.top_size(), 1)
+            << "Padding Layer produces a single blob as output.";
+        int layer_index = param_upgraded_pad->layers_size() - 1;
+        param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()->set_pad(
+            source_layer.layer().pad());
+        param_upgraded_pad->mutable_layers(layer_index)->set_bottom(j,
+            source_layer.bottom(0));
+      }
+    }
+    for (int j = 0; j < layer_connection.top_size(); ++j) {
+      const string& blob_name = layer_connection.top(j);
+      blob_name_to_last_top_idx[blob_name] = i;
+    }
+  }
 }
 
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-		V1LayerParameter* layer_param) {
-	bool is_fully_compatible = true;
-	layer_param->Clear();
-	for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) {
-		layer_param->add_bottom(v0_layer_connection.bottom(i));
-	}
-	for (int i = 0; i < v0_layer_connection.top_size(); ++i) {
-		layer_param->add_top(v0_layer_connection.top(i));
-	}
-	if (v0_layer_connection.has_layer()) {
-		const V0LayerParameter& v0_layer_param = v0_layer_connection.layer();
-		if (v0_layer_param.has_name()) {
-			layer_param->set_name(v0_layer_param.name());
-		}
-		const string& type = v0_layer_param.type();
-		if (v0_layer_param.has_type()) {
-			layer_param->set_type(UpgradeV0LayerType(type));
-		}
-		for (int i = 0; i < v0_layer_param.blobs_size(); ++i) {
-			layer_param->add_blobs()->CopyFrom(v0_layer_param.blobs(i));
-		}
-		for (int i = 0; i < v0_layer_param.blobs_lr_size(); ++i) {
-			layer_param->add_blobs_lr(v0_layer_param.blobs_lr(i));
-		}
-		for (int i = 0; i < v0_layer_param.weight_decay_size(); ++i) {
-			layer_param->add_weight_decay(v0_layer_param.weight_decay(i));
-		}
-		if (v0_layer_param.has_num_output()) {
-			if (type == "conv") {
-				layer_param->mutable_convolution_param()->set_num_output(
-						v0_layer_param.num_output());
-			} else if (type == "innerproduct") {
-				layer_param->mutable_inner_product_param()->set_num_output(
-						v0_layer_param.num_output());
-			} else {
-				LOG(ERROR) << "Unknown parameter num_output for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_biasterm()) {
-			if (type == "conv") {
-				layer_param->mutable_convolution_param()->set_bias_term(
-						v0_layer_param.biasterm());
-			} else if (type == "innerproduct") {
-				layer_param->mutable_inner_product_param()->set_bias_term(
-						v0_layer_param.biasterm());
-			} else {
-				LOG(ERROR) << "Unknown parameter biasterm for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_weight_filler()) {
-			if (type == "conv") {
-				layer_param->mutable_convolution_param()->
-						mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
-			} else if (type == "innerproduct") {
-				layer_param->mutable_inner_product_param()->
-						mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
-			} else {
-				LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_bias_filler()) {
-			if (type == "conv") {
-				layer_param->mutable_convolution_param()->
-						mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
-			} else if (type == "innerproduct") {
-				layer_param->mutable_inner_product_param()->
-						mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
-			} else {
-				LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_pad()) {
-			if (type == "conv") {
-				layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad());
-			} else if (type == "pool") {
-				layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad());
-			} else {
-				LOG(ERROR) << "Unknown parameter pad for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_kernelsize()) {
-			if (type == "conv") {
-				layer_param->mutable_convolution_param()->set_kernel_size(
-						v0_layer_param.kernelsize());
-			} else if (type == "pool") {
-				layer_param->mutable_pooling_param()->set_kernel_size(
-						v0_layer_param.kernelsize());
-			} else {
-				LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_group()) {
-			if (type == "conv") {
-				layer_param->mutable_convolution_param()->set_group(
-						v0_layer_param.group());
-			} else {
-				LOG(ERROR) << "Unknown parameter group for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_stride()) {
-			if (type == "conv") {
-				layer_param->mutable_convolution_param()->set_stride(
-						v0_layer_param.stride());
-			} else if (type == "pool") {
-				layer_param->mutable_pooling_param()->set_stride(
-						v0_layer_param.stride());
-			} else {
-				LOG(ERROR) << "Unknown parameter stride for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_pool()) {
-			if (type == "pool") {
-				V0LayerParameter_PoolMethod pool = v0_layer_param.pool();
-				switch (pool) {
-					case V0LayerParameter_PoolMethod_MAX:
-						layer_param->mutable_pooling_param()->set_pool(
-								PoolingParameter_PoolMethod_MAX);
-						break;
-					case V0LayerParameter_PoolMethod_AVE:
-						layer_param->mutable_pooling_param()->set_pool(
-								PoolingParameter_PoolMethod_AVE);
-						break;
-					case V0LayerParameter_PoolMethod_STOCHASTIC:
-						layer_param->mutable_pooling_param()->set_pool(
-								PoolingParameter_PoolMethod_STOCHASTIC);
-						break;
-					default:
-						LOG(ERROR) << "Unknown pool method " << pool;
-						is_fully_compatible = false;
-				}
-			} else {
-				LOG(ERROR) << "Unknown parameter pool for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_dropout_ratio()) {
-			if (type == "dropout") {
-				layer_param->mutable_dropout_param()->set_dropout_ratio(
-						v0_layer_param.dropout_ratio());
-			} else {
-				LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_local_size()) {
-			if (type == "lrn") {
-				layer_param->mutable_lrn_param()->set_local_size(
-						v0_layer_param.local_size());
-			} else {
-				LOG(ERROR) << "Unknown parameter local_size for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_alpha()) {
-			if (type == "lrn") {
-				layer_param->mutable_lrn_param()->set_alpha(v0_layer_param.alpha());
-			} else {
-				LOG(ERROR) << "Unknown parameter alpha for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_beta()) {
-			if (type == "lrn") {
-				layer_param->mutable_lrn_param()->set_beta(v0_layer_param.beta());
-			} else {
-				LOG(ERROR) << "Unknown parameter beta for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_k()) {
-			if (type == "lrn") {
-				layer_param->mutable_lrn_param()->set_k(v0_layer_param.k());
-			} else {
-				LOG(ERROR) << "Unknown parameter k for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_source()) {
-			if (type == "data") {
-				layer_param->mutable_data_param()->set_source(v0_layer_param.source());
-			} else if (type == "hdf5_data") {
-				layer_param->mutable_hdf5_data_param()->set_source(
-						v0_layer_param.source());
-			} else if (type == "images") {
-				layer_param->mutable_image_data_param()->set_source(
-						v0_layer_param.source());
-			} else if (type == "window_data") {
-				layer_param->mutable_window_data_param()->set_source(
-						v0_layer_param.source());
-			} else if (type == "infogain_loss") {
-				layer_param->mutable_infogain_loss_param()->set_source(
-						v0_layer_param.source());
-			} else {
-				LOG(ERROR) << "Unknown parameter source for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_scale()) {
-			layer_param->mutable_transform_param()->
-					set_scale(v0_layer_param.scale());
-		}
-		if (v0_layer_param.has_meanfile()) {
-			layer_param->mutable_transform_param()->
-					set_mean_file(v0_layer_param.meanfile());
-		}
-		if (v0_layer_param.has_batchsize()) {
-			if (type == "data") {
-				layer_param->mutable_data_param()->set_batch_size(
-						v0_layer_param.batchsize());
-			} else if (type == "hdf5_data") {
-				layer_param->mutable_hdf5_data_param()->set_batch_size(
-						v0_layer_param.batchsize());
-			} else if (type == "images") {
-				layer_param->mutable_image_data_param()->set_batch_size(
-						v0_layer_param.batchsize());
-			} else if (type == "window_data") {
-				layer_param->mutable_window_data_param()->set_batch_size(
-						v0_layer_param.batchsize());
-			} else {
-				LOG(ERROR) << "Unknown parameter batchsize for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_cropsize()) {
-			layer_param->mutable_transform_param()->
-					set_crop_size(v0_layer_param.cropsize());
-		}
-		if (v0_layer_param.has_mirror()) {
-			layer_param->mutable_transform_param()->
-					set_mirror(v0_layer_param.mirror());
-		}
-		if (v0_layer_param.has_rand_skip()) {
-			if (type == "data") {
-				layer_param->mutable_data_param()->set_rand_skip(
-						v0_layer_param.rand_skip());
-			} else if (type == "images") {
-				layer_param->mutable_image_data_param()->set_rand_skip(
-						v0_layer_param.rand_skip());
-			} else {
-				LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_shuffle_images()) {
-			if (type == "images") {
-				layer_param->mutable_image_data_param()->set_shuffle(
-						v0_layer_param.shuffle_images());
-			} else {
-				LOG(ERROR) << "Unknown parameter shuffle for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_new_height()) {
-			if (type == "images") {
-				layer_param->mutable_image_data_param()->set_new_height(
-						v0_layer_param.new_height());
-			} else {
-				LOG(ERROR) << "Unknown parameter new_height for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_new_width()) {
-			if (type == "images") {
-				layer_param->mutable_image_data_param()->set_new_width(
-						v0_layer_param.new_width());
-			} else {
-				LOG(ERROR) << "Unknown parameter new_width for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_concat_dim()) {
-			if (type == "concat") {
-				layer_param->mutable_concat_param()->set_concat_dim(
-						v0_layer_param.concat_dim());
-			} else {
-				LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_det_fg_threshold()) {
-			if (type == "window_data") {
-				layer_param->mutable_window_data_param()->set_fg_threshold(
-						v0_layer_param.det_fg_threshold());
-			} else {
-				LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type "
-						<< type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_det_bg_threshold()) {
-			if (type == "window_data") {
-				layer_param->mutable_window_data_param()->set_bg_threshold(
-						v0_layer_param.det_bg_threshold());
-			} else {
-				LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type "
-						<< type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_det_fg_fraction()) {
-			if (type == "window_data") {
-				layer_param->mutable_window_data_param()->set_fg_fraction(
-						v0_layer_param.det_fg_fraction());
-			} else {
-				LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type "
-						<< type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_det_context_pad()) {
-			if (type == "window_data") {
-				layer_param->mutable_window_data_param()->set_context_pad(
-						v0_layer_param.det_context_pad());
-			} else {
-				LOG(ERROR) << "Unknown parameter det_context_pad for layer type "
-						<< type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_det_crop_mode()) {
-			if (type == "window_data") {
-				layer_param->mutable_window_data_param()->set_crop_mode(
-						v0_layer_param.det_crop_mode());
-			} else {
-				LOG(ERROR) << "Unknown parameter det_crop_mode for layer type "
-						<< type;
-				is_fully_compatible = false;
-			}
-		}
-		if (v0_layer_param.has_hdf5_output_param()) {
-			if (type == "hdf5_output") {
-				layer_param->mutable_hdf5_output_param()->CopyFrom(
-						v0_layer_param.hdf5_output_param());
-			} else {
-				LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type "
-						<< type;
-				is_fully_compatible = false;
-			}
-		}
-	}
-	return is_fully_compatible;
+    V1LayerParameter* layer_param) {
+  bool is_fully_compatible = true;
+  layer_param->Clear();
+  for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) {
+    layer_param->add_bottom(v0_layer_connection.bottom(i));
+  }
+  for (int i = 0; i < v0_layer_connection.top_size(); ++i) {
+    layer_param->add_top(v0_layer_connection.top(i));
+  }
+  if (v0_layer_connection.has_layer()) {
+    const V0LayerParameter& v0_layer_param = v0_layer_connection.layer();
+    if (v0_layer_param.has_name()) {
+      layer_param->set_name(v0_layer_param.name());
+    }
+    const string& type = v0_layer_param.type();
+    if (v0_layer_param.has_type()) {
+      layer_param->set_type(UpgradeV0LayerType(type));
+    }
+    for (int i = 0; i < v0_layer_param.blobs_size(); ++i) {
+      layer_param->add_blobs()->CopyFrom(v0_layer_param.blobs(i));
+    }
+    for (int i = 0; i < v0_layer_param.blobs_lr_size(); ++i) {
+      layer_param->add_blobs_lr(v0_layer_param.blobs_lr(i));
+    }
+    for (int i = 0; i < v0_layer_param.weight_decay_size(); ++i) {
+      layer_param->add_weight_decay(v0_layer_param.weight_decay(i));
+    }
+    if (v0_layer_param.has_num_output()) {
+      if (type == "conv") {
+        layer_param->mutable_convolution_param()->set_num_output(
+            v0_layer_param.num_output());
+      } else if (type == "innerproduct") {
+        layer_param->mutable_inner_product_param()->set_num_output(
+            v0_layer_param.num_output());
+      } else {
+        LOG(ERROR) << "Unknown parameter num_output for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_biasterm()) {
+      if (type == "conv") {
+        layer_param->mutable_convolution_param()->set_bias_term(
+            v0_layer_param.biasterm());
+      } else if (type == "innerproduct") {
+        layer_param->mutable_inner_product_param()->set_bias_term(
+            v0_layer_param.biasterm());
+      } else {
+        LOG(ERROR) << "Unknown parameter biasterm for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_weight_filler()) {
+      if (type == "conv") {
+        layer_param->mutable_convolution_param()->mutable_weight_filler()->CopyFrom(
+            v0_layer_param.weight_filler());
+      } else if (type == "innerproduct") {
+        layer_param->mutable_inner_product_param()->mutable_weight_filler()->CopyFrom(
+            v0_layer_param.weight_filler());
+      } else {
+        LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_bias_filler()) {
+      if (type == "conv") {
+        layer_param->mutable_convolution_param()->mutable_bias_filler()->CopyFrom(
+            v0_layer_param.bias_filler());
+      } else if (type == "innerproduct") {
+        layer_param->mutable_inner_product_param()->mutable_bias_filler()->CopyFrom(
+            v0_layer_param.bias_filler());
+      } else {
+        LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_pad()) {
+      if (type == "conv") {
+        layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad());
+      } else if (type == "pool") {
+        layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad());
+      } else {
+        LOG(ERROR) << "Unknown parameter pad for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_kernelsize()) {
+      if (type == "conv") {
+        layer_param->mutable_convolution_param()->set_kernel_size(
+            v0_layer_param.kernelsize());
+      } else if (type == "pool") {
+        layer_param->mutable_pooling_param()->set_kernel_size(
+            v0_layer_param.kernelsize());
+      } else {
+        LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_group()) {
+      if (type == "conv") {
+        layer_param->mutable_convolution_param()->set_group(
+            v0_layer_param.group());
+      } else {
+        LOG(ERROR) << "Unknown parameter group for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_stride()) {
+      if (type == "conv") {
+        layer_param->mutable_convolution_param()->set_stride(
+            v0_layer_param.stride());
+      } else if (type == "pool") {
+        layer_param->mutable_pooling_param()->set_stride(
+            v0_layer_param.stride());
+      } else {
+        LOG(ERROR) << "Unknown parameter stride for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_pool()) {
+      if (type == "pool") {
+        V0LayerParameter_PoolMethod pool = v0_layer_param.pool();
+        switch (pool) {
+        case V0LayerParameter_PoolMethod_MAX:
+          layer_param->mutable_pooling_param()->set_pool(
+              PoolingParameter_PoolMethod_MAX);
+          break;
+        case V0LayerParameter_PoolMethod_AVE:
+          layer_param->mutable_pooling_param()->set_pool(
+              PoolingParameter_PoolMethod_AVE);
+          break;
+        case V0LayerParameter_PoolMethod_STOCHASTIC:
+          layer_param->mutable_pooling_param()->set_pool(
+              PoolingParameter_PoolMethod_STOCHASTIC);
+          break;
+        default:
+          LOG(ERROR) << "Unknown pool method " << pool;
+          is_fully_compatible = false;
+        }
+      } else {
+        LOG(ERROR) << "Unknown parameter pool for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_dropout_ratio()) {
+      if (type == "dropout") {
+        layer_param->mutable_dropout_param()->set_dropout_ratio(
+            v0_layer_param.dropout_ratio());
+      } else {
+        LOG(ERROR) << "Unknown parameter dropout_ratio for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_local_size()) {
+      if (type == "lrn") {
+        layer_param->mutable_lrn_param()->set_local_size(
+            v0_layer_param.local_size());
+      } else {
+        LOG(ERROR) << "Unknown parameter local_size for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_alpha()) {
+      if (type == "lrn") {
+        layer_param->mutable_lrn_param()->set_alpha(v0_layer_param.alpha());
+      } else {
+        LOG(ERROR) << "Unknown parameter alpha for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_beta()) {
+      if (type == "lrn") {
+        layer_param->mutable_lrn_param()->set_beta(v0_layer_param.beta());
+      } else {
+        LOG(ERROR) << "Unknown parameter beta for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_k()) {
+      if (type == "lrn") {
+        layer_param->mutable_lrn_param()->set_k(v0_layer_param.k());
+      } else {
+        LOG(ERROR) << "Unknown parameter k for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_source()) {
+      if (type == "data") {
+        layer_param->mutable_data_param()->set_source(v0_layer_param.source());
+      } else if (type == "hdf5_data") {
+        layer_param->mutable_hdf5_data_param()->set_source(
+            v0_layer_param.source());
+      } else if (type == "images") {
+        layer_param->mutable_image_data_param()->set_source(
+            v0_layer_param.source());
+      } else if (type == "window_data") {
+        layer_param->mutable_window_data_param()->set_source(
+            v0_layer_param.source());
+      } else if (type == "infogain_loss") {
+        layer_param->mutable_infogain_loss_param()->set_source(
+            v0_layer_param.source());
+      } else {
+        LOG(ERROR) << "Unknown parameter source for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_scale()) {
+      layer_param->mutable_transform_param()->set_scale(v0_layer_param.scale());
+    }
+    if (v0_layer_param.has_meanfile()) {
+      layer_param->mutable_transform_param()->set_mean_file(
+          v0_layer_param.meanfile());
+    }
+    if (v0_layer_param.has_batchsize()) {
+      if (type == "data") {
+        layer_param->mutable_data_param()->set_batch_size(
+            v0_layer_param.batchsize());
+      } else if (type == "hdf5_data") {
+        layer_param->mutable_hdf5_data_param()->set_batch_size(
+            v0_layer_param.batchsize());
+      } else if (type == "images") {
+        layer_param->mutable_image_data_param()->set_batch_size(
+            v0_layer_param.batchsize());
+      } else if (type == "window_data") {
+        layer_param->mutable_window_data_param()->set_batch_size(
+            v0_layer_param.batchsize());
+      } else {
+        LOG(ERROR) << "Unknown parameter batchsize for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_cropsize()) {
+      layer_param->mutable_transform_param()->set_crop_size(
+          v0_layer_param.cropsize());
+    }
+    if (v0_layer_param.has_mirror()) {
+      layer_param->mutable_transform_param()->set_mirror(
+          v0_layer_param.mirror());
+    }
+    if (v0_layer_param.has_rand_skip()) {
+      if (type == "data") {
+        layer_param->mutable_data_param()->set_rand_skip(
+            v0_layer_param.rand_skip());
+      } else if (type == "images") {
+        layer_param->mutable_image_data_param()->set_rand_skip(
+            v0_layer_param.rand_skip());
+      } else {
+        LOG(ERROR) << "Unknown parameter rand_skip for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_shuffle_images()) {
+      if (type == "images") {
+        layer_param->mutable_image_data_param()->set_shuffle(
+            v0_layer_param.shuffle_images());
+      } else {
+        LOG(ERROR) << "Unknown parameter shuffle for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_new_height()) {
+      if (type == "images") {
+        layer_param->mutable_image_data_param()->set_new_height(
+            v0_layer_param.new_height());
+      } else {
+        LOG(ERROR) << "Unknown parameter new_height for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_new_width()) {
+      if (type == "images") {
+        layer_param->mutable_image_data_param()->set_new_width(
+            v0_layer_param.new_width());
+      } else {
+        LOG(ERROR) << "Unknown parameter new_width for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_concat_dim()) {
+      if (type == "concat") {
+        layer_param->mutable_concat_param()->set_concat_dim(
+            v0_layer_param.concat_dim());
+      } else {
+        LOG(ERROR) << "Unknown parameter concat_dim for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_det_fg_threshold()) {
+      if (type == "window_data") {
+        layer_param->mutable_window_data_param()->set_fg_threshold(
+            v0_layer_param.det_fg_threshold());
+      } else {
+        LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type "
+            << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_det_bg_threshold()) {
+      if (type == "window_data") {
+        layer_param->mutable_window_data_param()->set_bg_threshold(
+            v0_layer_param.det_bg_threshold());
+      } else {
+        LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type "
+            << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_det_fg_fraction()) {
+      if (type == "window_data") {
+        layer_param->mutable_window_data_param()->set_fg_fraction(
+            v0_layer_param.det_fg_fraction());
+      } else {
+        LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type "
+            << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_det_context_pad()) {
+      if (type == "window_data") {
+        layer_param->mutable_window_data_param()->set_context_pad(
+            v0_layer_param.det_context_pad());
+      } else {
+        LOG(ERROR) << "Unknown parameter det_context_pad for layer type "
+            << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_det_crop_mode()) {
+      if (type == "window_data") {
+        layer_param->mutable_window_data_param()->set_crop_mode(
+            v0_layer_param.det_crop_mode());
+      } else {
+        LOG(ERROR) << "Unknown parameter det_crop_mode for layer type " << type;
+        is_fully_compatible = false;
+      }
+    }
+    if (v0_layer_param.has_hdf5_output_param()) {
+      if (type == "hdf5_output") {
+        layer_param->mutable_hdf5_output_param()->CopyFrom(
+            v0_layer_param.hdf5_output_param());
+      } else {
+        LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type "
+            << type;
+        is_fully_compatible = false;
+      }
+    }
+  }
+  return is_fully_compatible;
 }
 
 V1LayerParameter_LayerType UpgradeV0LayerType(const string& type) {
-	if (type == "accuracy") {
-		return V1LayerParameter_LayerType_ACCURACY;
-	} else if (type == "bnll") {
-		return V1LayerParameter_LayerType_BNLL;
-	} else if (type == "concat") {
-		return V1LayerParameter_LayerType_CONCAT;
-	} else if (type == "conv") {
-		return V1LayerParameter_LayerType_CONVOLUTION;
-	} else if (type == "data") {
-		return V1LayerParameter_LayerType_DATA;
-	} else if (type == "dropout") {
-		return V1LayerParameter_LayerType_DROPOUT;
-	} else if (type == "euclidean_loss") {
-		return V1LayerParameter_LayerType_EUCLIDEAN_LOSS;
-	} else if (type == "flatten") {
-		return V1LayerParameter_LayerType_FLATTEN;
-	} else if (type == "hdf5_data") {
-		return V1LayerParameter_LayerType_HDF5_DATA;
-	} else if (type == "hdf5_output") {
-		return V1LayerParameter_LayerType_HDF5_OUTPUT;
-	} else if (type == "im2col") {
-		return V1LayerParameter_LayerType_IM2COL;
-	} else if (type == "images") {
-		return V1LayerParameter_LayerType_IMAGE_DATA;
-	} else if (type == "infogain_loss") {
-		return V1LayerParameter_LayerType_INFOGAIN_LOSS;
-	} else if (type == "innerproduct") {
-		return V1LayerParameter_LayerType_INNER_PRODUCT;
-	} else if (type == "lrn") {
-		return V1LayerParameter_LayerType_LRN;
-	} else if (type == "multinomial_logistic_loss") {
-		return V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS;
-	} else if (type == "pool") {
-		return V1LayerParameter_LayerType_POOLING;
-	} else if (type == "relu") {
-		return V1LayerParameter_LayerType_RELU;
-	} else if (type == "sigmoid") {
-		return V1LayerParameter_LayerType_SIGMOID;
-	} else if (type == "softmax") {
-		return V1LayerParameter_LayerType_SOFTMAX;
-	} else if (type == "softmax_loss") {
-		return V1LayerParameter_LayerType_SOFTMAX_LOSS;
-	} else if (type == "split") {
-		return V1LayerParameter_LayerType_SPLIT;
-	} else if (type == "tanh") {
-		return V1LayerParameter_LayerType_TANH;
-	} else if (type == "window_data") {
-		return V1LayerParameter_LayerType_WINDOW_DATA;
-	} else {
-		LOG(FATAL) << "Unknown layer name: " << type;
-		return V1LayerParameter_LayerType_NONE;
-	}
+  if (type == "accuracy") {
+    return V1LayerParameter_LayerType_ACCURACY;
+  } else if (type == "bnll") {
+    return V1LayerParameter_LayerType_BNLL;
+  } else if (type == "concat") {
+    return V1LayerParameter_LayerType_CONCAT;
+  } else if (type == "conv") {
+    return V1LayerParameter_LayerType_CONVOLUTION;
+  } else if (type == "data") {
+    return V1LayerParameter_LayerType_DATA;
+  } else if (type == "dropout") {
+    return V1LayerParameter_LayerType_DROPOUT;
+  } else if (type == "euclidean_loss") {
+    return V1LayerParameter_LayerType_EUCLIDEAN_LOSS;
+  } else if (type == "flatten") {
+    return V1LayerParameter_LayerType_FLATTEN;
+  } else if (type == "hdf5_data") {
+    return V1LayerParameter_LayerType_HDF5_DATA;
+  } else if (type == "hdf5_output") {
+    return V1LayerParameter_LayerType_HDF5_OUTPUT;
+  } else if (type == "im2col") {
+    return V1LayerParameter_LayerType_IM2COL;
+  } else if (type == "images") {
+    return V1LayerParameter_LayerType_IMAGE_DATA;
+  } else if (type == "infogain_loss") {
+    return V1LayerParameter_LayerType_INFOGAIN_LOSS;
+  } else if (type == "innerproduct") {
+    return V1LayerParameter_LayerType_INNER_PRODUCT;
+  } else if (type == "lrn") {
+    return V1LayerParameter_LayerType_LRN;
+  } else if (type == "multinomial_logistic_loss") {
+    return V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS;
+  } else if (type == "pool") {
+    return V1LayerParameter_LayerType_POOLING;
+  } else if (type == "relu") {
+    return V1LayerParameter_LayerType_RELU;
+  } else if (type == "sigmoid") {
+    return V1LayerParameter_LayerType_SIGMOID;
+  } else if (type == "softmax") {
+    return V1LayerParameter_LayerType_SOFTMAX;
+  } else if (type == "softmax_loss") {
+    return V1LayerParameter_LayerType_SOFTMAX_LOSS;
+  } else if (type == "split") {
+    return V1LayerParameter_LayerType_SPLIT;
+  } else if (type == "tanh") {
+    return V1LayerParameter_LayerType_TANH;
+  } else if (type == "window_data") {
+    return V1LayerParameter_LayerType_WINDOW_DATA;
+  } else {
+    LOG(FATAL) << "Unknown layer name: " << type;
+    return V1LayerParameter_LayerType_NONE;
+  }
 }
 
 bool NetNeedsDataUpgrade(const NetParameter& net_param) {
-	for (int i = 0; i < net_param.layers_size(); ++i) {
-		if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) {
-			DataParameter layer_param = net_param.layers(i).data_param();
-			if (layer_param.has_scale()) {
-				return true;
-			}
-			if (layer_param.has_mean_file()) {
-				return true;
-			}
-			if (layer_param.has_crop_size()) {
-				return true;
-			}
-			if (layer_param.has_mirror()) {
-				return true;
-			}
-		}
-		if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) {
-			ImageDataParameter layer_param = net_param.layers(i).image_data_param();
-			if (layer_param.has_scale()) {
-				return true;
-			}
-			if (layer_param.has_mean_file()) {
-				return true;
-			}
-			if (layer_param.has_crop_size()) {
-				return true;
-			}
-			if (layer_param.has_mirror()) {
-				return true;
-			}
-		}
-		if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) {
-			WindowDataParameter layer_param = net_param.layers(i).window_data_param();
-			if (layer_param.has_scale()) {
-				return true;
-			}
-			if (layer_param.has_mean_file()) {
-				return true;
-			}
-			if (layer_param.has_crop_size()) {
-				return true;
-			}
-			if (layer_param.has_mirror()) {
-				return true;
-			}
-		}
-	}
-	return false;
+  for (int i = 0; i < net_param.layers_size(); ++i) {
+    if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) {
+      DataParameter layer_param = net_param.layers(i).data_param();
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
+    }
+    if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) {
+      ImageDataParameter layer_param = net_param.layers(i).image_data_param();
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
+    }
+    if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) {
+      WindowDataParameter layer_param = net_param.layers(i).window_data_param();
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
+    }
+  }
+  return false;
 }
 
 #define CONVERT_LAYER_TRANSFORM_PARAM(TYPE, Name, param_name) \
@@ -600,373 +598,364 @@ bool NetNeedsDataUpgrade(const NetParameter& net_param) {
   } while (0)
 
 void UpgradeNetDataTransformation(NetParameter* net_param) {
-	for (int i = 0; i < net_param->layers_size(); ++i) {
-		CONVERT_LAYER_TRANSFORM_PARAM(DATA, Data, data);
-		CONVERT_LAYER_TRANSFORM_PARAM(IMAGE_DATA, ImageData, image_data);
-		CONVERT_LAYER_TRANSFORM_PARAM(WINDOW_DATA, WindowData, window_data);
-	}
+  for (int i = 0; i < net_param->layers_size(); ++i) {
+    CONVERT_LAYER_TRANSFORM_PARAM(DATA, Data, data);
+    CONVERT_LAYER_TRANSFORM_PARAM(IMAGE_DATA, ImageData, image_data);
+    CONVERT_LAYER_TRANSFORM_PARAM(WINDOW_DATA, WindowData, window_data);
+  }
 }
 
 bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
-	bool success = true;
-	if (NetNeedsV0ToV1Upgrade(*param)) {
-		// NetParameter was specified using the old style (V0LayerParameter); try to
-		// upgrade it.
-		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-				<< "V0LayerParameter: " << param_file;
-		NetParameter original_param(*param);
-		if (!UpgradeV0Net(original_param, param)) {
-			success = false;
-			LOG(ERROR) << "Warning: had one or more problems upgrading "
-					<< "V0NetParameter to NetParameter (see above); continuing anyway.";
-		} else {
-			LOG(INFO) << "Successfully upgraded file specified using deprecated "
-					<< "V0LayerParameter";
-		}
-		LOG(ERROR) << "Note that future Caffe releases will not support "
-				<< "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
-				<< "prototxt and ./build/tools/upgrade_net_proto_binary for model "
-				<< "weights upgrade this and any other net protos to the new format.";
-	}
-	// NetParameter uses old style data transformation fields; try to upgrade it.
-	if (NetNeedsDataUpgrade(*param)) {
-		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-				<< "transformation parameters: " << param_file;
-		UpgradeNetDataTransformation(param);
-		LOG(INFO) << "Successfully upgraded file specified using deprecated "
-				<< "data transformation parameters.";
-		LOG(ERROR) << "Note that future Caffe releases will only support "
-				<< "transform_param messages for transformation fields.";
-	}
-	if (NetNeedsV1ToV2Upgrade(*param)) {
-		LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-				<< "V1LayerParameter: " << param_file;
-		NetParameter original_param(*param);
-		if (!UpgradeV1Net(original_param, param)) {
-			success = false;
-			LOG(ERROR) << "Warning: had one or more problems upgrading "
-					<< "V1LayerParameter (see above); continuing anyway.";
-		} else {
-			LOG(INFO) << "Successfully upgraded file specified using deprecated "
-					<< "V1LayerParameter";
-		}
-	}
-	return success;
+  bool success = true;
+  if (NetNeedsV0ToV1Upgrade(*param)) {
+    // NetParameter was specified using the old style (V0LayerParameter); try to
+    // upgrade it.
+    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
+        << "V0LayerParameter: " << param_file;
+    NetParameter original_param(*param);
+    if (!UpgradeV0Net(original_param, param)) {
+      success = false;
+      LOG(ERROR) << "Warning: had one or more problems upgrading "
+          << "V0NetParameter to NetParameter (see above); continuing anyway.";
+    } else {
+      LOG(INFO) << "Successfully upgraded file specified using deprecated "
+          << "V0LayerParameter";
+    }
+    LOG(ERROR) << "Note that future Caffe releases will not support "
+        << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
+        << "prototxt and ./build/tools/upgrade_net_proto_binary for model "
+        << "weights upgrade this and any other net protos to the new format.";
+  }
+  // NetParameter uses old style data transformation fields; try to upgrade it.
+  if (NetNeedsDataUpgrade(*param)) {
+    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
+        << "transformation parameters: " << param_file;
+    UpgradeNetDataTransformation(param);
+    LOG(INFO) << "Successfully upgraded file specified using deprecated "
+        << "data transformation parameters.";
+    LOG(ERROR) << "Note that future Caffe releases will only support "
+        << "transform_param messages for transformation fields.";
+  }
+  if (NetNeedsV1ToV2Upgrade(*param)) {
+    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
+        << "V1LayerParameter: " << param_file;
+    NetParameter original_param(*param);
+    if (!UpgradeV1Net(original_param, param)) {
+      success = false;
+      LOG(ERROR) << "Warning: had one or more problems upgrading "
+          << "V1LayerParameter (see above); continuing anyway.";
+    } else {
+      LOG(INFO) << "Successfully upgraded file specified using deprecated "
+          << "V1LayerParameter";
+    }
+  }
+  return success;
 }
 
 bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
-	bool is_fully_compatible = true;
-	if (v1_net_param.layer_size() > 0) {
-		LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' "
-				<< "fields; these will be ignored for the upgrade.";
-		is_fully_compatible = false;
-	}
-	net_param->CopyFrom(v1_net_param);
-	net_param->clear_layers();
-	net_param->clear_layer();
-	for (int i = 0; i < v1_net_param.layers_size(); ++i) {
-		if (!UpgradeV1LayerParameter(v1_net_param.layers(i),
-				net_param->add_layer())) {
-			LOG(ERROR) << "Upgrade of input layer " << i << " failed.";
-			is_fully_compatible = false;
-		}
-	}
-	return is_fully_compatible;
+  bool is_fully_compatible = true;
+  if (v1_net_param.layer_size() > 0) {
+    LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' "
+        << "fields; these will be ignored for the upgrade.";
+    is_fully_compatible = false;
+  }
+  net_param->CopyFrom(v1_net_param);
+  net_param->clear_layers();
+  net_param->clear_layer();
+  for (int i = 0; i < v1_net_param.layers_size(); ++i) {
+    if (!UpgradeV1LayerParameter(v1_net_param.layers(i),
+        net_param->add_layer())) {
+      LOG(ERROR) << "Upgrade of input layer " << i << " failed.";
+      is_fully_compatible = false;
+    }
+  }
+  return is_fully_compatible;
 }
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-		LayerParameter* layer_param) {
-	layer_param->Clear();
-	bool is_fully_compatible = true;
-	for (int i = 0; i < v1_layer_param.bottom_size(); ++i) {
-		layer_param->add_bottom(v1_layer_param.bottom(i));
-	}
-	for (int i = 0; i < v1_layer_param.top_size(); ++i) {
-		layer_param->add_top(v1_layer_param.top(i));
-	}
-	if (v1_layer_param.has_name()) {
-		layer_param->set_name(v1_layer_param.name());
-	}
-	for (int i = 0; i < v1_layer_param.include_size(); ++i) {
-		layer_param->add_include()->CopyFrom(v1_layer_param.include(i));
-	}
-	for (int i = 0; i < v1_layer_param.exclude_size(); ++i) {
-		layer_param->add_exclude()->CopyFrom(v1_layer_param.exclude(i));
-	}
-	if (v1_layer_param.has_type()) {
-		layer_param->set_type(UpgradeV1LayerType(v1_layer_param.type()));
-	}
-	for (int i = 0; i < v1_layer_param.blobs_size(); ++i) {
-		layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i));
-	}
-	for (int i = 0; i < v1_layer_param.param_size(); ++i) {
-		while (layer_param->param_size() <= i) {
-			layer_param->add_param();
-		}
-		layer_param->mutable_param(i)->set_name(v1_layer_param.param(i));
-	}
-	ParamSpec_DimCheckMode mode;
-	for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) {
-		while (layer_param->param_size() <= i) {
-			layer_param->add_param();
-		}
-		switch (v1_layer_param.blob_share_mode(i)) {
-			case V1LayerParameter_DimCheckMode_STRICT:
-				mode = ParamSpec_DimCheckMode_STRICT;
-				break;
-			case V1LayerParameter_DimCheckMode_PERMISSIVE:
-				mode = ParamSpec_DimCheckMode_PERMISSIVE;
-				break;
-			default:
-				LOG(FATAL) << "Unknown blob_share_mode: "
-						<< v1_layer_param.blob_share_mode(i);
-				break;
-		}
-		layer_param->mutable_param(i)->set_share_mode(mode);
-	}
-	for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) {
-		while (layer_param->param_size() <= i) {
-			layer_param->add_param();
-		}
-		layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i));
-	}
-	for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) {
-		while (layer_param->param_size() <= i) {
-			layer_param->add_param();
-		}
-		layer_param->mutable_param(i)->set_decay_mult(
-				v1_layer_param.weight_decay(i));
-	}
-	for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) {
-		layer_param->add_loss_weight(v1_layer_param.loss_weight(i));
-	}
-	if (v1_layer_param.has_accuracy_param()) {
-		layer_param->mutable_accuracy_param()->CopyFrom(
-				v1_layer_param.accuracy_param());
-	}
-	if (v1_layer_param.has_argmax_param()) {
-		layer_param->mutable_argmax_param()->CopyFrom(
-				v1_layer_param.argmax_param());
-	}
-	if (v1_layer_param.has_concat_param()) {
-		layer_param->mutable_concat_param()->CopyFrom(
-				v1_layer_param.concat_param());
-	}
-	if (v1_layer_param.has_contrastive_loss_param()) {
-		layer_param->mutable_contrastive_loss_param()->CopyFrom(
-				v1_layer_param.contrastive_loss_param());
-	}
-	if (v1_layer_param.has_convolution_param()) {
-		layer_param->mutable_convolution_param()->CopyFrom(
-				v1_layer_param.convolution_param());
-	}
-	if (v1_layer_param.has_data_param()) {
-		layer_param->mutable_data_param()->CopyFrom(
-				v1_layer_param.data_param());
-	}
-	if (v1_layer_param.has_dropout_param()) {
-		layer_param->mutable_dropout_param()->CopyFrom(
-				v1_layer_param.dropout_param());
-	}
-	if (v1_layer_param.has_dummy_data_param()) {
-		layer_param->mutable_dummy_data_param()->CopyFrom(
-				v1_layer_param.dummy_data_param());
-	}
-	if (v1_layer_param.has_eltwise_param()) {
-		layer_param->mutable_eltwise_param()->CopyFrom(
-				v1_layer_param.eltwise_param());
-	}
-	if (v1_layer_param.has_exp_param()) {
-		layer_param->mutable_exp_param()->CopyFrom(
-				v1_layer_param.exp_param());
-	}
-	if (v1_layer_param.has_hdf5_data_param()) {
-		layer_param->mutable_hdf5_data_param()->CopyFrom(
-				v1_layer_param.hdf5_data_param());
-	}
-	if (v1_layer_param.has_hdf5_output_param()) {
-		layer_param->mutable_hdf5_output_param()->CopyFrom(
-				v1_layer_param.hdf5_output_param());
-	}
-	if (v1_layer_param.has_hinge_loss_param()) {
-		layer_param->mutable_hinge_loss_param()->CopyFrom(
-				v1_layer_param.hinge_loss_param());
-	}
-	if (v1_layer_param.has_image_data_param()) {
-		layer_param->mutable_image_data_param()->CopyFrom(
-				v1_layer_param.image_data_param());
-	}
-	if (v1_layer_param.has_infogain_loss_param()) {
-		layer_param->mutable_infogain_loss_param()->CopyFrom(
-				v1_layer_param.infogain_loss_param());
-	}
-	if (v1_layer_param.has_inner_product_param()) {
-		layer_param->mutable_inner_product_param()->CopyFrom(
-				v1_layer_param.inner_product_param());
-	}
-	if (v1_layer_param.has_lrn_param()) {
-		layer_param->mutable_lrn_param()->CopyFrom(
-				v1_layer_param.lrn_param());
-	}
-	if (v1_layer_param.has_memory_data_param()) {
-		layer_param->mutable_memory_data_param()->CopyFrom(
-				v1_layer_param.memory_data_param());
-	}
-	if (v1_layer_param.has_mvn_param()) {
-		layer_param->mutable_mvn_param()->CopyFrom(
-				v1_layer_param.mvn_param());
-	}
-	if (v1_layer_param.has_pooling_param()) {
-		layer_param->mutable_pooling_param()->CopyFrom(
-				v1_layer_param.pooling_param());
-	}
-	if (v1_layer_param.has_power_param()) {
-		layer_param->mutable_power_param()->CopyFrom(
-				v1_layer_param.power_param());
-	}
-	if (v1_layer_param.has_relu_param()) {
-		layer_param->mutable_relu_param()->CopyFrom(
-				v1_layer_param.relu_param());
-	}
-	if (v1_layer_param.has_sigmoid_param()) {
-		layer_param->mutable_sigmoid_param()->CopyFrom(
-				v1_layer_param.sigmoid_param());
-	}
-	if (v1_layer_param.has_softmax_param()) {
-		layer_param->mutable_softmax_param()->CopyFrom(
-				v1_layer_param.softmax_param());
-	}
-	if (v1_layer_param.has_slice_param()) {
-		layer_param->mutable_slice_param()->CopyFrom(
-				v1_layer_param.slice_param());
-	}
-	if (v1_layer_param.has_tanh_param()) {
-		layer_param->mutable_tanh_param()->CopyFrom(
-				v1_layer_param.tanh_param());
-	}
-	if (v1_layer_param.has_threshold_param()) {
-		layer_param->mutable_threshold_param()->CopyFrom(
-				v1_layer_param.threshold_param());
-	}
-	if (v1_layer_param.has_window_data_param()) {
-		layer_param->mutable_window_data_param()->CopyFrom(
-				v1_layer_param.window_data_param());
-	}
-	if (v1_layer_param.has_transform_param()) {
-		layer_param->mutable_transform_param()->CopyFrom(
-				v1_layer_param.transform_param());
-	}
-	if (v1_layer_param.has_loss_param()) {
-		layer_param->mutable_loss_param()->CopyFrom(
-				v1_layer_param.loss_param());
-	}
-	if (v1_layer_param.has_layer()) {
-		LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring.";
-		is_fully_compatible = false;
-	}
-	return is_fully_compatible;
+    LayerParameter* layer_param) {
+  layer_param->Clear();
+  bool is_fully_compatible = true;
+  for (int i = 0; i < v1_layer_param.bottom_size(); ++i) {
+    layer_param->add_bottom(v1_layer_param.bottom(i));
+  }
+  for (int i = 0; i < v1_layer_param.top_size(); ++i) {
+    layer_param->add_top(v1_layer_param.top(i));
+  }
+  if (v1_layer_param.has_name()) {
+    layer_param->set_name(v1_layer_param.name());
+  }
+  for (int i = 0; i < v1_layer_param.include_size(); ++i) {
+    layer_param->add_include()->CopyFrom(v1_layer_param.include(i));
+  }
+  for (int i = 0; i < v1_layer_param.exclude_size(); ++i) {
+    layer_param->add_exclude()->CopyFrom(v1_layer_param.exclude(i));
+  }
+  if (v1_layer_param.has_type()) {
+    layer_param->set_type(UpgradeV1LayerType(v1_layer_param.type()));
+  }
+  for (int i = 0; i < v1_layer_param.blobs_size(); ++i) {
+    layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i));
+  }
+  for (int i = 0; i < v1_layer_param.param_size(); ++i) {
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
+    layer_param->mutable_param(i)->set_name(v1_layer_param.param(i));
+  }
+  ParamSpec_DimCheckMode mode;
+  for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) {
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
+    switch (v1_layer_param.blob_share_mode(i)) {
+    case V1LayerParameter_DimCheckMode_STRICT:
+      mode = ParamSpec_DimCheckMode_STRICT;
+      break;
+    case V1LayerParameter_DimCheckMode_PERMISSIVE:
+      mode = ParamSpec_DimCheckMode_PERMISSIVE;
+      break;
+    default:
+      LOG(FATAL) << "Unknown blob_share_mode: "
+          << v1_layer_param.blob_share_mode(i);
+      break;
+    }
+    layer_param->mutable_param(i)->set_share_mode(mode);
+  }
+  for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) {
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
+    layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i));
+  }
+  for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) {
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
+    layer_param->mutable_param(i)->set_decay_mult(
+        v1_layer_param.weight_decay(i));
+  }
+  for (int i = 0; i < v1_layer_param.loss_weight_size(); ++i) {
+    layer_param->add_loss_weight(v1_layer_param.loss_weight(i));
+  }
+  if (v1_layer_param.has_accuracy_param()) {
+    layer_param->mutable_accuracy_param()->CopyFrom(
+        v1_layer_param.accuracy_param());
+  }
+  if (v1_layer_param.has_argmax_param()) {
+    layer_param->mutable_argmax_param()->CopyFrom(
+        v1_layer_param.argmax_param());
+  }
+  if (v1_layer_param.has_concat_param()) {
+    layer_param->mutable_concat_param()->CopyFrom(
+        v1_layer_param.concat_param());
+  }
+  if (v1_layer_param.has_contrastive_loss_param()) {
+    layer_param->mutable_contrastive_loss_param()->CopyFrom(
+        v1_layer_param.contrastive_loss_param());
+  }
+  if (v1_layer_param.has_convolution_param()) {
+    layer_param->mutable_convolution_param()->CopyFrom(
+        v1_layer_param.convolution_param());
+  }
+  if (v1_layer_param.has_data_param()) {
+    layer_param->mutable_data_param()->CopyFrom(v1_layer_param.data_param());
+  }
+  if (v1_layer_param.has_dropout_param()) {
+    layer_param->mutable_dropout_param()->CopyFrom(
+        v1_layer_param.dropout_param());
+  }
+  if (v1_layer_param.has_dummy_data_param()) {
+    layer_param->mutable_dummy_data_param()->CopyFrom(
+        v1_layer_param.dummy_data_param());
+  }
+  if (v1_layer_param.has_eltwise_param()) {
+    layer_param->mutable_eltwise_param()->CopyFrom(
+        v1_layer_param.eltwise_param());
+  }
+  if (v1_layer_param.has_exp_param()) {
+    layer_param->mutable_exp_param()->CopyFrom(v1_layer_param.exp_param());
+  }
+  if (v1_layer_param.has_hdf5_data_param()) {
+    layer_param->mutable_hdf5_data_param()->CopyFrom(
+        v1_layer_param.hdf5_data_param());
+  }
+  if (v1_layer_param.has_hdf5_output_param()) {
+    layer_param->mutable_hdf5_output_param()->CopyFrom(
+        v1_layer_param.hdf5_output_param());
+  }
+  if (v1_layer_param.has_hinge_loss_param()) {
+    layer_param->mutable_hinge_loss_param()->CopyFrom(
+        v1_layer_param.hinge_loss_param());
+  }
+  if (v1_layer_param.has_image_data_param()) {
+    layer_param->mutable_image_data_param()->CopyFrom(
+        v1_layer_param.image_data_param());
+  }
+  if (v1_layer_param.has_infogain_loss_param()) {
+    layer_param->mutable_infogain_loss_param()->CopyFrom(
+        v1_layer_param.infogain_loss_param());
+  }
+  if (v1_layer_param.has_inner_product_param()) {
+    layer_param->mutable_inner_product_param()->CopyFrom(
+        v1_layer_param.inner_product_param());
+  }
+  if (v1_layer_param.has_lrn_param()) {
+    layer_param->mutable_lrn_param()->CopyFrom(v1_layer_param.lrn_param());
+  }
+  if (v1_layer_param.has_memory_data_param()) {
+    layer_param->mutable_memory_data_param()->CopyFrom(
+        v1_layer_param.memory_data_param());
+  }
+  if (v1_layer_param.has_mvn_param()) {
+    layer_param->mutable_mvn_param()->CopyFrom(v1_layer_param.mvn_param());
+  }
+  if (v1_layer_param.has_pooling_param()) {
+    layer_param->mutable_pooling_param()->CopyFrom(
+        v1_layer_param.pooling_param());
+  }
+  if (v1_layer_param.has_power_param()) {
+    layer_param->mutable_power_param()->CopyFrom(v1_layer_param.power_param());
+  }
+  if (v1_layer_param.has_relu_param()) {
+    layer_param->mutable_relu_param()->CopyFrom(v1_layer_param.relu_param());
+  }
+  if (v1_layer_param.has_sigmoid_param()) {
+    layer_param->mutable_sigmoid_param()->CopyFrom(
+        v1_layer_param.sigmoid_param());
+  }
+  if (v1_layer_param.has_softmax_param()) {
+    layer_param->mutable_softmax_param()->CopyFrom(
+        v1_layer_param.softmax_param());
+  }
+  if (v1_layer_param.has_slice_param()) {
+    layer_param->mutable_slice_param()->CopyFrom(v1_layer_param.slice_param());
+  }
+  if (v1_layer_param.has_tanh_param()) {
+    layer_param->mutable_tanh_param()->CopyFrom(v1_layer_param.tanh_param());
+  }
+  if (v1_layer_param.has_threshold_param()) {
+    layer_param->mutable_threshold_param()->CopyFrom(
+        v1_layer_param.threshold_param());
+  }
+  if (v1_layer_param.has_window_data_param()) {
+    layer_param->mutable_window_data_param()->CopyFrom(
+        v1_layer_param.window_data_param());
+  }
+  if (v1_layer_param.has_transform_param()) {
+    layer_param->mutable_transform_param()->CopyFrom(
+        v1_layer_param.transform_param());
+  }
+  if (v1_layer_param.has_loss_param()) {
+    layer_param->mutable_loss_param()->CopyFrom(v1_layer_param.loss_param());
+  }
+  if (v1_layer_param.has_layer()) {
+    LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring.";
+    is_fully_compatible = false;
+  }
+  return is_fully_compatible;
 }
 
 const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) {
-	switch (type) {
-		case V1LayerParameter_LayerType_NONE:
-			return "";
-		case V1LayerParameter_LayerType_ABSVAL:
-			return "AbsVal";
-		case V1LayerParameter_LayerType_ACCURACY:
-			return "Accuracy";
-		case V1LayerParameter_LayerType_ARGMAX:
-			return "ArgMax";
-		case V1LayerParameter_LayerType_BNLL:
-			return "BNLL";
-		case V1LayerParameter_LayerType_CONCAT:
-			return "Concat";
-		case V1LayerParameter_LayerType_CONTRASTIVE_LOSS:
-			return "ContrastiveLoss";
-		case V1LayerParameter_LayerType_CONVOLUTION:
-			return "Convolution";
-		case V1LayerParameter_LayerType_DECONVOLUTION:
-			return "Deconvolution";
-		case V1LayerParameter_LayerType_DATA:
-			return "Data";
-		case V1LayerParameter_LayerType_DROPOUT:
-			return "Dropout";
-		case V1LayerParameter_LayerType_DUMMY_DATA:
-			return "DummyData";
-		case V1LayerParameter_LayerType_EUCLIDEAN_LOSS:
-			return "EuclideanLoss";
-		case V1LayerParameter_LayerType_ELTWISE:
-			return "Eltwise";
-		case V1LayerParameter_LayerType_EXP:
-			return "Exp";
-		case V1LayerParameter_LayerType_FLATTEN:
-			return "Flatten";
-		case V1LayerParameter_LayerType_HDF5_DATA:
-			return "HDF5Data";
-		case V1LayerParameter_LayerType_HDF5_OUTPUT:
-			return "HDF5Output";
-		case V1LayerParameter_LayerType_HINGE_LOSS:
-			return "HingeLoss";
-		case V1LayerParameter_LayerType_IM2COL:
-			return "Im2col";
-		case V1LayerParameter_LayerType_IMAGE_DATA:
-			return "ImageData";
-		case V1LayerParameter_LayerType_INFOGAIN_LOSS:
-			return "InfogainLoss";
-		case V1LayerParameter_LayerType_INNER_PRODUCT:
-			return "InnerProduct";
-		case V1LayerParameter_LayerType_LRN:
-			return "LRN";
-		case V1LayerParameter_LayerType_MEMORY_DATA:
-			return "MemoryData";
-		case V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS:
-			return "MultinomialLogisticLoss";
-		case V1LayerParameter_LayerType_MVN:
-			return "MVN";
-		case V1LayerParameter_LayerType_POOLING:
-			return "Pooling";
-		case V1LayerParameter_LayerType_POWER:
-			return "Power";
-		case V1LayerParameter_LayerType_RELU:
-			return "ReLU";
-		case V1LayerParameter_LayerType_SIGMOID:
-			return "Sigmoid";
-		case V1LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS:
-			return "SigmoidCrossEntropyLoss";
-		case V1LayerParameter_LayerType_SILENCE:
-			return "Silence";
-		case V1LayerParameter_LayerType_SOFTMAX:
-			return "Softmax";
-		case V1LayerParameter_LayerType_SOFTMAX_LOSS:
-			return "SoftmaxWithLoss";
-		case V1LayerParameter_LayerType_SPLIT:
-			return "Split";
-		case V1LayerParameter_LayerType_SLICE:
-			return "Slice";
-		case V1LayerParameter_LayerType_TANH:
-			return "TanH";
-		case V1LayerParameter_LayerType_WINDOW_DATA:
-			return "WindowData";
-		case V1LayerParameter_LayerType_THRESHOLD:
-			return "Threshold";
-		default:
-			LOG(FATAL) << "Unknown V1LayerParameter layer type: " << type;
-			return "";
-	}
+  switch (type) {
+  case V1LayerParameter_LayerType_NONE:
+    return "";
+  case V1LayerParameter_LayerType_ABSVAL:
+    return "AbsVal";
+  case V1LayerParameter_LayerType_ACCURACY:
+    return "Accuracy";
+  case V1LayerParameter_LayerType_ARGMAX:
+    return "ArgMax";
+  case V1LayerParameter_LayerType_BNLL:
+    return "BNLL";
+  case V1LayerParameter_LayerType_CONCAT:
+    return "Concat";
+  case V1LayerParameter_LayerType_CONTRASTIVE_LOSS:
+    return "ContrastiveLoss";
+  case V1LayerParameter_LayerType_CONVOLUTION:
+    return "Convolution";
+  case V1LayerParameter_LayerType_DECONVOLUTION:
+    return "Deconvolution";
+  case V1LayerParameter_LayerType_DATA:
+    return "Data";
+  case V1LayerParameter_LayerType_DROPOUT:
+    return "Dropout";
+  case V1LayerParameter_LayerType_DUMMY_DATA:
+    return "DummyData";
+  case V1LayerParameter_LayerType_EUCLIDEAN_LOSS:
+    return "EuclideanLoss";
+  case V1LayerParameter_LayerType_ELTWISE:
+    return "Eltwise";
+  case V1LayerParameter_LayerType_EXP:
+    return "Exp";
+  case V1LayerParameter_LayerType_FLATTEN:
+    return "Flatten";
+  case V1LayerParameter_LayerType_HDF5_DATA:
+    return "HDF5Data";
+  case V1LayerParameter_LayerType_HDF5_OUTPUT:
+    return "HDF5Output";
+  case V1LayerParameter_LayerType_HINGE_LOSS:
+    return "HingeLoss";
+  case V1LayerParameter_LayerType_IM2COL:
+    return "Im2col";
+  case V1LayerParameter_LayerType_IMAGE_DATA:
+    return "ImageData";
+  case V1LayerParameter_LayerType_INFOGAIN_LOSS:
+    return "InfogainLoss";
+  case V1LayerParameter_LayerType_INNER_PRODUCT:
+    return "InnerProduct";
+  case V1LayerParameter_LayerType_LRN:
+    return "LRN";
+  case V1LayerParameter_LayerType_MEMORY_DATA:
+    return "MemoryData";
+  case V1LayerParameter_LayerType_MULTINOMIAL_LOGISTIC_LOSS:
+    return "MultinomialLogisticLoss";
+  case V1LayerParameter_LayerType_MVN:
+    return "MVN";
+  case V1LayerParameter_LayerType_POOLING:
+    return "Pooling";
+  case V1LayerParameter_LayerType_POWER:
+    return "Power";
+  case V1LayerParameter_LayerType_RELU:
+    return "ReLU";
+  case V1LayerParameter_LayerType_SIGMOID:
+    return "Sigmoid";
+  case V1LayerParameter_LayerType_SIGMOID_CROSS_ENTROPY_LOSS:
+    return "SigmoidCrossEntropyLoss";
+  case V1LayerParameter_LayerType_SILENCE:
+    return "Silence";
+  case V1LayerParameter_LayerType_SOFTMAX:
+    return "Softmax";
+  case V1LayerParameter_LayerType_SOFTMAX_LOSS:
+    return "SoftmaxWithLoss";
+  case V1LayerParameter_LayerType_SPLIT:
+    return "Split";
+  case V1LayerParameter_LayerType_SLICE:
+    return "Slice";
+  case V1LayerParameter_LayerType_TANH:
+    return "TanH";
+  case V1LayerParameter_LayerType_WINDOW_DATA:
+    return "WindowData";
+  case V1LayerParameter_LayerType_THRESHOLD:
+    return "Threshold";
+  default:
+    LOG(FATAL) << "Unknown V1LayerParameter layer type: " << type;
+    return "";
+  }
 }
 
 void ReadNetParamsFromTextFileOrDie(const string& param_file,
-		NetParameter* param) {
-	CHECK(ReadProtoFromTextFile(param_file, param))
-			<< "Failed to parse NetParameter file: " << param_file;
-	UpgradeNetAsNeeded(param_file, param);
+    NetParameter* param) {
+  CHECK(ReadProtoFromTextFile(param_file, param))
+      << "Failed to parse NetParameter file: " << param_file;
+  UpgradeNetAsNeeded(param_file, param);
 }
 
 void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-		NetParameter* param) {
-	CHECK(ReadProtoFromBinaryFile(param_file, param))
-			<< "Failed to parse NetParameter file: " << param_file;
-	UpgradeNetAsNeeded(param_file, param);
+    NetParameter* param) {
+  CHECK(ReadProtoFromBinaryFile(param_file, param))
+      << "Failed to parse NetParameter file: " << param_file;
+  UpgradeNetAsNeeded(param_file, param);
 }
 
 }  // namespace caffe

From ae39d5df509451a28b9e920bbd9cfc3b0aea54ad Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sat, 12 Sep 2015 01:48:39 +0800
Subject: [PATCH 099/124] Passed dropout unit test

---
 include/caffe/common.hpp           |  2 +-
 include/caffe/neuron_layers.hpp    |  6 --
 include/caffe/util/ocl_wrapper.hpp |  8 ++-
 src/caffe/common.cpp               |  1 +
 src/caffe/layers/dropout_layer.cpp | 95 ++++++++++--------------------
 src/caffe/ocl/dropout_layer.cl     | 22 +++----
 src/caffe/util/math_functions.cpp  |  1 +
 src/caffe/util/ocl_wrapper.cpp     | 41 +++++++------
 8 files changed, 73 insertions(+), 103 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 0f3a7667..8993af45 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -73,7 +73,7 @@ private:\
 #define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet"
 //OpenCL:  various of defines to choose the design schemes
 /* ifdef: use CPU random generator in dropout layer
- ifndef: use GPU randome generator*/
+ ifndef: use GPU random generator*/
 //#define use_cpu_generator_dropout
 //#define print_memory_trace
 //the following are macro defines for optimization schmes in conv layer
diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp
index 89b6c481..dfbaa199 100644
--- a/include/caffe/neuron_layers.hpp
+++ b/include/caffe/neuron_layers.hpp
@@ -190,12 +190,6 @@ class DropoutLayer: public NeuronLayer<Dtype> {
 		virtual inline const char* type() const {
 			return "Dropout";
 		}
-		virtual ~DropoutLayer();
-		void ocl_setup(int bottom_count);
-		cl_mem MaskMem;
-		cl_kernel ocl_Kernel_Fwd;
-		cl_kernel ocl_Kernel_Bwd;
-		cl_kernel rng_kernel;
 
 	protected:
 		/**
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 869bc83b..5fe5ab9e 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -205,16 +205,18 @@ void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y);
 
 template <typename Dtype>
 void DropoutForward(const int count, const Dtype* bottom_data,
-		const int* MaskMem, const Dtype scale_, Dtype *top_data);
+		const unsigned int* MaskMem, const unsigned int threshold, const float scale_, Dtype *top_data);
 
 template <typename Dtype>
-void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
-		const float threshold_, const Dtype scale_, Dtype* bottom_diff);
+void DropoutBackward(const int count, const Dtype* top_diff, const unsigned int* MaskMem,
+		const unsigned int threshold_, const float scale_, Dtype* bottom_diff);
 
 template <typename Dtype>
 void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
 		Dtype threshold);
 
+void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed = 0);
+
 template <typename Dtype>
 void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup);
 
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index 2157c96a..20799433 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -105,6 +105,7 @@ Caffe::~Caffe() {
 void Caffe::set_random_seed(const unsigned int seed) {
 	// RNG seed
 	Get().random_generator_.reset(new RNG(seed));
+        caffe_gpu_uniform(0, NULL, seed);
 }
 
 void Caffe::SetDevice(const int device_id) {
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index de8f5607..05de4944 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -11,15 +11,6 @@
 namespace caffe {
 
 template <typename Dtype>
-void DropoutLayer<Dtype>::ocl_setup(int bottom_count) {
-	MaskMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
-			bottom_count * sizeof(int), NULL, NULL);
-}
-
-template <typename Dtype>
-DropoutLayer<Dtype>::~DropoutLayer() {
-	OCL_CHECK (clReleaseMemObject(MaskMem) );
-	}template <typename Dtype>
 void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 		const vector<Blob<Dtype>*>& top) {
 	NeuronLayer < Dtype > ::LayerSetUp(bottom, top);
@@ -28,7 +19,6 @@ void DropoutLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 	DCHECK(threshold_ < 1.);
 	scale_ = 1. / (1. - threshold_);
 	uint_thres_ = static_cast<unsigned int>(UINT_MAX * threshold_);
-	ocl_setup(bottom[0]->count());
 }
 
 template <typename Dtype>
@@ -77,69 +67,44 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 	}
 }
 
-#define CHECK_GLOBAL_INT_MEM_DATA(global_mem, count, num, marker)\
-do{ \
-  int *global_mem_cpu = new int[count]; \
-  clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem)global_mem, \
-              CL_TRUE, 0, sizeof(int)*count, global_mem_cpu,0, NULL, NULL); \
-  size_t sample_interval = count/num; \
-  if(sample_interval == 0){ \
-     sample_interval=1; \
-  } \
-  printf("%s: ", marker); \
-  for(int i=0; i<count; i+=sample_interval){ \
-      printf("%d  ", global_mem_cpu[i]); \
-  } \
-  printf("\n\n"); \
-  delete []global_mem_cpu; \
-}while(0)
-
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-		const vector<Blob<Dtype>*>& top) {
-	const Dtype* bottom_data = bottom[0]->gpu_data();
-	Dtype* top_data = top[0]->mutable_gpu_data();
-	const int count = bottom[0]->count();
-	if (this->phase_ == TRAIN) {
-		//unsigned int* mask =
-		//  static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
-#ifdef use_cpu_generator_dropout 
-		unsigned int* mask_cpu =
-		static_cast<unsigned int*>(rand_vec_.mutable_cpu_data());
-		caffe_rng_bernoulli(count, 1. - threshold_, mask_cpu);
-		OCL_CHECK( clEnqueueWriteBuffer(amdDevice.CommandQueue, MaskMem, CL_TRUE, 0, count * sizeof(int), (void*)mask_cpu, 0, NULL, NULL) );
-		DropoutForward(count, bottom_data, (int*)MaskMem, (Dtype)scale_, top_data);
-#else
-		caffe_gpu_bernoulli((int*) MaskMem, count, (Dtype) 0., (Dtype) 1.,
-				threshold_);
-		DropoutForward(count, bottom_data, (int*) MaskMem, (Dtype) scale_,
-				top_data);
-#endif
-	} else {
-		caffe_gpu_copy(count, bottom_data, top_data);
-	}
-CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask");
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  if (this->phase_ == TRAIN) {
+    unsigned int* mask =
+        static_cast<unsigned int*>(rand_vec_.mutable_gpu_data());
+    caffe_gpu_rng_uniform(count, mask);
+    // set thresholds
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    DropoutForward(count, bottom_data, mask, uint_thres_, scale_, top_data);
+  } else {
+    caffe_gpu_copy(count, bottom_data, top_data);
+  }
 }
 
 template <typename Dtype>
 void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-		const vector<bool>& propagate_down,
-		const vector<Blob<Dtype>*>& bottom) {
-	if (propagate_down[0]) {
-		const Dtype* top_diff = top[0]->gpu_diff();
-		Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-		if (this->phase_ == TRAIN) {
-			const int count = bottom[0]->count();
-			DropoutBackward(count, top_diff, (int*) MaskMem, uint_thres_,
-					(Dtype) scale_, bottom_diff);
-		} else {
-			caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
-		}
-               CHECK_GLOBAL_INT_MEM_DATA((int*)MaskMem, bottom[0]->count(), 20, "Mask");
-               CHECK_GLOBAL_MEM_DATA(bottom_diff, bottom[0]->count(), 20, "bottom_diff");
-	}
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    if (this->phase_ == TRAIN) {
+      const unsigned int* mask =
+          static_cast<const unsigned int*>(rand_vec_.gpu_data());
+      const int count = bottom[0]->count();
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      DropoutBackward(count, top_diff, mask, uint_thres_, scale_, bottom_diff);
+    } else {
+      caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
+    }
+  }
 }
 
+
 #ifdef CPU_ONLY
 STUB_GPU(DropoutLayer);
 #endif
diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl
index bb2fc696..98d44f86 100644
--- a/src/caffe/ocl/dropout_layer.cl
+++ b/src/caffe/ocl/dropout_layer.cl
@@ -25,19 +25,21 @@
  **************************************************************************************/
 
 template <class T>
-__kernel void DropoutForward(const int n, __global T *in, __global const int* mask, const T scale, __global T *out) {
+__kernel void DropoutForward(const int n, __global T *in, __global const unsigned int* mask, const unsigned int threshold,  const float scale, __global T *out) {
 	int index = get_global_id(0);
-	if (index < n)
-	out[index] = in[index] * scale * mask[index];
+	if (index < n) {
+	    out[index] = in[index] * scale * (mask[index] > threshold);
+        }
 }
-template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const int* mask, const float scale, __global float* out);
-template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const int* mask, const double scale, __global double* out);
+template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global float* out);
+template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global double* out);
 
 template <class T>
-__kernel void DropoutBackward(const int n, __global T *in_diff, __global const int *mask, const int unsigned threshold, const T scale, __global T *out_diff) {
+__kernel void DropoutBackward(const int n, __global T *in_diff, __global const unsigned int *mask, const unsigned int threshold, const float scale, __global T *out_diff) {
 	int index = get_global_id(0);
-	if (index < n)
-	out_diff[index] = in_diff[index] * scale * mask[index];
+	if (index < n) {
+	    out_diff[index] = in_diff[index] * scale * (mask[index] > threshold);
+        }
 }
-template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const int* mask, const unsigned int threshold, const float scale, __global float* out_diff);
-template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const int* mask, const unsigned int threshold, const double scale, __global double* out_diff);
+template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global float* out_diff);
+template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global double* out_diff);
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 3275d75c..6b76a9ef 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -886,6 +886,7 @@ uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
 }
 
 void caffe_gpu_rng_uniform(const int n, unsigned int* r) {
+        caffe_gpu_uniform(n, r);
 }
 
 template <>
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 75b69215..29a12330 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -145,12 +145,16 @@ void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup)
 template void caffe_gpu_uniform<float>(float* a, const unsigned int n, float inf, float sup);
 template void caffe_gpu_uniform<double>(double* a, const unsigned int n, double inf, double sup);
 
-void caffe_gpu_uniform(const unsigned int n, unsigned int *r)
+void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed)
 {
+        static unsigned c = 0;
+        if ((n == 0) || (r == NULL)) {
+            c = _seed;
+            return;
+        }
         std::string kernel_name = "PRNG_threefry4x32_uint_uniform";
         cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
 
-        static unsigned c = 0;
         unsigned nrounds = 20;
         array4x32  rndctr4;
         rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
@@ -1692,33 +1696,33 @@ template void caffe_gpu_powx<double>(const int n, const double* a,
 
 template <typename Dtype>
 void DropoutForward(const int count, const Dtype* bottom_data,
-		const int* MaskMem, const Dtype scale_, Dtype* top_data) {
+		const  unsigned int* MaskMem, const unsigned int threshold, const float scale_, Dtype* top_data) {
 	std::string kernel_name = "DropoutForward" + get_dtype_suffix<Dtype>();
 	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
 	cl_int ret;
-	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+	ret = clSetKernelArg(kernel,  0, sizeof(cl_int), (void*) &count);
 	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data);
 	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
-	ret |= clSetKernelArg(kernel, 3, sizeof(Dtype), (void*) &scale_);
-	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &threshold);
+        ret |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*) &scale_);
+	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &top_data);
 	OCL_CHECK(ret);
 
 	size_t Global_Work_Size[] = { (size_t) count };
 	size_t Local_Work_Size[] = { 256 };
-	OCL_CHECK(
-			clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+	OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
 					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 
 template void DropoutForward<float>(const int count, const float* bottom_data,
-		const int* MaskMem, const float scale_, float* top_data);
+		const unsigned int* MaskMem, const unsigned int threshold, const float scale_, float* top_data);
 template void DropoutForward<double>(const int count, const double* bottom_data,
-		const int* MaskMem, const double scale_, double* top_data);
+		const unsigned int* MaskMem, const unsigned int threshold, const float scale_, double* top_data);
 
 template <typename Dtype>
-void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
-		const float threshold_, const Dtype scale_, Dtype* bottom_diff) {
+void DropoutBackward(const int count, const Dtype* top_diff, const unsigned int* MaskMem,
+		const unsigned int threshold_, const float scale_, Dtype* bottom_diff) {
 	std::string kernel_name = "DropoutBackward" + get_dtype_suffix<Dtype>();
 	cl_kernel kernel = amdDevice.GetKernel(kernel_name);
 
@@ -1726,8 +1730,8 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
 	ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
 	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
 	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem);
-	ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &threshold_);
-	ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &scale_);
+	ret |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &threshold_);
+	ret |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*) &scale_);
 	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff);
 	OCL_CHECK(ret);
 
@@ -1738,10 +1742,10 @@ void DropoutBackward(const int count, const Dtype* top_diff, const int* MaskMem,
 					Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
 }
 template void DropoutBackward<float>(const int count, const float* top_diff,
-		const int* MaskMem, const float threshold_, const float scale_,
+		const unsigned int* MaskMem, const unsigned int threshold_, const float scale_,
 		float* bottom_diff);
 template void DropoutBackward<double>(const int count, const double* top_diff,
-		const int* MaskMem, const float threshold_, const double scale_,
+		const unsigned int* MaskMem, const unsigned int  threshold_, const float scale_,
 		double* bottom_diff);
 
 template <typename Dtype>
@@ -1927,7 +1931,8 @@ template void ocl_conv<float>(float* bottom_data, float* top_data,
 		int stride, int pad, int batch_sz);
 template void ocl_conv<double>(double* bottom_data, double* top_data,
 		double* weights, double* bias, int channel_in, int width, int height,
-		int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
-		int stride, int pad, int batch_sz);
+                int channel_out, int width_out, int height_out, int kernel_w, int kernel_h, 
+                int stride, int pad, int batch_sz);
 
 }  // namespace caffe
+

From cb7cd7bbde88907e0a97d5499fc9c4ba07cd0767 Mon Sep 17 00:00:00 2001
From: Junli <unli.Gu@amd.com>
Date: Sat, 12 Sep 2015 21:55:30 -0700
Subject: [PATCH 100/124] removed cmakefiles from git repo

---
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/caffe.dir/DependInfo.cmake     |  108 -
 src/caffe/CMakeFiles/caffe.dir/build.make     | 2542 -----------------
 .../CMakeFiles/caffe.dir/cmake_clean.cmake    |  126 -
 src/caffe/CMakeFiles/caffe.dir/depend.make    |    2 -
 src/caffe/CMakeFiles/caffe.dir/flags.make     |    8 -
 src/caffe/CMakeFiles/caffe.dir/link.txt       |    1 -
 src/caffe/CMakeFiles/caffe.dir/progress.make  |  118 -
 ..._compile_generated_absval_layer.cu.o.cmake |  296 --
 ...compile_generated_absval_layer.cu.o.depend |    1 -
 ...mpile_generated_base_data_layer.cu.o.cmake |  296 --
 ...pile_generated_base_data_layer.cu.o.depend |    1 -
 ...da_compile_generated_bnll_layer.cu.o.cmake |  296 --
 ...a_compile_generated_bnll_layer.cu.o.depend |    1 -
 ..._compile_generated_concat_layer.cu.o.cmake |  296 --
 ...compile_generated_concat_layer.cu.o.depend |    1 -
 ...enerated_contrastive_loss_layer.cu.o.cmake |  296 --
 ...nerated_contrastive_loss_layer.cu.o.depend |    1 -
 ...da_compile_generated_conv_layer.cu.o.cmake |  296 --
 ...a_compile_generated_conv_layer.cu.o.depend |    1 -
 ...pile_generated_cudnn_conv_layer.cu.o.cmake |  296 --
 ...ile_generated_cudnn_conv_layer.cu.o.depend |    1 -
 ...e_generated_cudnn_pooling_layer.cu.o.cmake |  296 --
 ..._generated_cudnn_pooling_layer.cu.o.depend |    1 -
 ...pile_generated_cudnn_relu_layer.cu.o.cmake |  296 --
 ...ile_generated_cudnn_relu_layer.cu.o.depend |    1 -
 ...e_generated_cudnn_sigmoid_layer.cu.o.cmake |  296 --
 ..._generated_cudnn_sigmoid_layer.cu.o.depend |    1 -
 ...e_generated_cudnn_softmax_layer.cu.o.cmake |  296 --
 ..._generated_cudnn_softmax_layer.cu.o.depend |    1 -
 ...pile_generated_cudnn_tanh_layer.cu.o.cmake |  296 --
 ...ile_generated_cudnn_tanh_layer.cu.o.depend |    1 -
 ..._compile_generated_deconv_layer.cu.o.cmake |  296 --
 ...compile_generated_deconv_layer.cu.o.depend |    1 -
 ...compile_generated_dropout_layer.cu.o.cmake |  296 --
 ...ompile_generated_dropout_layer.cu.o.depend |    1 -
 ...compile_generated_eltwise_layer.cu.o.cmake |  296 --
 ...ompile_generated_eltwise_layer.cu.o.depend |    1 -
 ..._generated_euclidean_loss_layer.cu.o.cmake |  296 --
 ...generated_euclidean_loss_layer.cu.o.depend |    1 -
 ...uda_compile_generated_exp_layer.cu.o.cmake |  296 --
 ...da_compile_generated_exp_layer.cu.o.depend |    1 -
 ..._compile_generated_filter_layer.cu.o.cmake |  296 --
 ...compile_generated_filter_layer.cu.o.depend |    1 -
 ...mpile_generated_hdf5_data_layer.cu.o.cmake |  296 --
 ...pile_generated_hdf5_data_layer.cu.o.depend |    1 -
 ...ile_generated_hdf5_output_layer.cu.o.cmake |  296 --
 ...le_generated_hdf5_output_layer.cu.o.depend |    1 -
 ..._compile_generated_im2col_layer.cu.o.cmake |  296 --
 ...compile_generated_im2col_layer.cu.o.depend |    1 -
 ...e_generated_inner_product_layer.cu.o.cmake |  296 --
 ..._generated_inner_product_layer.cu.o.depend |    1 -
 ...uda_compile_generated_log_layer.cu.o.cmake |  296 --
 ...da_compile_generated_log_layer.cu.o.depend |    1 -
 ...uda_compile_generated_lrn_layer.cu.o.cmake |  296 --
 ...da_compile_generated_lrn_layer.cu.o.depend |    1 -
 ...uda_compile_generated_mvn_layer.cu.o.cmake |  296 --
 ...da_compile_generated_mvn_layer.cu.o.depend |    1 -
 ...compile_generated_pooling_layer.cu.o.cmake |  296 --
 ...ompile_generated_pooling_layer.cu.o.depend |    1 -
 ...a_compile_generated_power_layer.cu.o.cmake |  296 --
 ..._compile_generated_power_layer.cu.o.depend |    1 -
 ...a_compile_generated_prelu_layer.cu.o.cmake |  296 --
 ..._compile_generated_prelu_layer.cu.o.depend |    1 -
 ...mpile_generated_reduction_layer.cu.o.cmake |  296 --
 ...pile_generated_reduction_layer.cu.o.depend |    1 -
 ...da_compile_generated_relu_layer.cu.o.cmake |  296 --
 ...a_compile_generated_relu_layer.cu.o.depend |    1 -
 ...igmoid_cross_entropy_loss_layer.cu.o.cmake |  296 --
 ...gmoid_cross_entropy_loss_layer.cu.o.depend |  470 ---
 ...compile_generated_sigmoid_layer.cu.o.cmake |  296 --
 ...ompile_generated_sigmoid_layer.cu.o.depend |  468 ---
 ...compile_generated_silence_layer.cu.o.cmake |  296 --
 ...ompile_generated_silence_layer.cu.o.depend |    1 -
 ...a_compile_generated_slice_layer.cu.o.cmake |  296 --
 ..._compile_generated_slice_layer.cu.o.depend |    1 -
 ...compile_generated_softmax_layer.cu.o.cmake |  296 --
 ...ompile_generated_softmax_layer.cu.o.depend |    1 -
 ...le_generated_softmax_loss_layer.cu.o.cmake |  296 --
 ...e_generated_softmax_loss_layer.cu.o.depend |    1 -
 ...a_compile_generated_split_layer.cu.o.cmake |  296 --
 ..._compile_generated_split_layer.cu.o.depend |    1 -
 ...da_compile_generated_tanh_layer.cu.o.cmake |  296 --
 ...a_compile_generated_tanh_layer.cu.o.depend |    1 -
 ...mpile_generated_threshold_layer.cu.o.cmake |  296 --
 ...pile_generated_threshold_layer.cu.o.depend |    1 -
 .../cuda_compile_generated_im2col.cu.o.cmake  |  296 --
 .../cuda_compile_generated_im2col.cu.o.depend |  404 ---
 ...ompile_generated_math_functions.cu.o.cmake |  296 --
 ...mpile_generated_math_functions.cu.o.depend |  744 -----
 src/caffe/CMakeFiles/progress.marks           |    1 -
 .../CMakeFiles/proto.dir/CXX.includecache     |   48 -
 .../CMakeFiles/proto.dir/DependInfo.cmake     |   39 -
 src/caffe/CMakeFiles/proto.dir/build.make     |  119 -
 .../CMakeFiles/proto.dir/cmake_clean.cmake    |   13 -
 .../proto.dir/cmake_clean_target.cmake        |    3 -
 .../CMakeFiles/proto.dir/depend.internal      |    6 -
 src/caffe/CMakeFiles/proto.dir/depend.make    |    6 -
 src/caffe/CMakeFiles/proto.dir/flags.make     |    8 -
 src/caffe/CMakeFiles/proto.dir/link.txt       |    2 -
 src/caffe/CMakeFiles/proto.dir/progress.make  |    3 -
 src/caffe/CMakeLists.txt                      |   36 -
 102 files changed, 17464 deletions(-)
 delete mode 100644 src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake
 delete mode 100644 src/caffe/CMakeFiles/caffe.dir/build.make
 delete mode 100644 src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake
 delete mode 100644 src/caffe/CMakeFiles/caffe.dir/depend.make
 delete mode 100644 src/caffe/CMakeFiles/caffe.dir/flags.make
 delete mode 100644 src/caffe/CMakeFiles/caffe.dir/link.txt
 delete mode 100644 src/caffe/CMakeFiles/caffe.dir/progress.make
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
 delete mode 100644 src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend
 delete mode 100644 src/caffe/CMakeFiles/progress.marks
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/CXX.includecache
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/DependInfo.cmake
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/build.make
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/depend.internal
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/depend.make
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/flags.make
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/link.txt
 delete mode 100644 src/caffe/CMakeFiles/proto.dir/progress.make
 delete mode 100644 src/caffe/CMakeLists.txt

diff --git a/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake b/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 7bb0014c..00000000
--- a/src/caffe/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# Relative path conversion top directories.
-SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
-SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
-
-# Force unix paths in dependencies.
-SET(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake b/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake
deleted file mode 100644
index 1678bc46..00000000
--- a/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake
+++ /dev/null
@@ -1,108 +0,0 @@
-# The set of languages for which implicit dependencies are needed:
-SET(CMAKE_DEPENDS_LANGUAGES
-  "CXX"
-  )
-# The set of files for implicit dependencies of each language:
-SET(CMAKE_DEPENDS_CHECK_CXX
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/blob.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/common.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/device.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/net.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/solver.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o"
-  )
-SET(CMAKE_CXX_COMPILER_ID "GNU")
-
-# Preprocessor definitions for this target.
-SET(CMAKE_TARGET_DEFINITIONS
-  "GTEST_USE_OWN_TR1_TUPLE"
-  )
-
-# Targets to which this target links.
-SET(CMAKE_TARGET_LINKED_INFO_FILES
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake"
-  )
-
-# The include file search paths:
-SET(CMAKE_C_TARGET_INCLUDE_PATH
-  "src"
-  "/usr/local/include"
-  "include"
-  "/usr/local/cuda/include"
-  "/usr/local/include/opencv"
-  "/usr/include/atlas"
-  "."
-  )
-SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/caffe/CMakeFiles/caffe.dir/build.make b/src/caffe/CMakeFiles/caffe.dir/build.make
deleted file mode 100644
index 916913ae..00000000
--- a/src/caffe/CMakeFiles/caffe.dir/build.make
+++ /dev/null
@@ -1,2542 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Remove some rules from gmake that .SUFFIXES does not remove.
-SUFFIXES =
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E remove -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The program to use to edit the cache.
-CMAKE_EDIT_COMMAND = /usr/bin/ccmake
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# Include any dependencies generated for this target.
-include src/caffe/CMakeFiles/caffe.dir/depend.make
-
-# Include the progress variables for this target.
-include src/caffe/CMakeFiles/caffe.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include src/caffe/CMakeFiles/caffe.dir/flags.make
-
-src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o: src/caffe/util/math_functions.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o: src/caffe/util/im2col.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o: src/caffe/layers/cufiles/sigmoid_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_3)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o: src/caffe/layers/cufiles/bnll_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_4)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o: src/caffe/layers/cufiles/conv_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_5)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o: src/caffe/layers/cufiles/pooling_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_6)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o: src/caffe/layers/cufiles/log_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_7)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o: src/caffe/layers/cufiles/reduction_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_8)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o: src/caffe/layers/cufiles/silence_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_9)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o: src/caffe/layers/cufiles/power_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_10)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o: src/caffe/layers/cufiles/split_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_11)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o: src/caffe/layers/cufiles/absval_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_12)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o: src/caffe/layers/cufiles/hdf5_output_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_13)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o: src/caffe/layers/cufiles/base_data_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_14)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o: src/caffe/layers/cufiles/dropout_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_15)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o: src/caffe/layers/cufiles/cudnn_tanh_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_16)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o: src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_17)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o: src/caffe/layers/cufiles/relu_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_18)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o: src/caffe/layers/cufiles/cudnn_conv_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_19)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o: src/caffe/layers/cufiles/contrastive_loss_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_20)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o: src/caffe/layers/cufiles/concat_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_21)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o: src/caffe/layers/cufiles/softmax_loss_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_22)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o: src/caffe/layers/cufiles/cudnn_softmax_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_23)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o: src/caffe/layers/cufiles/inner_product_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_24)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o: src/caffe/layers/cufiles/filter_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_25)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o: src/caffe/layers/cufiles/prelu_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_26)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o: src/caffe/layers/cufiles/im2col_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_27)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o: src/caffe/layers/cufiles/hdf5_data_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_28)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o: src/caffe/layers/cufiles/deconv_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_29)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o: src/caffe/layers/cufiles/mvn_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_30)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o: src/caffe/layers/cufiles/tanh_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_31)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o: src/caffe/layers/cufiles/slice_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_32)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o: src/caffe/layers/cufiles/threshold_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_33)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o: src/caffe/layers/cufiles/lrn_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_34)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o: src/caffe/layers/cufiles/eltwise_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_35)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o: src/caffe/layers/cufiles/exp_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_36)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o: src/caffe/layers/cufiles/euclidean_loss_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_37)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o: src/caffe/layers/cufiles/cudnn_relu_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_38)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o: src/caffe/layers/cufiles/cudnn_pooling_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_39)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o: src/caffe/layers/cufiles/softmax_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_40)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
-src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o: src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_41)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
-
-src/caffe/CMakeFiles/caffe.dir/common.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/common.cpp.o: src/caffe/common.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_42)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/common.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/common.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp
-
-src/caffe/CMakeFiles/caffe.dir/common.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/common.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp > CMakeFiles/caffe.dir/common.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/common.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/common.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/common.cpp -o CMakeFiles/caffe.dir/common.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/common.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/common.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/blob.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/blob.cpp.o: src/caffe/blob.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_43)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/blob.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/blob.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp
-
-src/caffe/CMakeFiles/caffe.dir/blob.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/blob.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp > CMakeFiles/caffe.dir/blob.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/blob.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/blob.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/blob.cpp -o CMakeFiles/caffe.dir/blob.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o: src/caffe/util/ocl_wrapper.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_44)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp > CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_wrapper.cpp -o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o: src/caffe/util/im2col.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_45)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/im2col.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/im2col.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp > CMakeFiles/caffe.dir/util/im2col.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/im2col.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cpp -o CMakeFiles/caffe.dir/util/im2col.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o: src/caffe/util/upgrade_proto.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_46)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp > CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/upgrade_proto.cpp -o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o: src/caffe/util/db_leveldb.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_47)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db_leveldb.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db_leveldb.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp > CMakeFiles/caffe.dir/util/db_leveldb.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db_leveldb.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_leveldb.cpp -o CMakeFiles/caffe.dir/util/db_leveldb.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o: src/caffe/util/ocl_util.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_48)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/ocl_util.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/ocl_util.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp > CMakeFiles/caffe.dir/util/ocl_util.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/ocl_util.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/ocl_util.cpp -o CMakeFiles/caffe.dir/util/ocl_util.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o: src/caffe/util/insert_splits.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_49)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/insert_splits.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/insert_splits.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp > CMakeFiles/caffe.dir/util/insert_splits.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/insert_splits.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/insert_splits.cpp -o CMakeFiles/caffe.dir/util/insert_splits.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o: src/caffe/util/db_lmdb.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_50)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db_lmdb.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db_lmdb.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp > CMakeFiles/caffe.dir/util/db_lmdb.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db_lmdb.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db_lmdb.cpp -o CMakeFiles/caffe.dir/util/db_lmdb.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o: src/caffe/util/math_functions.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_51)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/math_functions.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/math_functions.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp > CMakeFiles/caffe.dir/util/math_functions.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/math_functions.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cpp -o CMakeFiles/caffe.dir/util/math_functions.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o: src/caffe/util/io.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_52)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/io.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/io.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/io.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp > CMakeFiles/caffe.dir/util/io.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/io.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/io.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/io.cpp -o CMakeFiles/caffe.dir/util/io.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o: src/caffe/util/cudnn.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_53)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/cudnn.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/cudnn.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp > CMakeFiles/caffe.dir/util/cudnn.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/cudnn.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/cudnn.cpp -o CMakeFiles/caffe.dir/util/cudnn.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o: src/caffe/util/db.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_54)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/db.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/db.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/db.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp > CMakeFiles/caffe.dir/util/db.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/db.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/db.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/db.cpp -o CMakeFiles/caffe.dir/util/db.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o: src/caffe/util/benchmark.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_55)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/util/benchmark.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp
-
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/util/benchmark.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp > CMakeFiles/caffe.dir/util/benchmark.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/util/benchmark.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/benchmark.cpp -o CMakeFiles/caffe.dir/util/benchmark.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/device.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/device.cpp.o: src/caffe/device.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_56)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/device.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/device.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp
-
-src/caffe/CMakeFiles/caffe.dir/device.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/device.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp > CMakeFiles/caffe.dir/device.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/device.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/device.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/device.cpp -o CMakeFiles/caffe.dir/device.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/device.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/device.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o: src/caffe/internal_thread.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_57)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/internal_thread.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp
-
-src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/internal_thread.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp > CMakeFiles/caffe.dir/internal_thread.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/internal_thread.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/internal_thread.cpp -o CMakeFiles/caffe.dir/internal_thread.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o: src/caffe/data_transformer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_58)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/data_transformer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/data_transformer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp > CMakeFiles/caffe.dir/data_transformer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/data_transformer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/data_transformer.cpp -o CMakeFiles/caffe.dir/data_transformer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/net.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/net.cpp.o: src/caffe/net.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_59)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/net.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/net.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp
-
-src/caffe/CMakeFiles/caffe.dir/net.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/net.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp > CMakeFiles/caffe.dir/net.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/net.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/net.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/net.cpp -o CMakeFiles/caffe.dir/net.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/net.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/net.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/solver.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/solver.cpp.o: src/caffe/solver.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_60)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/solver.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp
-
-src/caffe/CMakeFiles/caffe.dir/solver.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/solver.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp > CMakeFiles/caffe.dir/solver.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/solver.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/solver.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/solver.cpp -o CMakeFiles/caffe.dir/solver.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o: src/caffe/layer_factory.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_61)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layer_factory.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layer_factory.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp > CMakeFiles/caffe.dir/layer_factory.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layer_factory.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layer_factory.cpp -o CMakeFiles/caffe.dir/layer_factory.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o: src/caffe/syncedmem.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_62)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/syncedmem.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp
-
-src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/syncedmem.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp > CMakeFiles/caffe.dir/syncedmem.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/syncedmem.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/syncedmem.cpp -o CMakeFiles/caffe.dir/syncedmem.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o: src/caffe/layers/deconv_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_63)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp > CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/deconv_layer.cpp -o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o: src/caffe/layers/infogain_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_64)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp > CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/infogain_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o: src/caffe/layers/log_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_65)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/log_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/log_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp > CMakeFiles/caffe.dir/layers/log_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/log_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/log_layer.cpp -o CMakeFiles/caffe.dir/layers/log_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o: src/caffe/layers/base_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_66)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp > CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_data_layer.cpp -o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o: src/caffe/layers/euclidean_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_67)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp > CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/euclidean_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o: src/caffe/layers/image_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_68)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp > CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/image_data_layer.cpp -o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o: src/caffe/layers/sigmoid_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_69)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp > CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_layer.cpp -o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o: src/caffe/layers/cudnn_softmax_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_70)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_softmax_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o: src/caffe/layers/cudnn_tanh_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_71)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_tanh_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o: src/caffe/layers/spp_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_72)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/spp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/spp_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp > CMakeFiles/caffe.dir/layers/spp_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/spp_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/spp_layer.cpp -o CMakeFiles/caffe.dir/layers/spp_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o: src/caffe/layers/hdf5_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_73)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp > CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_data_layer.cpp -o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o: src/caffe/layers/exp_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_74)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/exp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/exp_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp > CMakeFiles/caffe.dir/layers/exp_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/exp_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/exp_layer.cpp -o CMakeFiles/caffe.dir/layers/exp_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o: src/caffe/layers/power_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_75)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/power_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/power_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp > CMakeFiles/caffe.dir/layers/power_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/power_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/power_layer.cpp -o CMakeFiles/caffe.dir/layers/power_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o: src/caffe/layers/relu_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_76)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/relu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/relu_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp > CMakeFiles/caffe.dir/layers/relu_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/relu_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/relu_layer.cpp -o CMakeFiles/caffe.dir/layers/relu_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o: src/caffe/layers/split_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_77)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/split_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/split_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp > CMakeFiles/caffe.dir/layers/split_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/split_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/split_layer.cpp -o CMakeFiles/caffe.dir/layers/split_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o: src/caffe/layers/window_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_78)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp > CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/window_data_layer.cpp -o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o: src/caffe/layers/dropout_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_79)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp > CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dropout_layer.cpp -o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o: src/caffe/layers/cudnn_sigmoid_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_80)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_sigmoid_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o: src/caffe/layers/silence_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_81)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/silence_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/silence_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp > CMakeFiles/caffe.dir/layers/silence_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/silence_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/silence_layer.cpp -o CMakeFiles/caffe.dir/layers/silence_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o: src/caffe/layers/cudnn_pooling_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_82)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_pooling_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o: src/caffe/layers/lrn_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_83)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp > CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/lrn_layer.cpp -o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o: src/caffe/layers/memory_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_84)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp > CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/memory_data_layer.cpp -o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o: src/caffe/layers/mvn_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_85)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp > CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/mvn_layer.cpp -o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o: src/caffe/layers/cudnn_relu_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_86)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_relu_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o: src/caffe/layers/slice_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_87)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/slice_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/slice_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp > CMakeFiles/caffe.dir/layers/slice_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/slice_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/slice_layer.cpp -o CMakeFiles/caffe.dir/layers/slice_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o: src/caffe/layers/pooling_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_88)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp > CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/pooling_layer.cpp -o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o: src/caffe/layers/hdf5_output_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_89)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp > CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hdf5_output_layer.cpp -o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o: src/caffe/layers/inner_product_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_90)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp > CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/inner_product_layer.cpp -o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o: src/caffe/layers/threshold_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_91)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp > CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/threshold_layer.cpp -o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o: src/caffe/layers/reduction_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_92)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp > CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reduction_layer.cpp -o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o: src/caffe/layers/tanh_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_93)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp > CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/tanh_layer.cpp -o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o: src/caffe/layers/prelu_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_94)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp > CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/prelu_layer.cpp -o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o: src/caffe/layers/accuracy_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_95)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp > CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/accuracy_layer.cpp -o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o: src/caffe/layers/neuron_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_96)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp > CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/neuron_layer.cpp -o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o: src/caffe/layers/absval_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_97)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/absval_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/absval_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp > CMakeFiles/caffe.dir/layers/absval_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/absval_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/absval_layer.cpp -o CMakeFiles/caffe.dir/layers/absval_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o: src/caffe/layers/loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_98)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp > CMakeFiles/caffe.dir/layers/loss_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/loss_layer.cpp -o CMakeFiles/caffe.dir/layers/loss_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o: src/caffe/layers/softmax_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_99)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp > CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_layer.cpp -o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o: src/caffe/layers/cudnn_conv_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_100)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp > CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cudnn_conv_layer.cpp -o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_101)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp > CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o: src/caffe/layers/concat_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_102)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/concat_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/concat_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp > CMakeFiles/caffe.dir/layers/concat_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/concat_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/concat_layer.cpp -o CMakeFiles/caffe.dir/layers/concat_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o: src/caffe/layers/hinge_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_103)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp > CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/hinge_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o: src/caffe/layers/bnll_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_104)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp > CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/bnll_layer.cpp -o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o: src/caffe/layers/flatten_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_105)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp > CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/flatten_layer.cpp -o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o: src/caffe/layers/argmax_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_106)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp > CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/argmax_layer.cpp -o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o: src/caffe/layers/filter_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_107)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/filter_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/filter_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp > CMakeFiles/caffe.dir/layers/filter_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/filter_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/filter_layer.cpp -o CMakeFiles/caffe.dir/layers/filter_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o: src/caffe/layers/dummy_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_108)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp > CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/dummy_data_layer.cpp -o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o: src/caffe/layers/conv_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_109)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/conv_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp > CMakeFiles/caffe.dir/layers/conv_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/conv_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/conv_layer.cpp -o CMakeFiles/caffe.dir/layers/conv_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o: src/caffe/layers/base_conv_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_110)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp > CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/base_conv_layer.cpp -o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o: src/caffe/layers/data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_111)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp > CMakeFiles/caffe.dir/layers/data_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/data_layer.cpp -o CMakeFiles/caffe.dir/layers/data_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o: src/caffe/layers/softmax_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_112)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp > CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/softmax_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o: src/caffe/layers/eltwise_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_113)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp > CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/eltwise_layer.cpp -o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o: src/caffe/layers/im2col_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_114)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp > CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/im2col_layer.cpp -o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o: src/caffe/layers/multinomial_logistic_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_115)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp > CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/multinomial_logistic_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o: src/caffe/layers/contrastive_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_116)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp > CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/contrastive_loss_layer.cpp -o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o
-
-src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o: src/caffe/CMakeFiles/caffe.dir/flags.make
-src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o: src/caffe/layers/reshape_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_117)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp
-
-src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp > CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i
-
-src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/reshape_layer.cpp -o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s
-
-src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires:
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires
-
-src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides.build
-.PHONY : src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides
-
-src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.provides.build: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o
-
-# Object files for target caffe
-caffe_OBJECTS = \
-"CMakeFiles/caffe.dir/common.cpp.o" \
-"CMakeFiles/caffe.dir/blob.cpp.o" \
-"CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o" \
-"CMakeFiles/caffe.dir/util/im2col.cpp.o" \
-"CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o" \
-"CMakeFiles/caffe.dir/util/db_leveldb.cpp.o" \
-"CMakeFiles/caffe.dir/util/ocl_util.cpp.o" \
-"CMakeFiles/caffe.dir/util/insert_splits.cpp.o" \
-"CMakeFiles/caffe.dir/util/db_lmdb.cpp.o" \
-"CMakeFiles/caffe.dir/util/math_functions.cpp.o" \
-"CMakeFiles/caffe.dir/util/io.cpp.o" \
-"CMakeFiles/caffe.dir/util/cudnn.cpp.o" \
-"CMakeFiles/caffe.dir/util/db.cpp.o" \
-"CMakeFiles/caffe.dir/util/benchmark.cpp.o" \
-"CMakeFiles/caffe.dir/device.cpp.o" \
-"CMakeFiles/caffe.dir/internal_thread.cpp.o" \
-"CMakeFiles/caffe.dir/data_transformer.cpp.o" \
-"CMakeFiles/caffe.dir/net.cpp.o" \
-"CMakeFiles/caffe.dir/solver.cpp.o" \
-"CMakeFiles/caffe.dir/layer_factory.cpp.o" \
-"CMakeFiles/caffe.dir/syncedmem.cpp.o" \
-"CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/log_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/spp_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/exp_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/power_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/relu_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/split_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/silence_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/slice_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/absval_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/loss_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/concat_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/filter_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/conv_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/data_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o" \
-"CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o"
-
-# External object files for target caffe
-caffe_EXTERNAL_OBJECTS = \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o" \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o"
-
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/common.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/device.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/net.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/build.make
-lib/libcaffe.so: lib/libproto.a
-lib/libcaffe.so: lib/libproto.a
-lib/libcaffe.so: /usr/local/lib/libboost_system.so
-lib/libcaffe.so: /usr/local/lib/libboost_thread.so
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libpthread.so
-lib/libcaffe.so: /usr/local/lib/libglog.so
-lib/libcaffe.so: /usr/local/lib/libgflags.a
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libprotobuf.so
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5.so
-lib/libcaffe.so: /usr/local/lib/liblmdb.so
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libleveldb.so
-lib/libcaffe.so: /usr/lib/libsnappy.so
-lib/libcaffe.so: /usr/local/cuda/lib64/libcudart.so
-lib/libcaffe.so: /usr/local/cuda/lib64/libcurand.so
-lib/libcaffe.so: /usr/local/cuda/lib64/libcublas.so
-lib/libcaffe.so: /usr/local/lib/libopencv_core.so.2.4.10
-lib/libcaffe.so: /usr/local/lib/libopencv_highgui.so.2.4.10
-lib/libcaffe.so: /usr/local/lib/libopencv_imgproc.so.2.4.10
-lib/libcaffe.so: /usr/lib/liblapack_atlas.so
-lib/libcaffe.so: /usr/lib/libcblas.so
-lib/libcaffe.so: /usr/lib/libatlas.so
-lib/libcaffe.so: /usr/local/lib/libglog.so
-lib/libcaffe.so: /usr/local/lib/libgflags.a
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libprotobuf.so
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libhdf5.so
-lib/libcaffe.so: /usr/local/lib/liblmdb.so
-lib/libcaffe.so: /usr/lib/x86_64-linux-gnu/libleveldb.so
-lib/libcaffe.so: /usr/lib/libsnappy.so
-lib/libcaffe.so: /usr/local/cuda/lib64/libcudart.so
-lib/libcaffe.so: /usr/local/cuda/lib64/libcurand.so
-lib/libcaffe.so: /usr/local/cuda/lib64/libcublas.so
-lib/libcaffe.so: /usr/lib/liblapack_atlas.so
-lib/libcaffe.so: /usr/lib/libcblas.so
-lib/libcaffe.so: /usr/lib/libatlas.so
-lib/libcaffe.so: /usr/local/lib/libopencv_core.so.2.4.10
-lib/libcaffe.so: src/caffe/CMakeFiles/caffe.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX shared library ../../lib/libcaffe.so"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/caffe.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-src/caffe/CMakeFiles/caffe.dir/build: lib/libcaffe.so
-.PHONY : src/caffe/CMakeFiles/caffe.dir/build
-
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/common.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/blob.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/device.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/net.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/solver.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o.requires
-src/caffe/CMakeFiles/caffe.dir/requires: src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o.requires
-.PHONY : src/caffe/CMakeFiles/caffe.dir/requires
-
-src/caffe/CMakeFiles/caffe.dir/clean:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/caffe.dir/cmake_clean.cmake
-.PHONY : src/caffe/CMakeFiles/caffe.dir/clean
-
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o
-src/caffe/CMakeFiles/caffe.dir/depend: src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : src/caffe/CMakeFiles/caffe.dir/depend
-
diff --git a/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake b/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake
deleted file mode 100644
index 344db002..00000000
--- a/src/caffe/CMakeFiles/caffe.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,126 +0,0 @@
-FILE(REMOVE_RECURSE
-  "CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o"
-  "CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o"
-  "CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o"
-  "CMakeFiles/caffe.dir/common.cpp.o"
-  "CMakeFiles/caffe.dir/blob.cpp.o"
-  "CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o"
-  "CMakeFiles/caffe.dir/util/im2col.cpp.o"
-  "CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o"
-  "CMakeFiles/caffe.dir/util/db_leveldb.cpp.o"
-  "CMakeFiles/caffe.dir/util/ocl_util.cpp.o"
-  "CMakeFiles/caffe.dir/util/insert_splits.cpp.o"
-  "CMakeFiles/caffe.dir/util/db_lmdb.cpp.o"
-  "CMakeFiles/caffe.dir/util/math_functions.cpp.o"
-  "CMakeFiles/caffe.dir/util/io.cpp.o"
-  "CMakeFiles/caffe.dir/util/cudnn.cpp.o"
-  "CMakeFiles/caffe.dir/util/db.cpp.o"
-  "CMakeFiles/caffe.dir/util/benchmark.cpp.o"
-  "CMakeFiles/caffe.dir/device.cpp.o"
-  "CMakeFiles/caffe.dir/internal_thread.cpp.o"
-  "CMakeFiles/caffe.dir/data_transformer.cpp.o"
-  "CMakeFiles/caffe.dir/net.cpp.o"
-  "CMakeFiles/caffe.dir/solver.cpp.o"
-  "CMakeFiles/caffe.dir/layer_factory.cpp.o"
-  "CMakeFiles/caffe.dir/syncedmem.cpp.o"
-  "CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/log_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/spp_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/exp_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/power_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/relu_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/split_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/silence_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/slice_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/absval_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/loss_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/concat_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/filter_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/conv_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/data_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o"
-  "CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o"
-  "../../lib/libcaffe.pdb"
-  "../../lib/libcaffe.so"
-)
-
-# Per-language clean rules from dependency scanning.
-FOREACH(lang CXX)
-  INCLUDE(CMakeFiles/caffe.dir/cmake_clean_${lang}.cmake OPTIONAL)
-ENDFOREACH(lang)
diff --git a/src/caffe/CMakeFiles/caffe.dir/depend.make b/src/caffe/CMakeFiles/caffe.dir/depend.make
deleted file mode 100644
index 0b20d16b..00000000
--- a/src/caffe/CMakeFiles/caffe.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for caffe.
-# This may be replaced when dependencies are built.
diff --git a/src/caffe/CMakeFiles/caffe.dir/flags.make b/src/caffe/CMakeFiles/caffe.dir/flags.make
deleted file mode 100644
index 494d36e8..00000000
--- a/src/caffe/CMakeFiles/caffe.dir/flags.make
+++ /dev/null
@@ -1,8 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# compile CXX with /usr/bin/c++
-CXX_FLAGS =  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -fPIC -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe   
-
-CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE -Dcaffe_EXPORTS
-
diff --git a/src/caffe/CMakeFiles/caffe.dir/link.txt b/src/caffe/CMakeFiles/caffe.dir/link.txt
deleted file mode 100644
index 603d461f..00000000
--- a/src/caffe/CMakeFiles/caffe.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/usr/bin/c++  -fPIC  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG   -shared -Wl,-soname,libcaffe.so -o ../../lib/libcaffe.so CMakeFiles/caffe.dir/common.cpp.o CMakeFiles/caffe.dir/blob.cpp.o CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o CMakeFiles/caffe.dir/util/im2col.cpp.o CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o CMakeFiles/caffe.dir/util/db_leveldb.cpp.o CMakeFiles/caffe.dir/util/ocl_util.cpp.o CMakeFiles/caffe.dir/util/insert_splits.cpp.o CMakeFiles/caffe.dir/util/db_lmdb.cpp.o CMakeFiles/caffe.dir/util/math_functions.cpp.o CMakeFiles/caffe.dir/util/io.cpp.o CMakeFiles/caffe.dir/util/cudnn.cpp.o CMakeFiles/caffe.dir/util/db.cpp.o CMakeFiles/caffe.dir/util/benchmark.cpp.o CMakeFiles/caffe.dir/device.cpp.o CMakeFiles/caffe.dir/internal_thread.cpp.o CMakeFiles/caffe.dir/data_transformer.cpp.o CMakeFiles/caffe.dir/net.cpp.o CMakeFiles/caffe.dir/solver.cpp.o CMakeFiles/caffe.dir/layer_factory.cpp.o CMakeFiles/caffe.dir/syncedmem.cpp.o CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/log_layer.cpp.o CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o CMakeFiles/caffe.dir/layers/spp_layer.cpp.o CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o CMakeFiles/caffe.dir/layers/exp_layer.cpp.o CMakeFiles/caffe.dir/layers/power_layer.cpp.o CMakeFiles/caffe.dir/layers/relu_layer.cpp.o CMakeFiles/caffe.dir/layers/split_layer.cpp.o CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o CMakeFiles/caffe.dir/layers/silence_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o CMakeFiles/caffe.dir/layers/slice_layer.cpp.o CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o CMakeFiles/caffe.dir/layers/absval_layer.cpp.o CMakeFiles/caffe.dir/layers/loss_layer.cpp.o CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/concat_layer.cpp.o CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o CMakeFiles/caffe.dir/layers/filter_layer.cpp.o CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o CMakeFiles/caffe.dir/layers/conv_layer.cpp.o CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o CMakeFiles/caffe.dir/layers/data_layer.cpp.o CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o  -L/usr/local/cuda/lib64  -L/usr/local/lib ../../lib/libproto.a ../../lib/libproto.a /usr/local/lib/libboost_system.so /usr/local/lib/libboost_thread.so -lpthread -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lpthread -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so /usr/local/lib/libopencv_core.so.2.4.10 /usr/local/lib/libopencv_highgui.so.2.4.10 /usr/local/lib/libopencv_imgproc.so.2.4.10 -llapack_atlas -lcblas -latlas /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so -llapack_atlas -lcblas -latlas /usr/local/lib/libopencv_core.so.2.4.10 -Wl,-rpath,/usr/local/cuda/lib64:/usr/local/lib::::::::::::::::::::::::::::::::::::::::::::::::::::::::: 
diff --git a/src/caffe/CMakeFiles/caffe.dir/progress.make b/src/caffe/CMakeFiles/caffe.dir/progress.make
deleted file mode 100644
index d53ba6a8..00000000
--- a/src/caffe/CMakeFiles/caffe.dir/progress.make
+++ /dev/null
@@ -1,118 +0,0 @@
-CMAKE_PROGRESS_1 = 
-CMAKE_PROGRESS_2 = 1
-CMAKE_PROGRESS_3 = 
-CMAKE_PROGRESS_4 = 2
-CMAKE_PROGRESS_5 = 
-CMAKE_PROGRESS_6 = 3
-CMAKE_PROGRESS_7 = 
-CMAKE_PROGRESS_8 = 4
-CMAKE_PROGRESS_9 = 
-CMAKE_PROGRESS_10 = 5
-CMAKE_PROGRESS_11 = 
-CMAKE_PROGRESS_12 = 6
-CMAKE_PROGRESS_13 = 
-CMAKE_PROGRESS_14 = 7
-CMAKE_PROGRESS_15 = 
-CMAKE_PROGRESS_16 = 8
-CMAKE_PROGRESS_17 = 
-CMAKE_PROGRESS_18 = 9
-CMAKE_PROGRESS_19 = 
-CMAKE_PROGRESS_20 = 10
-CMAKE_PROGRESS_21 = 
-CMAKE_PROGRESS_22 = 11
-CMAKE_PROGRESS_23 = 
-CMAKE_PROGRESS_24 = 12
-CMAKE_PROGRESS_25 = 
-CMAKE_PROGRESS_26 = 13
-CMAKE_PROGRESS_27 = 
-CMAKE_PROGRESS_28 = 14
-CMAKE_PROGRESS_29 = 
-CMAKE_PROGRESS_30 = 15
-CMAKE_PROGRESS_31 = 
-CMAKE_PROGRESS_32 = 16
-CMAKE_PROGRESS_33 = 
-CMAKE_PROGRESS_34 = 17
-CMAKE_PROGRESS_35 = 
-CMAKE_PROGRESS_36 = 18
-CMAKE_PROGRESS_37 = 
-CMAKE_PROGRESS_38 = 19
-CMAKE_PROGRESS_39 = 
-CMAKE_PROGRESS_40 = 20
-CMAKE_PROGRESS_41 = 
-CMAKE_PROGRESS_42 = 21
-CMAKE_PROGRESS_43 = 
-CMAKE_PROGRESS_44 = 22
-CMAKE_PROGRESS_45 = 
-CMAKE_PROGRESS_46 = 23
-CMAKE_PROGRESS_47 = 
-CMAKE_PROGRESS_48 = 24
-CMAKE_PROGRESS_49 = 
-CMAKE_PROGRESS_50 = 25
-CMAKE_PROGRESS_51 = 
-CMAKE_PROGRESS_52 = 26
-CMAKE_PROGRESS_53 = 
-CMAKE_PROGRESS_54 = 27
-CMAKE_PROGRESS_55 = 
-CMAKE_PROGRESS_56 = 28
-CMAKE_PROGRESS_57 = 
-CMAKE_PROGRESS_58 = 29
-CMAKE_PROGRESS_59 = 
-CMAKE_PROGRESS_60 = 30
-CMAKE_PROGRESS_61 = 
-CMAKE_PROGRESS_62 = 31
-CMAKE_PROGRESS_63 = 
-CMAKE_PROGRESS_64 = 32
-CMAKE_PROGRESS_65 = 
-CMAKE_PROGRESS_66 = 33
-CMAKE_PROGRESS_67 = 34
-CMAKE_PROGRESS_68 = 
-CMAKE_PROGRESS_69 = 35
-CMAKE_PROGRESS_70 = 
-CMAKE_PROGRESS_71 = 36
-CMAKE_PROGRESS_72 = 
-CMAKE_PROGRESS_73 = 37
-CMAKE_PROGRESS_74 = 
-CMAKE_PROGRESS_75 = 38
-CMAKE_PROGRESS_76 = 
-CMAKE_PROGRESS_77 = 39
-CMAKE_PROGRESS_78 = 
-CMAKE_PROGRESS_79 = 40
-CMAKE_PROGRESS_80 = 
-CMAKE_PROGRESS_81 = 41
-CMAKE_PROGRESS_82 = 
-CMAKE_PROGRESS_83 = 42
-CMAKE_PROGRESS_84 = 
-CMAKE_PROGRESS_85 = 43
-CMAKE_PROGRESS_86 = 
-CMAKE_PROGRESS_87 = 44
-CMAKE_PROGRESS_88 = 
-CMAKE_PROGRESS_89 = 45
-CMAKE_PROGRESS_90 = 
-CMAKE_PROGRESS_91 = 46
-CMAKE_PROGRESS_92 = 
-CMAKE_PROGRESS_93 = 47
-CMAKE_PROGRESS_94 = 
-CMAKE_PROGRESS_95 = 48
-CMAKE_PROGRESS_96 = 
-CMAKE_PROGRESS_97 = 49
-CMAKE_PROGRESS_98 = 
-CMAKE_PROGRESS_99 = 50
-CMAKE_PROGRESS_100 = 
-CMAKE_PROGRESS_101 = 51
-CMAKE_PROGRESS_102 = 
-CMAKE_PROGRESS_103 = 52
-CMAKE_PROGRESS_104 = 
-CMAKE_PROGRESS_105 = 53
-CMAKE_PROGRESS_106 = 
-CMAKE_PROGRESS_107 = 54
-CMAKE_PROGRESS_108 = 
-CMAKE_PROGRESS_109 = 55
-CMAKE_PROGRESS_110 = 
-CMAKE_PROGRESS_111 = 56
-CMAKE_PROGRESS_112 = 
-CMAKE_PROGRESS_113 = 57
-CMAKE_PROGRESS_114 = 
-CMAKE_PROGRESS_115 = 58
-CMAKE_PROGRESS_116 = 
-CMAKE_PROGRESS_117 = 59
-
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
deleted file mode 100644
index 2b3197e9..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/absval_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_absval_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_absval_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
deleted file mode 100644
index 5558d70f..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/base_data_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_base_data_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_base_data_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
deleted file mode 100644
index ae71cc72..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/bnll_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_bnll_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_bnll_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
deleted file mode 100644
index 48e8560a..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/concat_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_concat_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_concat_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
deleted file mode 100644
index c5f6dca9..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/contrastive_loss_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_contrastive_loss_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_contrastive_loss_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
deleted file mode 100644
index 311ad242..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/conv_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_conv_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_conv_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
deleted file mode 100644
index 06210cf1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_conv_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_conv_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_conv_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
deleted file mode 100644
index 8f7960d4..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_pooling_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_pooling_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_pooling_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
deleted file mode 100644
index 308889ee..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_relu_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_relu_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_relu_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
deleted file mode 100644
index d65ebd00..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_sigmoid_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_sigmoid_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_sigmoid_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
deleted file mode 100644
index 806067ce..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_softmax_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_softmax_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_softmax_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
deleted file mode 100644
index 7ace65eb..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/cudnn_tanh_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_cudnn_tanh_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_cudnn_tanh_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
deleted file mode 100644
index bc67ea5b..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/deconv_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_deconv_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_deconv_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
deleted file mode 100644
index 5ff06e9f..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/dropout_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_dropout_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_dropout_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
deleted file mode 100644
index 44e91898..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/eltwise_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_eltwise_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_eltwise_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
deleted file mode 100644
index 98ee3de7..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/euclidean_loss_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_euclidean_loss_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_euclidean_loss_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
deleted file mode 100644
index 2402999e..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/exp_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_exp_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_exp_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
deleted file mode 100644
index 83a032df..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/filter_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_filter_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_filter_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
deleted file mode 100644
index a88ed54d..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/hdf5_data_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_data_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_data_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
deleted file mode 100644
index 252b9dfd..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/hdf5_output_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_hdf5_output_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_hdf5_output_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
deleted file mode 100644
index 6bda58ec..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/im2col_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_im2col_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_im2col_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
deleted file mode 100644
index eac6680c..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/inner_product_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_inner_product_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_inner_product_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
deleted file mode 100644
index d18371a0..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/log_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_log_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_log_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
deleted file mode 100644
index c3c715f8..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/lrn_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_lrn_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_lrn_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
deleted file mode 100644
index 663f4478..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/mvn_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_mvn_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_mvn_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
deleted file mode 100644
index 866d0f93..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/pooling_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_pooling_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_pooling_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
deleted file mode 100644
index c6c30190..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/power_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_power_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_power_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
deleted file mode 100644
index c64cff0e..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/prelu_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_prelu_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_prelu_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
deleted file mode 100644
index b926deab..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/reduction_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_reduction_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_reduction_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
deleted file mode 100644
index 27fda108..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/relu_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_relu_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_relu_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
deleted file mode 100644
index 63d7ac68..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend
deleted file mode 100644
index a7e2268a..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_cross_entropy_loss_layer.cu.o.depend
+++ /dev/null
@@ -1,470 +0,0 @@
-# Generated by: make2cmake.cmake
-SET(CUDA_NVCC_DEPEND
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/blob.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_transformer.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/filler.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/internal_thread.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer_factory.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/loss_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/neuron_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/syncedmem.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/db.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/vision_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_cross_entropy_loss_layer.cu"
- "/opt/clBLAS-private-april8/include/clBLAS-complex.h"
- "/opt/clBLAS-private-april8/include/clBLAS.h"
- "/usr/include/H5ACpublic.h"
- "/usr/include/H5Apublic.h"
- "/usr/include/H5Cpublic.h"
- "/usr/include/H5Dpublic.h"
- "/usr/include/H5Epubgen.h"
- "/usr/include/H5Epublic.h"
- "/usr/include/H5FDcore.h"
- "/usr/include/H5FDdirect.h"
- "/usr/include/H5FDfamily.h"
- "/usr/include/H5FDlog.h"
- "/usr/include/H5FDmpi.h"
- "/usr/include/H5FDmpio.h"
- "/usr/include/H5FDmpiposix.h"
- "/usr/include/H5FDmulti.h"
- "/usr/include/H5FDpublic.h"
- "/usr/include/H5FDsec2.h"
- "/usr/include/H5FDstdio.h"
- "/usr/include/H5Fpublic.h"
- "/usr/include/H5Gpublic.h"
- "/usr/include/H5Ipublic.h"
- "/usr/include/H5Lpublic.h"
- "/usr/include/H5MMpublic.h"
- "/usr/include/H5Opublic.h"
- "/usr/include/H5Ppublic.h"
- "/usr/include/H5Rpublic.h"
- "/usr/include/H5Spublic.h"
- "/usr/include/H5Tpublic.h"
- "/usr/include/H5Zpublic.h"
- "/usr/include/H5api_adpt.h"
- "/usr/include/H5pubconf.h"
- "/usr/include/H5public.h"
- "/usr/include/H5version.h"
- "/usr/include/_G_config.h"
- "/usr/include/alloca.h"
- "/usr/include/asm-generic/errno-base.h"
- "/usr/include/asm-generic/errno.h"
- "/usr/include/assert.h"
- "/usr/include/atlas/cblas.h"
- "/usr/include/c++/4.8/algorithm"
- "/usr/include/c++/4.8/backward/auto_ptr.h"
- "/usr/include/c++/4.8/backward/binders.h"
- "/usr/include/c++/4.8/bits/algorithmfwd.h"
- "/usr/include/c++/4.8/bits/allocator.h"
- "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h"
- "/usr/include/c++/4.8/bits/basic_ios.h"
- "/usr/include/c++/4.8/bits/basic_ios.tcc"
- "/usr/include/c++/4.8/bits/basic_string.h"
- "/usr/include/c++/4.8/bits/basic_string.tcc"
- "/usr/include/c++/4.8/bits/char_traits.h"
- "/usr/include/c++/4.8/bits/codecvt.h"
- "/usr/include/c++/4.8/bits/concept_check.h"
- "/usr/include/c++/4.8/bits/cpp_type_traits.h"
- "/usr/include/c++/4.8/bits/cxxabi_forced.h"
- "/usr/include/c++/4.8/bits/exception_defines.h"
- "/usr/include/c++/4.8/bits/fstream.tcc"
- "/usr/include/c++/4.8/bits/functexcept.h"
- "/usr/include/c++/4.8/bits/ios_base.h"
- "/usr/include/c++/4.8/bits/istream.tcc"
- "/usr/include/c++/4.8/bits/locale_classes.h"
- "/usr/include/c++/4.8/bits/locale_classes.tcc"
- "/usr/include/c++/4.8/bits/locale_facets.h"
- "/usr/include/c++/4.8/bits/locale_facets.tcc"
- "/usr/include/c++/4.8/bits/localefwd.h"
- "/usr/include/c++/4.8/bits/memoryfwd.h"
- "/usr/include/c++/4.8/bits/move.h"
- "/usr/include/c++/4.8/bits/ostream.tcc"
- "/usr/include/c++/4.8/bits/ostream_insert.h"
- "/usr/include/c++/4.8/bits/postypes.h"
- "/usr/include/c++/4.8/bits/range_access.h"
- "/usr/include/c++/4.8/bits/sstream.tcc"
- "/usr/include/c++/4.8/bits/stl_algo.h"
- "/usr/include/c++/4.8/bits/stl_algobase.h"
- "/usr/include/c++/4.8/bits/stl_bvector.h"
- "/usr/include/c++/4.8/bits/stl_construct.h"
- "/usr/include/c++/4.8/bits/stl_function.h"
- "/usr/include/c++/4.8/bits/stl_heap.h"
- "/usr/include/c++/4.8/bits/stl_iterator.h"
- "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h"
- "/usr/include/c++/4.8/bits/stl_iterator_base_types.h"
- "/usr/include/c++/4.8/bits/stl_map.h"
- "/usr/include/c++/4.8/bits/stl_multimap.h"
- "/usr/include/c++/4.8/bits/stl_multiset.h"
- "/usr/include/c++/4.8/bits/stl_pair.h"
- "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h"
- "/usr/include/c++/4.8/bits/stl_relops.h"
- "/usr/include/c++/4.8/bits/stl_set.h"
- "/usr/include/c++/4.8/bits/stl_tempbuf.h"
- "/usr/include/c++/4.8/bits/stl_tree.h"
- "/usr/include/c++/4.8/bits/stl_uninitialized.h"
- "/usr/include/c++/4.8/bits/stl_vector.h"
- "/usr/include/c++/4.8/bits/stream_iterator.h"
- "/usr/include/c++/4.8/bits/streambuf.tcc"
- "/usr/include/c++/4.8/bits/streambuf_iterator.h"
- "/usr/include/c++/4.8/bits/stringfwd.h"
- "/usr/include/c++/4.8/bits/vector.tcc"
- "/usr/include/c++/4.8/cctype"
- "/usr/include/c++/4.8/cfloat"
- "/usr/include/c++/4.8/climits"
- "/usr/include/c++/4.8/clocale"
- "/usr/include/c++/4.8/cmath"
- "/usr/include/c++/4.8/cstddef"
- "/usr/include/c++/4.8/cstdio"
- "/usr/include/c++/4.8/cstdlib"
- "/usr/include/c++/4.8/cwchar"
- "/usr/include/c++/4.8/cwctype"
- "/usr/include/c++/4.8/cxxabi.h"
- "/usr/include/c++/4.8/debug/debug.h"
- "/usr/include/c++/4.8/exception"
- "/usr/include/c++/4.8/ext/alloc_traits.h"
- "/usr/include/c++/4.8/ext/atomicity.h"
- "/usr/include/c++/4.8/ext/new_allocator.h"
- "/usr/include/c++/4.8/ext/numeric_traits.h"
- "/usr/include/c++/4.8/ext/type_traits.h"
- "/usr/include/c++/4.8/fstream"
- "/usr/include/c++/4.8/functional"
- "/usr/include/c++/4.8/ios"
- "/usr/include/c++/4.8/iosfwd"
- "/usr/include/c++/4.8/iostream"
- "/usr/include/c++/4.8/istream"
- "/usr/include/c++/4.8/iterator"
- "/usr/include/c++/4.8/map"
- "/usr/include/c++/4.8/memory"
- "/usr/include/c++/4.8/new"
- "/usr/include/c++/4.8/ostream"
- "/usr/include/c++/4.8/set"
- "/usr/include/c++/4.8/sstream"
- "/usr/include/c++/4.8/streambuf"
- "/usr/include/c++/4.8/string"
- "/usr/include/c++/4.8/typeinfo"
- "/usr/include/c++/4.8/utility"
- "/usr/include/c++/4.8/vector"
- "/usr/include/ctype.h"
- "/usr/include/endian.h"
- "/usr/include/errno.h"
- "/usr/include/features.h"
- "/usr/include/getopt.h"
- "/usr/include/google/protobuf/descriptor.h"
- "/usr/include/google/protobuf/extension_set.h"
- "/usr/include/google/protobuf/generated_enum_reflection.h"
- "/usr/include/google/protobuf/generated_message_util.h"
- "/usr/include/google/protobuf/message.h"
- "/usr/include/google/protobuf/message_lite.h"
- "/usr/include/google/protobuf/repeated_field.h"
- "/usr/include/google/protobuf/stubs/common.h"
- "/usr/include/google/protobuf/stubs/template_util.h"
- "/usr/include/google/protobuf/stubs/type_traits.h"
- "/usr/include/google/protobuf/unknown_field_set.h"
- "/usr/include/hdf5.h"
- "/usr/include/inttypes.h"
- "/usr/include/libio.h"
- "/usr/include/limits.h"
- "/usr/include/linux/errno.h"
- "/usr/include/linux/limits.h"
- "/usr/include/locale.h"
- "/usr/include/math.h"
- "/usr/include/pthread.h"
- "/usr/include/sched.h"
- "/usr/include/stdc-predef.h"
- "/usr/include/stdint.h"
- "/usr/include/stdio.h"
- "/usr/include/stdlib.h"
- "/usr/include/string.h"
- "/usr/include/time.h"
- "/usr/include/unistd.h"
- "/usr/include/wchar.h"
- "/usr/include/wctype.h"
- "/usr/include/x86_64-linux-gnu/asm/errno.h"
- "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h"
- "/usr/include/x86_64-linux-gnu/bits/byteswap.h"
- "/usr/include/x86_64-linux-gnu/bits/confname.h"
- "/usr/include/x86_64-linux-gnu/bits/endian.h"
- "/usr/include/x86_64-linux-gnu/bits/environments.h"
- "/usr/include/x86_64-linux-gnu/bits/errno.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_val.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_valf.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_vall.h"
- "/usr/include/x86_64-linux-gnu/bits/inf.h"
- "/usr/include/x86_64-linux-gnu/bits/local_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/locale.h"
- "/usr/include/x86_64-linux-gnu/bits/mathcalls.h"
- "/usr/include/x86_64-linux-gnu/bits/mathdef.h"
- "/usr/include/x86_64-linux-gnu/bits/mathinline.h"
- "/usr/include/x86_64-linux-gnu/bits/nan.h"
- "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/posix_opt.h"
- "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h"
- "/usr/include/x86_64-linux-gnu/bits/sched.h"
- "/usr/include/x86_64-linux-gnu/bits/select.h"
- "/usr/include/x86_64-linux-gnu/bits/select2.h"
- "/usr/include/x86_64-linux-gnu/bits/setjmp.h"
- "/usr/include/x86_64-linux-gnu/bits/sigset.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio2.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib.h"
- "/usr/include/x86_64-linux-gnu/bits/string3.h"
- "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h"
- "/usr/include/x86_64-linux-gnu/bits/time.h"
- "/usr/include/x86_64-linux-gnu/bits/timex.h"
- "/usr/include/x86_64-linux-gnu/bits/types.h"
- "/usr/include/x86_64-linux-gnu/bits/typesizes.h"
- "/usr/include/x86_64-linux-gnu/bits/unistd.h"
- "/usr/include/x86_64-linux-gnu/bits/waitflags.h"
- "/usr/include/x86_64-linux-gnu/bits/waitstatus.h"
- "/usr/include/x86_64-linux-gnu/bits/wchar.h"
- "/usr/include/x86_64-linux-gnu/bits/wchar2.h"
- "/usr/include/x86_64-linux-gnu/bits/wordsize.h"
- "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h"
- "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h"
- "/usr/include/x86_64-linux-gnu/gnu/stubs.h"
- "/usr/include/x86_64-linux-gnu/sys/cdefs.h"
- "/usr/include/x86_64-linux-gnu/sys/select.h"
- "/usr/include/x86_64-linux-gnu/sys/sysmacros.h"
- "/usr/include/x86_64-linux-gnu/sys/types.h"
- "/usr/include/xlocale.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/float.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h"
- "/usr/local/cuda-6.5/include/CL/cl.h"
- "/usr/local/cuda-6.5/include/CL/cl_ext.h"
- "/usr/local/cuda-6.5/include/CL/cl_platform.h"
- "/usr/local/cuda-6.5/include/builtin_types.h"
- "/usr/local/cuda-6.5/include/channel_descriptor.h"
- "/usr/local/cuda-6.5/include/common_functions.h"
- "/usr/local/cuda-6.5/include/cuComplex.h"
- "/usr/local/cuda-6.5/include/cublas_api.h"
- "/usr/local/cuda-6.5/include/cublas_v2.h"
- "/usr/local/cuda-6.5/include/cuda.h"
- "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h"
- "/usr/local/cuda-6.5/include/cuda_runtime.h"
- "/usr/local/cuda-6.5/include/cuda_runtime_api.h"
- "/usr/local/cuda-6.5/include/cuda_surface_types.h"
- "/usr/local/cuda-6.5/include/cuda_texture_types.h"
- "/usr/local/cuda-6.5/include/curand.h"
- "/usr/local/cuda-6.5/include/device_functions.h"
- "/usr/local/cuda-6.5/include/device_launch_parameters.h"
- "/usr/local/cuda-6.5/include/device_types.h"
- "/usr/local/cuda-6.5/include/driver_functions.h"
- "/usr/local/cuda-6.5/include/driver_types.h"
- "/usr/local/cuda-6.5/include/host_config.h"
- "/usr/local/cuda-6.5/include/host_defines.h"
- "/usr/local/cuda-6.5/include/math_functions.h"
- "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h"
- "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_13_double_functions.h"
- "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_20_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_30_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_32_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_35_intrinsics.h"
- "/usr/local/cuda-6.5/include/surface_functions.h"
- "/usr/local/cuda-6.5/include/surface_indirect_functions.h"
- "/usr/local/cuda-6.5/include/surface_types.h"
- "/usr/local/cuda-6.5/include/texture_fetch_functions.h"
- "/usr/local/cuda-6.5/include/texture_indirect_functions.h"
- "/usr/local/cuda-6.5/include/texture_types.h"
- "/usr/local/cuda-6.5/include/vector_functions.h"
- "/usr/local/cuda-6.5/include/vector_types.h"
- "/usr/local/include/boost/assert.hpp"
- "/usr/local/include/boost/checked_delete.hpp"
- "/usr/local/include/boost/config.hpp"
- "/usr/local/include/boost/config/compiler/gcc.hpp"
- "/usr/local/include/boost/config/compiler/nvcc.hpp"
- "/usr/local/include/boost/config/no_tr1/memory.hpp"
- "/usr/local/include/boost/config/no_tr1/utility.hpp"
- "/usr/local/include/boost/config/platform/linux.hpp"
- "/usr/local/include/boost/config/posix_features.hpp"
- "/usr/local/include/boost/config/select_compiler_config.hpp"
- "/usr/local/include/boost/config/select_platform_config.hpp"
- "/usr/local/include/boost/config/select_stdlib_config.hpp"
- "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp"
- "/usr/local/include/boost/config/suffix.hpp"
- "/usr/local/include/boost/config/user.hpp"
- "/usr/local/include/boost/core/checked_delete.hpp"
- "/usr/local/include/boost/core/demangle.hpp"
- "/usr/local/include/boost/core/typeinfo.hpp"
- "/usr/local/include/boost/current_function.hpp"
- "/usr/local/include/boost/detail/sp_typeinfo.hpp"
- "/usr/local/include/boost/detail/workaround.hpp"
- "/usr/local/include/boost/exception/exception.hpp"
- "/usr/local/include/boost/predef.h"
- "/usr/local/include/boost/predef/architecture.h"
- "/usr/local/include/boost/predef/architecture/alpha.h"
- "/usr/local/include/boost/predef/architecture/arm.h"
- "/usr/local/include/boost/predef/architecture/blackfin.h"
- "/usr/local/include/boost/predef/architecture/convex.h"
- "/usr/local/include/boost/predef/architecture/ia64.h"
- "/usr/local/include/boost/predef/architecture/m68k.h"
- "/usr/local/include/boost/predef/architecture/mips.h"
- "/usr/local/include/boost/predef/architecture/parisc.h"
- "/usr/local/include/boost/predef/architecture/ppc.h"
- "/usr/local/include/boost/predef/architecture/pyramid.h"
- "/usr/local/include/boost/predef/architecture/rs6k.h"
- "/usr/local/include/boost/predef/architecture/sparc.h"
- "/usr/local/include/boost/predef/architecture/superh.h"
- "/usr/local/include/boost/predef/architecture/sys370.h"
- "/usr/local/include/boost/predef/architecture/sys390.h"
- "/usr/local/include/boost/predef/architecture/x86.h"
- "/usr/local/include/boost/predef/architecture/x86/32.h"
- "/usr/local/include/boost/predef/architecture/x86/64.h"
- "/usr/local/include/boost/predef/architecture/z.h"
- "/usr/local/include/boost/predef/compiler.h"
- "/usr/local/include/boost/predef/compiler/borland.h"
- "/usr/local/include/boost/predef/compiler/clang.h"
- "/usr/local/include/boost/predef/compiler/comeau.h"
- "/usr/local/include/boost/predef/compiler/compaq.h"
- "/usr/local/include/boost/predef/compiler/diab.h"
- "/usr/local/include/boost/predef/compiler/digitalmars.h"
- "/usr/local/include/boost/predef/compiler/dignus.h"
- "/usr/local/include/boost/predef/compiler/edg.h"
- "/usr/local/include/boost/predef/compiler/ekopath.h"
- "/usr/local/include/boost/predef/compiler/gcc.h"
- "/usr/local/include/boost/predef/compiler/gcc_xml.h"
- "/usr/local/include/boost/predef/compiler/greenhills.h"
- "/usr/local/include/boost/predef/compiler/hp_acc.h"
- "/usr/local/include/boost/predef/compiler/iar.h"
- "/usr/local/include/boost/predef/compiler/ibm.h"
- "/usr/local/include/boost/predef/compiler/intel.h"
- "/usr/local/include/boost/predef/compiler/kai.h"
- "/usr/local/include/boost/predef/compiler/llvm.h"
- "/usr/local/include/boost/predef/compiler/metaware.h"
- "/usr/local/include/boost/predef/compiler/metrowerks.h"
- "/usr/local/include/boost/predef/compiler/microtec.h"
- "/usr/local/include/boost/predef/compiler/mpw.h"
- "/usr/local/include/boost/predef/compiler/palm.h"
- "/usr/local/include/boost/predef/compiler/pgi.h"
- "/usr/local/include/boost/predef/compiler/sgi_mipspro.h"
- "/usr/local/include/boost/predef/compiler/sunpro.h"
- "/usr/local/include/boost/predef/compiler/tendra.h"
- "/usr/local/include/boost/predef/compiler/visualc.h"
- "/usr/local/include/boost/predef/compiler/watcom.h"
- "/usr/local/include/boost/predef/detail/_cassert.h"
- "/usr/local/include/boost/predef/detail/_exception.h"
- "/usr/local/include/boost/predef/detail/comp_detected.h"
- "/usr/local/include/boost/predef/detail/os_detected.h"
- "/usr/local/include/boost/predef/detail/test.h"
- "/usr/local/include/boost/predef/language.h"
- "/usr/local/include/boost/predef/language/objc.h"
- "/usr/local/include/boost/predef/language/stdc.h"
- "/usr/local/include/boost/predef/language/stdcpp.h"
- "/usr/local/include/boost/predef/library.h"
- "/usr/local/include/boost/predef/library/c.h"
- "/usr/local/include/boost/predef/library/c/_prefix.h"
- "/usr/local/include/boost/predef/library/c/gnu.h"
- "/usr/local/include/boost/predef/library/c/uc.h"
- "/usr/local/include/boost/predef/library/c/vms.h"
- "/usr/local/include/boost/predef/library/c/zos.h"
- "/usr/local/include/boost/predef/library/std.h"
- "/usr/local/include/boost/predef/library/std/_prefix.h"
- "/usr/local/include/boost/predef/library/std/cxx.h"
- "/usr/local/include/boost/predef/library/std/dinkumware.h"
- "/usr/local/include/boost/predef/library/std/libcomo.h"
- "/usr/local/include/boost/predef/library/std/modena.h"
- "/usr/local/include/boost/predef/library/std/msl.h"
- "/usr/local/include/boost/predef/library/std/roguewave.h"
- "/usr/local/include/boost/predef/library/std/sgi.h"
- "/usr/local/include/boost/predef/library/std/stdcpp3.h"
- "/usr/local/include/boost/predef/library/std/stlport.h"
- "/usr/local/include/boost/predef/library/std/vacpp.h"
- "/usr/local/include/boost/predef/make.h"
- "/usr/local/include/boost/predef/os.h"
- "/usr/local/include/boost/predef/os/aix.h"
- "/usr/local/include/boost/predef/os/amigaos.h"
- "/usr/local/include/boost/predef/os/android.h"
- "/usr/local/include/boost/predef/os/beos.h"
- "/usr/local/include/boost/predef/os/bsd.h"
- "/usr/local/include/boost/predef/os/bsd/bsdi.h"
- "/usr/local/include/boost/predef/os/bsd/dragonfly.h"
- "/usr/local/include/boost/predef/os/bsd/free.h"
- "/usr/local/include/boost/predef/os/bsd/net.h"
- "/usr/local/include/boost/predef/os/bsd/open.h"
- "/usr/local/include/boost/predef/os/cygwin.h"
- "/usr/local/include/boost/predef/os/hpux.h"
- "/usr/local/include/boost/predef/os/ios.h"
- "/usr/local/include/boost/predef/os/irix.h"
- "/usr/local/include/boost/predef/os/linux.h"
- "/usr/local/include/boost/predef/os/macos.h"
- "/usr/local/include/boost/predef/os/os400.h"
- "/usr/local/include/boost/predef/os/qnxnto.h"
- "/usr/local/include/boost/predef/os/solaris.h"
- "/usr/local/include/boost/predef/os/unix.h"
- "/usr/local/include/boost/predef/os/vms.h"
- "/usr/local/include/boost/predef/os/windows.h"
- "/usr/local/include/boost/predef/other.h"
- "/usr/local/include/boost/predef/other/endian.h"
- "/usr/local/include/boost/predef/platform.h"
- "/usr/local/include/boost/predef/platform/mingw.h"
- "/usr/local/include/boost/predef/platform/windows_desktop.h"
- "/usr/local/include/boost/predef/platform/windows_phone.h"
- "/usr/local/include/boost/predef/platform/windows_runtime.h"
- "/usr/local/include/boost/predef/platform/windows_store.h"
- "/usr/local/include/boost/predef/version_number.h"
- "/usr/local/include/boost/scoped_ptr.hpp"
- "/usr/local/include/boost/shared_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp"
- "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp"
- "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp"
- "/usr/local/include/boost/smart_ptr/scoped_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/shared_ptr.hpp"
- "/usr/local/include/boost/throw_exception.hpp"
- "/usr/local/include/gflags/gflags.h"
- "/usr/local/include/gflags/gflags_declare.h"
- "/usr/local/include/glog/log_severity.h"
- "/usr/local/include/glog/logging.h"
- "/usr/local/include/glog/vlog_is_on.h"
-)
-
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
deleted file mode 100644
index d7dfae88..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_sigmoid_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend
deleted file mode 100644
index f9de6105..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_sigmoid_layer.cu.o.depend
+++ /dev/null
@@ -1,468 +0,0 @@
-# Generated by: make2cmake.cmake
-SET(CUDA_NVCC_DEPEND
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/blob.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/data_transformer.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/filler.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/internal_thread.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/layer_factory.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/loss_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/neuron_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/syncedmem.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/db.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/vision_layers.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/sigmoid_layer.cu"
- "/opt/clBLAS-private-april8/include/clBLAS-complex.h"
- "/opt/clBLAS-private-april8/include/clBLAS.h"
- "/usr/include/H5ACpublic.h"
- "/usr/include/H5Apublic.h"
- "/usr/include/H5Cpublic.h"
- "/usr/include/H5Dpublic.h"
- "/usr/include/H5Epubgen.h"
- "/usr/include/H5Epublic.h"
- "/usr/include/H5FDcore.h"
- "/usr/include/H5FDdirect.h"
- "/usr/include/H5FDfamily.h"
- "/usr/include/H5FDlog.h"
- "/usr/include/H5FDmpi.h"
- "/usr/include/H5FDmpio.h"
- "/usr/include/H5FDmpiposix.h"
- "/usr/include/H5FDmulti.h"
- "/usr/include/H5FDpublic.h"
- "/usr/include/H5FDsec2.h"
- "/usr/include/H5FDstdio.h"
- "/usr/include/H5Fpublic.h"
- "/usr/include/H5Gpublic.h"
- "/usr/include/H5Ipublic.h"
- "/usr/include/H5Lpublic.h"
- "/usr/include/H5MMpublic.h"
- "/usr/include/H5Opublic.h"
- "/usr/include/H5Ppublic.h"
- "/usr/include/H5Rpublic.h"
- "/usr/include/H5Spublic.h"
- "/usr/include/H5Tpublic.h"
- "/usr/include/H5Zpublic.h"
- "/usr/include/H5api_adpt.h"
- "/usr/include/H5pubconf.h"
- "/usr/include/H5public.h"
- "/usr/include/H5version.h"
- "/usr/include/_G_config.h"
- "/usr/include/alloca.h"
- "/usr/include/asm-generic/errno-base.h"
- "/usr/include/asm-generic/errno.h"
- "/usr/include/assert.h"
- "/usr/include/atlas/cblas.h"
- "/usr/include/c++/4.8/algorithm"
- "/usr/include/c++/4.8/backward/auto_ptr.h"
- "/usr/include/c++/4.8/backward/binders.h"
- "/usr/include/c++/4.8/bits/algorithmfwd.h"
- "/usr/include/c++/4.8/bits/allocator.h"
- "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h"
- "/usr/include/c++/4.8/bits/basic_ios.h"
- "/usr/include/c++/4.8/bits/basic_ios.tcc"
- "/usr/include/c++/4.8/bits/basic_string.h"
- "/usr/include/c++/4.8/bits/basic_string.tcc"
- "/usr/include/c++/4.8/bits/char_traits.h"
- "/usr/include/c++/4.8/bits/codecvt.h"
- "/usr/include/c++/4.8/bits/concept_check.h"
- "/usr/include/c++/4.8/bits/cpp_type_traits.h"
- "/usr/include/c++/4.8/bits/cxxabi_forced.h"
- "/usr/include/c++/4.8/bits/exception_defines.h"
- "/usr/include/c++/4.8/bits/fstream.tcc"
- "/usr/include/c++/4.8/bits/functexcept.h"
- "/usr/include/c++/4.8/bits/ios_base.h"
- "/usr/include/c++/4.8/bits/istream.tcc"
- "/usr/include/c++/4.8/bits/locale_classes.h"
- "/usr/include/c++/4.8/bits/locale_classes.tcc"
- "/usr/include/c++/4.8/bits/locale_facets.h"
- "/usr/include/c++/4.8/bits/locale_facets.tcc"
- "/usr/include/c++/4.8/bits/localefwd.h"
- "/usr/include/c++/4.8/bits/memoryfwd.h"
- "/usr/include/c++/4.8/bits/move.h"
- "/usr/include/c++/4.8/bits/ostream.tcc"
- "/usr/include/c++/4.8/bits/ostream_insert.h"
- "/usr/include/c++/4.8/bits/postypes.h"
- "/usr/include/c++/4.8/bits/range_access.h"
- "/usr/include/c++/4.8/bits/sstream.tcc"
- "/usr/include/c++/4.8/bits/stl_algo.h"
- "/usr/include/c++/4.8/bits/stl_algobase.h"
- "/usr/include/c++/4.8/bits/stl_bvector.h"
- "/usr/include/c++/4.8/bits/stl_construct.h"
- "/usr/include/c++/4.8/bits/stl_function.h"
- "/usr/include/c++/4.8/bits/stl_heap.h"
- "/usr/include/c++/4.8/bits/stl_iterator.h"
- "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h"
- "/usr/include/c++/4.8/bits/stl_iterator_base_types.h"
- "/usr/include/c++/4.8/bits/stl_map.h"
- "/usr/include/c++/4.8/bits/stl_multimap.h"
- "/usr/include/c++/4.8/bits/stl_multiset.h"
- "/usr/include/c++/4.8/bits/stl_pair.h"
- "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h"
- "/usr/include/c++/4.8/bits/stl_relops.h"
- "/usr/include/c++/4.8/bits/stl_set.h"
- "/usr/include/c++/4.8/bits/stl_tempbuf.h"
- "/usr/include/c++/4.8/bits/stl_tree.h"
- "/usr/include/c++/4.8/bits/stl_uninitialized.h"
- "/usr/include/c++/4.8/bits/stl_vector.h"
- "/usr/include/c++/4.8/bits/stream_iterator.h"
- "/usr/include/c++/4.8/bits/streambuf.tcc"
- "/usr/include/c++/4.8/bits/streambuf_iterator.h"
- "/usr/include/c++/4.8/bits/stringfwd.h"
- "/usr/include/c++/4.8/bits/vector.tcc"
- "/usr/include/c++/4.8/cctype"
- "/usr/include/c++/4.8/climits"
- "/usr/include/c++/4.8/clocale"
- "/usr/include/c++/4.8/cmath"
- "/usr/include/c++/4.8/cstddef"
- "/usr/include/c++/4.8/cstdio"
- "/usr/include/c++/4.8/cstdlib"
- "/usr/include/c++/4.8/cwchar"
- "/usr/include/c++/4.8/cwctype"
- "/usr/include/c++/4.8/cxxabi.h"
- "/usr/include/c++/4.8/debug/debug.h"
- "/usr/include/c++/4.8/exception"
- "/usr/include/c++/4.8/ext/alloc_traits.h"
- "/usr/include/c++/4.8/ext/atomicity.h"
- "/usr/include/c++/4.8/ext/new_allocator.h"
- "/usr/include/c++/4.8/ext/numeric_traits.h"
- "/usr/include/c++/4.8/ext/type_traits.h"
- "/usr/include/c++/4.8/fstream"
- "/usr/include/c++/4.8/functional"
- "/usr/include/c++/4.8/ios"
- "/usr/include/c++/4.8/iosfwd"
- "/usr/include/c++/4.8/iostream"
- "/usr/include/c++/4.8/istream"
- "/usr/include/c++/4.8/iterator"
- "/usr/include/c++/4.8/map"
- "/usr/include/c++/4.8/memory"
- "/usr/include/c++/4.8/new"
- "/usr/include/c++/4.8/ostream"
- "/usr/include/c++/4.8/set"
- "/usr/include/c++/4.8/sstream"
- "/usr/include/c++/4.8/streambuf"
- "/usr/include/c++/4.8/string"
- "/usr/include/c++/4.8/typeinfo"
- "/usr/include/c++/4.8/utility"
- "/usr/include/c++/4.8/vector"
- "/usr/include/ctype.h"
- "/usr/include/endian.h"
- "/usr/include/errno.h"
- "/usr/include/features.h"
- "/usr/include/getopt.h"
- "/usr/include/google/protobuf/descriptor.h"
- "/usr/include/google/protobuf/extension_set.h"
- "/usr/include/google/protobuf/generated_enum_reflection.h"
- "/usr/include/google/protobuf/generated_message_util.h"
- "/usr/include/google/protobuf/message.h"
- "/usr/include/google/protobuf/message_lite.h"
- "/usr/include/google/protobuf/repeated_field.h"
- "/usr/include/google/protobuf/stubs/common.h"
- "/usr/include/google/protobuf/stubs/template_util.h"
- "/usr/include/google/protobuf/stubs/type_traits.h"
- "/usr/include/google/protobuf/unknown_field_set.h"
- "/usr/include/hdf5.h"
- "/usr/include/inttypes.h"
- "/usr/include/libio.h"
- "/usr/include/limits.h"
- "/usr/include/linux/errno.h"
- "/usr/include/linux/limits.h"
- "/usr/include/locale.h"
- "/usr/include/math.h"
- "/usr/include/pthread.h"
- "/usr/include/sched.h"
- "/usr/include/stdc-predef.h"
- "/usr/include/stdint.h"
- "/usr/include/stdio.h"
- "/usr/include/stdlib.h"
- "/usr/include/string.h"
- "/usr/include/time.h"
- "/usr/include/unistd.h"
- "/usr/include/wchar.h"
- "/usr/include/wctype.h"
- "/usr/include/x86_64-linux-gnu/asm/errno.h"
- "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h"
- "/usr/include/x86_64-linux-gnu/bits/byteswap.h"
- "/usr/include/x86_64-linux-gnu/bits/confname.h"
- "/usr/include/x86_64-linux-gnu/bits/endian.h"
- "/usr/include/x86_64-linux-gnu/bits/environments.h"
- "/usr/include/x86_64-linux-gnu/bits/errno.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_val.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_valf.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_vall.h"
- "/usr/include/x86_64-linux-gnu/bits/inf.h"
- "/usr/include/x86_64-linux-gnu/bits/local_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/locale.h"
- "/usr/include/x86_64-linux-gnu/bits/mathcalls.h"
- "/usr/include/x86_64-linux-gnu/bits/mathdef.h"
- "/usr/include/x86_64-linux-gnu/bits/mathinline.h"
- "/usr/include/x86_64-linux-gnu/bits/nan.h"
- "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/posix_opt.h"
- "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h"
- "/usr/include/x86_64-linux-gnu/bits/sched.h"
- "/usr/include/x86_64-linux-gnu/bits/select.h"
- "/usr/include/x86_64-linux-gnu/bits/select2.h"
- "/usr/include/x86_64-linux-gnu/bits/setjmp.h"
- "/usr/include/x86_64-linux-gnu/bits/sigset.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio2.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib.h"
- "/usr/include/x86_64-linux-gnu/bits/string3.h"
- "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h"
- "/usr/include/x86_64-linux-gnu/bits/time.h"
- "/usr/include/x86_64-linux-gnu/bits/timex.h"
- "/usr/include/x86_64-linux-gnu/bits/types.h"
- "/usr/include/x86_64-linux-gnu/bits/typesizes.h"
- "/usr/include/x86_64-linux-gnu/bits/unistd.h"
- "/usr/include/x86_64-linux-gnu/bits/waitflags.h"
- "/usr/include/x86_64-linux-gnu/bits/waitstatus.h"
- "/usr/include/x86_64-linux-gnu/bits/wchar.h"
- "/usr/include/x86_64-linux-gnu/bits/wchar2.h"
- "/usr/include/x86_64-linux-gnu/bits/wordsize.h"
- "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h"
- "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h"
- "/usr/include/x86_64-linux-gnu/gnu/stubs.h"
- "/usr/include/x86_64-linux-gnu/sys/cdefs.h"
- "/usr/include/x86_64-linux-gnu/sys/select.h"
- "/usr/include/x86_64-linux-gnu/sys/sysmacros.h"
- "/usr/include/x86_64-linux-gnu/sys/types.h"
- "/usr/include/xlocale.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h"
- "/usr/local/cuda-6.5/include/CL/cl.h"
- "/usr/local/cuda-6.5/include/CL/cl_ext.h"
- "/usr/local/cuda-6.5/include/CL/cl_platform.h"
- "/usr/local/cuda-6.5/include/builtin_types.h"
- "/usr/local/cuda-6.5/include/channel_descriptor.h"
- "/usr/local/cuda-6.5/include/common_functions.h"
- "/usr/local/cuda-6.5/include/cuComplex.h"
- "/usr/local/cuda-6.5/include/cublas_api.h"
- "/usr/local/cuda-6.5/include/cublas_v2.h"
- "/usr/local/cuda-6.5/include/cuda.h"
- "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h"
- "/usr/local/cuda-6.5/include/cuda_runtime.h"
- "/usr/local/cuda-6.5/include/cuda_runtime_api.h"
- "/usr/local/cuda-6.5/include/cuda_surface_types.h"
- "/usr/local/cuda-6.5/include/cuda_texture_types.h"
- "/usr/local/cuda-6.5/include/curand.h"
- "/usr/local/cuda-6.5/include/device_functions.h"
- "/usr/local/cuda-6.5/include/device_launch_parameters.h"
- "/usr/local/cuda-6.5/include/device_types.h"
- "/usr/local/cuda-6.5/include/driver_functions.h"
- "/usr/local/cuda-6.5/include/driver_types.h"
- "/usr/local/cuda-6.5/include/host_config.h"
- "/usr/local/cuda-6.5/include/host_defines.h"
- "/usr/local/cuda-6.5/include/math_functions.h"
- "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h"
- "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_13_double_functions.h"
- "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_20_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_30_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_32_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_35_intrinsics.h"
- "/usr/local/cuda-6.5/include/surface_functions.h"
- "/usr/local/cuda-6.5/include/surface_indirect_functions.h"
- "/usr/local/cuda-6.5/include/surface_types.h"
- "/usr/local/cuda-6.5/include/texture_fetch_functions.h"
- "/usr/local/cuda-6.5/include/texture_indirect_functions.h"
- "/usr/local/cuda-6.5/include/texture_types.h"
- "/usr/local/cuda-6.5/include/vector_functions.h"
- "/usr/local/cuda-6.5/include/vector_types.h"
- "/usr/local/include/boost/assert.hpp"
- "/usr/local/include/boost/checked_delete.hpp"
- "/usr/local/include/boost/config.hpp"
- "/usr/local/include/boost/config/compiler/gcc.hpp"
- "/usr/local/include/boost/config/compiler/nvcc.hpp"
- "/usr/local/include/boost/config/no_tr1/memory.hpp"
- "/usr/local/include/boost/config/no_tr1/utility.hpp"
- "/usr/local/include/boost/config/platform/linux.hpp"
- "/usr/local/include/boost/config/posix_features.hpp"
- "/usr/local/include/boost/config/select_compiler_config.hpp"
- "/usr/local/include/boost/config/select_platform_config.hpp"
- "/usr/local/include/boost/config/select_stdlib_config.hpp"
- "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp"
- "/usr/local/include/boost/config/suffix.hpp"
- "/usr/local/include/boost/config/user.hpp"
- "/usr/local/include/boost/core/checked_delete.hpp"
- "/usr/local/include/boost/core/demangle.hpp"
- "/usr/local/include/boost/core/typeinfo.hpp"
- "/usr/local/include/boost/current_function.hpp"
- "/usr/local/include/boost/detail/sp_typeinfo.hpp"
- "/usr/local/include/boost/detail/workaround.hpp"
- "/usr/local/include/boost/exception/exception.hpp"
- "/usr/local/include/boost/predef.h"
- "/usr/local/include/boost/predef/architecture.h"
- "/usr/local/include/boost/predef/architecture/alpha.h"
- "/usr/local/include/boost/predef/architecture/arm.h"
- "/usr/local/include/boost/predef/architecture/blackfin.h"
- "/usr/local/include/boost/predef/architecture/convex.h"
- "/usr/local/include/boost/predef/architecture/ia64.h"
- "/usr/local/include/boost/predef/architecture/m68k.h"
- "/usr/local/include/boost/predef/architecture/mips.h"
- "/usr/local/include/boost/predef/architecture/parisc.h"
- "/usr/local/include/boost/predef/architecture/ppc.h"
- "/usr/local/include/boost/predef/architecture/pyramid.h"
- "/usr/local/include/boost/predef/architecture/rs6k.h"
- "/usr/local/include/boost/predef/architecture/sparc.h"
- "/usr/local/include/boost/predef/architecture/superh.h"
- "/usr/local/include/boost/predef/architecture/sys370.h"
- "/usr/local/include/boost/predef/architecture/sys390.h"
- "/usr/local/include/boost/predef/architecture/x86.h"
- "/usr/local/include/boost/predef/architecture/x86/32.h"
- "/usr/local/include/boost/predef/architecture/x86/64.h"
- "/usr/local/include/boost/predef/architecture/z.h"
- "/usr/local/include/boost/predef/compiler.h"
- "/usr/local/include/boost/predef/compiler/borland.h"
- "/usr/local/include/boost/predef/compiler/clang.h"
- "/usr/local/include/boost/predef/compiler/comeau.h"
- "/usr/local/include/boost/predef/compiler/compaq.h"
- "/usr/local/include/boost/predef/compiler/diab.h"
- "/usr/local/include/boost/predef/compiler/digitalmars.h"
- "/usr/local/include/boost/predef/compiler/dignus.h"
- "/usr/local/include/boost/predef/compiler/edg.h"
- "/usr/local/include/boost/predef/compiler/ekopath.h"
- "/usr/local/include/boost/predef/compiler/gcc.h"
- "/usr/local/include/boost/predef/compiler/gcc_xml.h"
- "/usr/local/include/boost/predef/compiler/greenhills.h"
- "/usr/local/include/boost/predef/compiler/hp_acc.h"
- "/usr/local/include/boost/predef/compiler/iar.h"
- "/usr/local/include/boost/predef/compiler/ibm.h"
- "/usr/local/include/boost/predef/compiler/intel.h"
- "/usr/local/include/boost/predef/compiler/kai.h"
- "/usr/local/include/boost/predef/compiler/llvm.h"
- "/usr/local/include/boost/predef/compiler/metaware.h"
- "/usr/local/include/boost/predef/compiler/metrowerks.h"
- "/usr/local/include/boost/predef/compiler/microtec.h"
- "/usr/local/include/boost/predef/compiler/mpw.h"
- "/usr/local/include/boost/predef/compiler/palm.h"
- "/usr/local/include/boost/predef/compiler/pgi.h"
- "/usr/local/include/boost/predef/compiler/sgi_mipspro.h"
- "/usr/local/include/boost/predef/compiler/sunpro.h"
- "/usr/local/include/boost/predef/compiler/tendra.h"
- "/usr/local/include/boost/predef/compiler/visualc.h"
- "/usr/local/include/boost/predef/compiler/watcom.h"
- "/usr/local/include/boost/predef/detail/_cassert.h"
- "/usr/local/include/boost/predef/detail/_exception.h"
- "/usr/local/include/boost/predef/detail/comp_detected.h"
- "/usr/local/include/boost/predef/detail/os_detected.h"
- "/usr/local/include/boost/predef/detail/test.h"
- "/usr/local/include/boost/predef/language.h"
- "/usr/local/include/boost/predef/language/objc.h"
- "/usr/local/include/boost/predef/language/stdc.h"
- "/usr/local/include/boost/predef/language/stdcpp.h"
- "/usr/local/include/boost/predef/library.h"
- "/usr/local/include/boost/predef/library/c.h"
- "/usr/local/include/boost/predef/library/c/_prefix.h"
- "/usr/local/include/boost/predef/library/c/gnu.h"
- "/usr/local/include/boost/predef/library/c/uc.h"
- "/usr/local/include/boost/predef/library/c/vms.h"
- "/usr/local/include/boost/predef/library/c/zos.h"
- "/usr/local/include/boost/predef/library/std.h"
- "/usr/local/include/boost/predef/library/std/_prefix.h"
- "/usr/local/include/boost/predef/library/std/cxx.h"
- "/usr/local/include/boost/predef/library/std/dinkumware.h"
- "/usr/local/include/boost/predef/library/std/libcomo.h"
- "/usr/local/include/boost/predef/library/std/modena.h"
- "/usr/local/include/boost/predef/library/std/msl.h"
- "/usr/local/include/boost/predef/library/std/roguewave.h"
- "/usr/local/include/boost/predef/library/std/sgi.h"
- "/usr/local/include/boost/predef/library/std/stdcpp3.h"
- "/usr/local/include/boost/predef/library/std/stlport.h"
- "/usr/local/include/boost/predef/library/std/vacpp.h"
- "/usr/local/include/boost/predef/make.h"
- "/usr/local/include/boost/predef/os.h"
- "/usr/local/include/boost/predef/os/aix.h"
- "/usr/local/include/boost/predef/os/amigaos.h"
- "/usr/local/include/boost/predef/os/android.h"
- "/usr/local/include/boost/predef/os/beos.h"
- "/usr/local/include/boost/predef/os/bsd.h"
- "/usr/local/include/boost/predef/os/bsd/bsdi.h"
- "/usr/local/include/boost/predef/os/bsd/dragonfly.h"
- "/usr/local/include/boost/predef/os/bsd/free.h"
- "/usr/local/include/boost/predef/os/bsd/net.h"
- "/usr/local/include/boost/predef/os/bsd/open.h"
- "/usr/local/include/boost/predef/os/cygwin.h"
- "/usr/local/include/boost/predef/os/hpux.h"
- "/usr/local/include/boost/predef/os/ios.h"
- "/usr/local/include/boost/predef/os/irix.h"
- "/usr/local/include/boost/predef/os/linux.h"
- "/usr/local/include/boost/predef/os/macos.h"
- "/usr/local/include/boost/predef/os/os400.h"
- "/usr/local/include/boost/predef/os/qnxnto.h"
- "/usr/local/include/boost/predef/os/solaris.h"
- "/usr/local/include/boost/predef/os/unix.h"
- "/usr/local/include/boost/predef/os/vms.h"
- "/usr/local/include/boost/predef/os/windows.h"
- "/usr/local/include/boost/predef/other.h"
- "/usr/local/include/boost/predef/other/endian.h"
- "/usr/local/include/boost/predef/platform.h"
- "/usr/local/include/boost/predef/platform/mingw.h"
- "/usr/local/include/boost/predef/platform/windows_desktop.h"
- "/usr/local/include/boost/predef/platform/windows_phone.h"
- "/usr/local/include/boost/predef/platform/windows_runtime.h"
- "/usr/local/include/boost/predef/platform/windows_store.h"
- "/usr/local/include/boost/predef/version_number.h"
- "/usr/local/include/boost/scoped_ptr.hpp"
- "/usr/local/include/boost/shared_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp"
- "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp"
- "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp"
- "/usr/local/include/boost/smart_ptr/scoped_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/shared_ptr.hpp"
- "/usr/local/include/boost/throw_exception.hpp"
- "/usr/local/include/gflags/gflags.h"
- "/usr/local/include/gflags/gflags_declare.h"
- "/usr/local/include/glog/log_severity.h"
- "/usr/local/include/glog/logging.h"
- "/usr/local/include/glog/vlog_is_on.h"
-)
-
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
deleted file mode 100644
index dd2453ae..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/silence_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_silence_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_silence_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
deleted file mode 100644
index 990e0622..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/slice_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_slice_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_slice_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
deleted file mode 100644
index ebf29ea2..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/softmax_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
deleted file mode 100644
index 6260b6e0..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/softmax_loss_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_softmax_loss_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_softmax_loss_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
deleted file mode 100644
index ad49afe7..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/split_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_split_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_split_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
deleted file mode 100644
index 71fc8fdb..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/tanh_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_tanh_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_tanh_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
deleted file mode 100644
index 4e18059a..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/layers/cufiles/threshold_layer.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/./cuda_compile_generated_threshold_layer.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/layers/cufiles/cuda_compile_generated_threshold_layer.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
deleted file mode 100644
index 8de5e27c..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_im2col.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend
deleted file mode 100644
index 36db02fe..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_im2col.cu.o.depend
+++ /dev/null
@@ -1,404 +0,0 @@
-# Generated by: make2cmake.cmake
-SET(CUDA_NVCC_DEPEND
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/im2col.cu"
- "/opt/clBLAS-private-april8/include/clBLAS-complex.h"
- "/opt/clBLAS-private-april8/include/clBLAS.h"
- "/usr/include/_G_config.h"
- "/usr/include/alloca.h"
- "/usr/include/asm-generic/errno-base.h"
- "/usr/include/asm-generic/errno.h"
- "/usr/include/assert.h"
- "/usr/include/c++/4.8/algorithm"
- "/usr/include/c++/4.8/backward/auto_ptr.h"
- "/usr/include/c++/4.8/backward/binders.h"
- "/usr/include/c++/4.8/bits/algorithmfwd.h"
- "/usr/include/c++/4.8/bits/allocator.h"
- "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h"
- "/usr/include/c++/4.8/bits/basic_ios.h"
- "/usr/include/c++/4.8/bits/basic_ios.tcc"
- "/usr/include/c++/4.8/bits/basic_string.h"
- "/usr/include/c++/4.8/bits/basic_string.tcc"
- "/usr/include/c++/4.8/bits/char_traits.h"
- "/usr/include/c++/4.8/bits/codecvt.h"
- "/usr/include/c++/4.8/bits/concept_check.h"
- "/usr/include/c++/4.8/bits/cpp_type_traits.h"
- "/usr/include/c++/4.8/bits/cxxabi_forced.h"
- "/usr/include/c++/4.8/bits/exception_defines.h"
- "/usr/include/c++/4.8/bits/fstream.tcc"
- "/usr/include/c++/4.8/bits/functexcept.h"
- "/usr/include/c++/4.8/bits/ios_base.h"
- "/usr/include/c++/4.8/bits/istream.tcc"
- "/usr/include/c++/4.8/bits/locale_classes.h"
- "/usr/include/c++/4.8/bits/locale_classes.tcc"
- "/usr/include/c++/4.8/bits/locale_facets.h"
- "/usr/include/c++/4.8/bits/locale_facets.tcc"
- "/usr/include/c++/4.8/bits/localefwd.h"
- "/usr/include/c++/4.8/bits/memoryfwd.h"
- "/usr/include/c++/4.8/bits/move.h"
- "/usr/include/c++/4.8/bits/ostream.tcc"
- "/usr/include/c++/4.8/bits/ostream_insert.h"
- "/usr/include/c++/4.8/bits/postypes.h"
- "/usr/include/c++/4.8/bits/range_access.h"
- "/usr/include/c++/4.8/bits/sstream.tcc"
- "/usr/include/c++/4.8/bits/stl_algo.h"
- "/usr/include/c++/4.8/bits/stl_algobase.h"
- "/usr/include/c++/4.8/bits/stl_bvector.h"
- "/usr/include/c++/4.8/bits/stl_construct.h"
- "/usr/include/c++/4.8/bits/stl_function.h"
- "/usr/include/c++/4.8/bits/stl_heap.h"
- "/usr/include/c++/4.8/bits/stl_iterator.h"
- "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h"
- "/usr/include/c++/4.8/bits/stl_iterator_base_types.h"
- "/usr/include/c++/4.8/bits/stl_map.h"
- "/usr/include/c++/4.8/bits/stl_multimap.h"
- "/usr/include/c++/4.8/bits/stl_multiset.h"
- "/usr/include/c++/4.8/bits/stl_pair.h"
- "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h"
- "/usr/include/c++/4.8/bits/stl_relops.h"
- "/usr/include/c++/4.8/bits/stl_set.h"
- "/usr/include/c++/4.8/bits/stl_tempbuf.h"
- "/usr/include/c++/4.8/bits/stl_tree.h"
- "/usr/include/c++/4.8/bits/stl_uninitialized.h"
- "/usr/include/c++/4.8/bits/stl_vector.h"
- "/usr/include/c++/4.8/bits/streambuf.tcc"
- "/usr/include/c++/4.8/bits/streambuf_iterator.h"
- "/usr/include/c++/4.8/bits/stringfwd.h"
- "/usr/include/c++/4.8/bits/vector.tcc"
- "/usr/include/c++/4.8/cctype"
- "/usr/include/c++/4.8/climits"
- "/usr/include/c++/4.8/clocale"
- "/usr/include/c++/4.8/cmath"
- "/usr/include/c++/4.8/cstddef"
- "/usr/include/c++/4.8/cstdio"
- "/usr/include/c++/4.8/cstdlib"
- "/usr/include/c++/4.8/cstring"
- "/usr/include/c++/4.8/cwchar"
- "/usr/include/c++/4.8/cwctype"
- "/usr/include/c++/4.8/cxxabi.h"
- "/usr/include/c++/4.8/debug/debug.h"
- "/usr/include/c++/4.8/exception"
- "/usr/include/c++/4.8/ext/alloc_traits.h"
- "/usr/include/c++/4.8/ext/atomicity.h"
- "/usr/include/c++/4.8/ext/new_allocator.h"
- "/usr/include/c++/4.8/ext/numeric_traits.h"
- "/usr/include/c++/4.8/ext/type_traits.h"
- "/usr/include/c++/4.8/fstream"
- "/usr/include/c++/4.8/functional"
- "/usr/include/c++/4.8/ios"
- "/usr/include/c++/4.8/iosfwd"
- "/usr/include/c++/4.8/iostream"
- "/usr/include/c++/4.8/istream"
- "/usr/include/c++/4.8/map"
- "/usr/include/c++/4.8/memory"
- "/usr/include/c++/4.8/new"
- "/usr/include/c++/4.8/ostream"
- "/usr/include/c++/4.8/set"
- "/usr/include/c++/4.8/sstream"
- "/usr/include/c++/4.8/streambuf"
- "/usr/include/c++/4.8/string"
- "/usr/include/c++/4.8/typeinfo"
- "/usr/include/c++/4.8/utility"
- "/usr/include/c++/4.8/vector"
- "/usr/include/ctype.h"
- "/usr/include/endian.h"
- "/usr/include/errno.h"
- "/usr/include/features.h"
- "/usr/include/getopt.h"
- "/usr/include/inttypes.h"
- "/usr/include/libio.h"
- "/usr/include/limits.h"
- "/usr/include/linux/errno.h"
- "/usr/include/linux/limits.h"
- "/usr/include/locale.h"
- "/usr/include/math.h"
- "/usr/include/pthread.h"
- "/usr/include/sched.h"
- "/usr/include/stdc-predef.h"
- "/usr/include/stdint.h"
- "/usr/include/stdio.h"
- "/usr/include/stdlib.h"
- "/usr/include/string.h"
- "/usr/include/time.h"
- "/usr/include/unistd.h"
- "/usr/include/wchar.h"
- "/usr/include/wctype.h"
- "/usr/include/x86_64-linux-gnu/asm/errno.h"
- "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h"
- "/usr/include/x86_64-linux-gnu/bits/byteswap.h"
- "/usr/include/x86_64-linux-gnu/bits/confname.h"
- "/usr/include/x86_64-linux-gnu/bits/endian.h"
- "/usr/include/x86_64-linux-gnu/bits/environments.h"
- "/usr/include/x86_64-linux-gnu/bits/errno.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_val.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_valf.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_vall.h"
- "/usr/include/x86_64-linux-gnu/bits/inf.h"
- "/usr/include/x86_64-linux-gnu/bits/local_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/locale.h"
- "/usr/include/x86_64-linux-gnu/bits/mathcalls.h"
- "/usr/include/x86_64-linux-gnu/bits/mathdef.h"
- "/usr/include/x86_64-linux-gnu/bits/mathinline.h"
- "/usr/include/x86_64-linux-gnu/bits/nan.h"
- "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/posix_opt.h"
- "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h"
- "/usr/include/x86_64-linux-gnu/bits/sched.h"
- "/usr/include/x86_64-linux-gnu/bits/select.h"
- "/usr/include/x86_64-linux-gnu/bits/select2.h"
- "/usr/include/x86_64-linux-gnu/bits/setjmp.h"
- "/usr/include/x86_64-linux-gnu/bits/sigset.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio2.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib.h"
- "/usr/include/x86_64-linux-gnu/bits/string3.h"
- "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h"
- "/usr/include/x86_64-linux-gnu/bits/time.h"
- "/usr/include/x86_64-linux-gnu/bits/timex.h"
- "/usr/include/x86_64-linux-gnu/bits/types.h"
- "/usr/include/x86_64-linux-gnu/bits/typesizes.h"
- "/usr/include/x86_64-linux-gnu/bits/unistd.h"
- "/usr/include/x86_64-linux-gnu/bits/waitflags.h"
- "/usr/include/x86_64-linux-gnu/bits/waitstatus.h"
- "/usr/include/x86_64-linux-gnu/bits/wchar.h"
- "/usr/include/x86_64-linux-gnu/bits/wchar2.h"
- "/usr/include/x86_64-linux-gnu/bits/wordsize.h"
- "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h"
- "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h"
- "/usr/include/x86_64-linux-gnu/gnu/stubs.h"
- "/usr/include/x86_64-linux-gnu/sys/cdefs.h"
- "/usr/include/x86_64-linux-gnu/sys/select.h"
- "/usr/include/x86_64-linux-gnu/sys/sysmacros.h"
- "/usr/include/x86_64-linux-gnu/sys/types.h"
- "/usr/include/xlocale.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h"
- "/usr/local/cuda-6.5/include/CL/cl.h"
- "/usr/local/cuda-6.5/include/CL/cl_ext.h"
- "/usr/local/cuda-6.5/include/CL/cl_platform.h"
- "/usr/local/cuda-6.5/include/builtin_types.h"
- "/usr/local/cuda-6.5/include/channel_descriptor.h"
- "/usr/local/cuda-6.5/include/common_functions.h"
- "/usr/local/cuda-6.5/include/cuComplex.h"
- "/usr/local/cuda-6.5/include/cublas_api.h"
- "/usr/local/cuda-6.5/include/cublas_v2.h"
- "/usr/local/cuda-6.5/include/cuda.h"
- "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h"
- "/usr/local/cuda-6.5/include/cuda_runtime.h"
- "/usr/local/cuda-6.5/include/cuda_runtime_api.h"
- "/usr/local/cuda-6.5/include/cuda_surface_types.h"
- "/usr/local/cuda-6.5/include/cuda_texture_types.h"
- "/usr/local/cuda-6.5/include/curand.h"
- "/usr/local/cuda-6.5/include/device_functions.h"
- "/usr/local/cuda-6.5/include/device_launch_parameters.h"
- "/usr/local/cuda-6.5/include/device_types.h"
- "/usr/local/cuda-6.5/include/driver_functions.h"
- "/usr/local/cuda-6.5/include/driver_types.h"
- "/usr/local/cuda-6.5/include/host_config.h"
- "/usr/local/cuda-6.5/include/host_defines.h"
- "/usr/local/cuda-6.5/include/math_functions.h"
- "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h"
- "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_13_double_functions.h"
- "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_20_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_30_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_32_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_35_intrinsics.h"
- "/usr/local/cuda-6.5/include/surface_functions.h"
- "/usr/local/cuda-6.5/include/surface_indirect_functions.h"
- "/usr/local/cuda-6.5/include/surface_types.h"
- "/usr/local/cuda-6.5/include/texture_fetch_functions.h"
- "/usr/local/cuda-6.5/include/texture_indirect_functions.h"
- "/usr/local/cuda-6.5/include/texture_types.h"
- "/usr/local/cuda-6.5/include/vector_functions.h"
- "/usr/local/cuda-6.5/include/vector_types.h"
- "/usr/local/include/boost/assert.hpp"
- "/usr/local/include/boost/checked_delete.hpp"
- "/usr/local/include/boost/config.hpp"
- "/usr/local/include/boost/config/compiler/gcc.hpp"
- "/usr/local/include/boost/config/compiler/nvcc.hpp"
- "/usr/local/include/boost/config/no_tr1/memory.hpp"
- "/usr/local/include/boost/config/no_tr1/utility.hpp"
- "/usr/local/include/boost/config/platform/linux.hpp"
- "/usr/local/include/boost/config/posix_features.hpp"
- "/usr/local/include/boost/config/select_compiler_config.hpp"
- "/usr/local/include/boost/config/select_platform_config.hpp"
- "/usr/local/include/boost/config/select_stdlib_config.hpp"
- "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp"
- "/usr/local/include/boost/config/suffix.hpp"
- "/usr/local/include/boost/config/user.hpp"
- "/usr/local/include/boost/core/checked_delete.hpp"
- "/usr/local/include/boost/core/demangle.hpp"
- "/usr/local/include/boost/core/typeinfo.hpp"
- "/usr/local/include/boost/current_function.hpp"
- "/usr/local/include/boost/detail/sp_typeinfo.hpp"
- "/usr/local/include/boost/detail/workaround.hpp"
- "/usr/local/include/boost/exception/exception.hpp"
- "/usr/local/include/boost/predef.h"
- "/usr/local/include/boost/predef/architecture.h"
- "/usr/local/include/boost/predef/architecture/alpha.h"
- "/usr/local/include/boost/predef/architecture/arm.h"
- "/usr/local/include/boost/predef/architecture/blackfin.h"
- "/usr/local/include/boost/predef/architecture/convex.h"
- "/usr/local/include/boost/predef/architecture/ia64.h"
- "/usr/local/include/boost/predef/architecture/m68k.h"
- "/usr/local/include/boost/predef/architecture/mips.h"
- "/usr/local/include/boost/predef/architecture/parisc.h"
- "/usr/local/include/boost/predef/architecture/ppc.h"
- "/usr/local/include/boost/predef/architecture/pyramid.h"
- "/usr/local/include/boost/predef/architecture/rs6k.h"
- "/usr/local/include/boost/predef/architecture/sparc.h"
- "/usr/local/include/boost/predef/architecture/superh.h"
- "/usr/local/include/boost/predef/architecture/sys370.h"
- "/usr/local/include/boost/predef/architecture/sys390.h"
- "/usr/local/include/boost/predef/architecture/x86.h"
- "/usr/local/include/boost/predef/architecture/x86/32.h"
- "/usr/local/include/boost/predef/architecture/x86/64.h"
- "/usr/local/include/boost/predef/architecture/z.h"
- "/usr/local/include/boost/predef/compiler.h"
- "/usr/local/include/boost/predef/compiler/borland.h"
- "/usr/local/include/boost/predef/compiler/clang.h"
- "/usr/local/include/boost/predef/compiler/comeau.h"
- "/usr/local/include/boost/predef/compiler/compaq.h"
- "/usr/local/include/boost/predef/compiler/diab.h"
- "/usr/local/include/boost/predef/compiler/digitalmars.h"
- "/usr/local/include/boost/predef/compiler/dignus.h"
- "/usr/local/include/boost/predef/compiler/edg.h"
- "/usr/local/include/boost/predef/compiler/ekopath.h"
- "/usr/local/include/boost/predef/compiler/gcc.h"
- "/usr/local/include/boost/predef/compiler/gcc_xml.h"
- "/usr/local/include/boost/predef/compiler/greenhills.h"
- "/usr/local/include/boost/predef/compiler/hp_acc.h"
- "/usr/local/include/boost/predef/compiler/iar.h"
- "/usr/local/include/boost/predef/compiler/ibm.h"
- "/usr/local/include/boost/predef/compiler/intel.h"
- "/usr/local/include/boost/predef/compiler/kai.h"
- "/usr/local/include/boost/predef/compiler/llvm.h"
- "/usr/local/include/boost/predef/compiler/metaware.h"
- "/usr/local/include/boost/predef/compiler/metrowerks.h"
- "/usr/local/include/boost/predef/compiler/microtec.h"
- "/usr/local/include/boost/predef/compiler/mpw.h"
- "/usr/local/include/boost/predef/compiler/palm.h"
- "/usr/local/include/boost/predef/compiler/pgi.h"
- "/usr/local/include/boost/predef/compiler/sgi_mipspro.h"
- "/usr/local/include/boost/predef/compiler/sunpro.h"
- "/usr/local/include/boost/predef/compiler/tendra.h"
- "/usr/local/include/boost/predef/compiler/visualc.h"
- "/usr/local/include/boost/predef/compiler/watcom.h"
- "/usr/local/include/boost/predef/detail/_cassert.h"
- "/usr/local/include/boost/predef/detail/_exception.h"
- "/usr/local/include/boost/predef/detail/comp_detected.h"
- "/usr/local/include/boost/predef/detail/os_detected.h"
- "/usr/local/include/boost/predef/detail/test.h"
- "/usr/local/include/boost/predef/language.h"
- "/usr/local/include/boost/predef/language/objc.h"
- "/usr/local/include/boost/predef/language/stdc.h"
- "/usr/local/include/boost/predef/language/stdcpp.h"
- "/usr/local/include/boost/predef/library.h"
- "/usr/local/include/boost/predef/library/c.h"
- "/usr/local/include/boost/predef/library/c/_prefix.h"
- "/usr/local/include/boost/predef/library/c/gnu.h"
- "/usr/local/include/boost/predef/library/c/uc.h"
- "/usr/local/include/boost/predef/library/c/vms.h"
- "/usr/local/include/boost/predef/library/c/zos.h"
- "/usr/local/include/boost/predef/library/std.h"
- "/usr/local/include/boost/predef/library/std/_prefix.h"
- "/usr/local/include/boost/predef/library/std/cxx.h"
- "/usr/local/include/boost/predef/library/std/dinkumware.h"
- "/usr/local/include/boost/predef/library/std/libcomo.h"
- "/usr/local/include/boost/predef/library/std/modena.h"
- "/usr/local/include/boost/predef/library/std/msl.h"
- "/usr/local/include/boost/predef/library/std/roguewave.h"
- "/usr/local/include/boost/predef/library/std/sgi.h"
- "/usr/local/include/boost/predef/library/std/stdcpp3.h"
- "/usr/local/include/boost/predef/library/std/stlport.h"
- "/usr/local/include/boost/predef/library/std/vacpp.h"
- "/usr/local/include/boost/predef/make.h"
- "/usr/local/include/boost/predef/os.h"
- "/usr/local/include/boost/predef/os/aix.h"
- "/usr/local/include/boost/predef/os/amigaos.h"
- "/usr/local/include/boost/predef/os/android.h"
- "/usr/local/include/boost/predef/os/beos.h"
- "/usr/local/include/boost/predef/os/bsd.h"
- "/usr/local/include/boost/predef/os/bsd/bsdi.h"
- "/usr/local/include/boost/predef/os/bsd/dragonfly.h"
- "/usr/local/include/boost/predef/os/bsd/free.h"
- "/usr/local/include/boost/predef/os/bsd/net.h"
- "/usr/local/include/boost/predef/os/bsd/open.h"
- "/usr/local/include/boost/predef/os/cygwin.h"
- "/usr/local/include/boost/predef/os/hpux.h"
- "/usr/local/include/boost/predef/os/ios.h"
- "/usr/local/include/boost/predef/os/irix.h"
- "/usr/local/include/boost/predef/os/linux.h"
- "/usr/local/include/boost/predef/os/macos.h"
- "/usr/local/include/boost/predef/os/os400.h"
- "/usr/local/include/boost/predef/os/qnxnto.h"
- "/usr/local/include/boost/predef/os/solaris.h"
- "/usr/local/include/boost/predef/os/unix.h"
- "/usr/local/include/boost/predef/os/vms.h"
- "/usr/local/include/boost/predef/os/windows.h"
- "/usr/local/include/boost/predef/other.h"
- "/usr/local/include/boost/predef/other/endian.h"
- "/usr/local/include/boost/predef/platform.h"
- "/usr/local/include/boost/predef/platform/mingw.h"
- "/usr/local/include/boost/predef/platform/windows_desktop.h"
- "/usr/local/include/boost/predef/platform/windows_phone.h"
- "/usr/local/include/boost/predef/platform/windows_runtime.h"
- "/usr/local/include/boost/predef/platform/windows_store.h"
- "/usr/local/include/boost/predef/version_number.h"
- "/usr/local/include/boost/shared_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp"
- "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp"
- "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp"
- "/usr/local/include/boost/smart_ptr/shared_ptr.hpp"
- "/usr/local/include/boost/throw_exception.hpp"
- "/usr/local/include/gflags/gflags.h"
- "/usr/local/include/gflags/gflags_declare.h"
- "/usr/local/include/glog/log_severity.h"
- "/usr/local/include/glog/logging.h"
- "/usr/local/include/glog/vlog_is_on.h"
-)
-
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
deleted file mode 100644
index 0bd0d4e9..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/cuda_compile.dir/util/./cuda_compile_generated_math_functions.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend b/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend
deleted file mode 100644
index 2dfb589a..00000000
--- a/src/caffe/CMakeFiles/cuda_compile.dir/util/cuda_compile_generated_math_functions.cu.o.depend
+++ /dev/null
@@ -1,744 +0,0 @@
-# Generated by: make2cmake.cmake
-SET(CUDA_NVCC_DEPEND
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/common.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/device.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/device_alternate.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/im2col.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/math_functions.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/mkl_alternate.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_util.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/util/ocl_wrapper.hpp"
- "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/util/math_functions.cu"
- "/opt/clBLAS-private-april8/include/clBLAS-complex.h"
- "/opt/clBLAS-private-april8/include/clBLAS.h"
- "/usr/include/_G_config.h"
- "/usr/include/alloca.h"
- "/usr/include/asm-generic/errno-base.h"
- "/usr/include/asm-generic/errno.h"
- "/usr/include/assert.h"
- "/usr/include/atlas/cblas.h"
- "/usr/include/c++/4.8/algorithm"
- "/usr/include/c++/4.8/backward/auto_ptr.h"
- "/usr/include/c++/4.8/backward/binders.h"
- "/usr/include/c++/4.8/bits/algorithmfwd.h"
- "/usr/include/c++/4.8/bits/allocator.h"
- "/usr/include/c++/4.8/bits/atomic_lockfree_defines.h"
- "/usr/include/c++/4.8/bits/basic_ios.h"
- "/usr/include/c++/4.8/bits/basic_ios.tcc"
- "/usr/include/c++/4.8/bits/basic_string.h"
- "/usr/include/c++/4.8/bits/basic_string.tcc"
- "/usr/include/c++/4.8/bits/char_traits.h"
- "/usr/include/c++/4.8/bits/codecvt.h"
- "/usr/include/c++/4.8/bits/concept_check.h"
- "/usr/include/c++/4.8/bits/cpp_type_traits.h"
- "/usr/include/c++/4.8/bits/cxxabi_forced.h"
- "/usr/include/c++/4.8/bits/exception_defines.h"
- "/usr/include/c++/4.8/bits/fstream.tcc"
- "/usr/include/c++/4.8/bits/functexcept.h"
- "/usr/include/c++/4.8/bits/ios_base.h"
- "/usr/include/c++/4.8/bits/istream.tcc"
- "/usr/include/c++/4.8/bits/locale_classes.h"
- "/usr/include/c++/4.8/bits/locale_classes.tcc"
- "/usr/include/c++/4.8/bits/locale_facets.h"
- "/usr/include/c++/4.8/bits/locale_facets.tcc"
- "/usr/include/c++/4.8/bits/localefwd.h"
- "/usr/include/c++/4.8/bits/memoryfwd.h"
- "/usr/include/c++/4.8/bits/move.h"
- "/usr/include/c++/4.8/bits/ostream.tcc"
- "/usr/include/c++/4.8/bits/ostream_insert.h"
- "/usr/include/c++/4.8/bits/postypes.h"
- "/usr/include/c++/4.8/bits/range_access.h"
- "/usr/include/c++/4.8/bits/sstream.tcc"
- "/usr/include/c++/4.8/bits/stl_algo.h"
- "/usr/include/c++/4.8/bits/stl_algobase.h"
- "/usr/include/c++/4.8/bits/stl_bvector.h"
- "/usr/include/c++/4.8/bits/stl_construct.h"
- "/usr/include/c++/4.8/bits/stl_function.h"
- "/usr/include/c++/4.8/bits/stl_heap.h"
- "/usr/include/c++/4.8/bits/stl_iterator.h"
- "/usr/include/c++/4.8/bits/stl_iterator_base_funcs.h"
- "/usr/include/c++/4.8/bits/stl_iterator_base_types.h"
- "/usr/include/c++/4.8/bits/stl_map.h"
- "/usr/include/c++/4.8/bits/stl_multimap.h"
- "/usr/include/c++/4.8/bits/stl_multiset.h"
- "/usr/include/c++/4.8/bits/stl_pair.h"
- "/usr/include/c++/4.8/bits/stl_raw_storage_iter.h"
- "/usr/include/c++/4.8/bits/stl_relops.h"
- "/usr/include/c++/4.8/bits/stl_set.h"
- "/usr/include/c++/4.8/bits/stl_tempbuf.h"
- "/usr/include/c++/4.8/bits/stl_tree.h"
- "/usr/include/c++/4.8/bits/stl_uninitialized.h"
- "/usr/include/c++/4.8/bits/stl_vector.h"
- "/usr/include/c++/4.8/bits/stream_iterator.h"
- "/usr/include/c++/4.8/bits/streambuf.tcc"
- "/usr/include/c++/4.8/bits/streambuf_iterator.h"
- "/usr/include/c++/4.8/bits/stringfwd.h"
- "/usr/include/c++/4.8/bits/vector.tcc"
- "/usr/include/c++/4.8/cctype"
- "/usr/include/c++/4.8/climits"
- "/usr/include/c++/4.8/clocale"
- "/usr/include/c++/4.8/cmath"
- "/usr/include/c++/4.8/cstddef"
- "/usr/include/c++/4.8/cstdio"
- "/usr/include/c++/4.8/cstdlib"
- "/usr/include/c++/4.8/cstring"
- "/usr/include/c++/4.8/cwchar"
- "/usr/include/c++/4.8/cwctype"
- "/usr/include/c++/4.8/cxxabi.h"
- "/usr/include/c++/4.8/debug/debug.h"
- "/usr/include/c++/4.8/exception"
- "/usr/include/c++/4.8/ext/alloc_traits.h"
- "/usr/include/c++/4.8/ext/atomicity.h"
- "/usr/include/c++/4.8/ext/new_allocator.h"
- "/usr/include/c++/4.8/ext/numeric_traits.h"
- "/usr/include/c++/4.8/ext/type_traits.h"
- "/usr/include/c++/4.8/fstream"
- "/usr/include/c++/4.8/functional"
- "/usr/include/c++/4.8/ios"
- "/usr/include/c++/4.8/iosfwd"
- "/usr/include/c++/4.8/iostream"
- "/usr/include/c++/4.8/istream"
- "/usr/include/c++/4.8/iterator"
- "/usr/include/c++/4.8/limits"
- "/usr/include/c++/4.8/map"
- "/usr/include/c++/4.8/memory"
- "/usr/include/c++/4.8/new"
- "/usr/include/c++/4.8/ostream"
- "/usr/include/c++/4.8/set"
- "/usr/include/c++/4.8/sstream"
- "/usr/include/c++/4.8/stdexcept"
- "/usr/include/c++/4.8/streambuf"
- "/usr/include/c++/4.8/string"
- "/usr/include/c++/4.8/typeinfo"
- "/usr/include/c++/4.8/utility"
- "/usr/include/c++/4.8/vector"
- "/usr/include/ctype.h"
- "/usr/include/endian.h"
- "/usr/include/errno.h"
- "/usr/include/features.h"
- "/usr/include/getopt.h"
- "/usr/include/inttypes.h"
- "/usr/include/libio.h"
- "/usr/include/limits.h"
- "/usr/include/linux/errno.h"
- "/usr/include/linux/limits.h"
- "/usr/include/locale.h"
- "/usr/include/math.h"
- "/usr/include/pthread.h"
- "/usr/include/sched.h"
- "/usr/include/stdc-predef.h"
- "/usr/include/stdint.h"
- "/usr/include/stdio.h"
- "/usr/include/stdlib.h"
- "/usr/include/string.h"
- "/usr/include/time.h"
- "/usr/include/unistd.h"
- "/usr/include/wchar.h"
- "/usr/include/wctype.h"
- "/usr/include/x86_64-linux-gnu/asm/errno.h"
- "/usr/include/x86_64-linux-gnu/bits/byteswap-16.h"
- "/usr/include/x86_64-linux-gnu/bits/byteswap.h"
- "/usr/include/x86_64-linux-gnu/bits/confname.h"
- "/usr/include/x86_64-linux-gnu/bits/endian.h"
- "/usr/include/x86_64-linux-gnu/bits/environments.h"
- "/usr/include/x86_64-linux-gnu/bits/errno.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_val.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_valf.h"
- "/usr/include/x86_64-linux-gnu/bits/huge_vall.h"
- "/usr/include/x86_64-linux-gnu/bits/inf.h"
- "/usr/include/x86_64-linux-gnu/bits/local_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/locale.h"
- "/usr/include/x86_64-linux-gnu/bits/mathcalls.h"
- "/usr/include/x86_64-linux-gnu/bits/mathdef.h"
- "/usr/include/x86_64-linux-gnu/bits/mathinline.h"
- "/usr/include/x86_64-linux-gnu/bits/nan.h"
- "/usr/include/x86_64-linux-gnu/bits/posix1_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/posix2_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/posix_opt.h"
- "/usr/include/x86_64-linux-gnu/bits/pthreadtypes.h"
- "/usr/include/x86_64-linux-gnu/bits/sched.h"
- "/usr/include/x86_64-linux-gnu/bits/select.h"
- "/usr/include/x86_64-linux-gnu/bits/select2.h"
- "/usr/include/x86_64-linux-gnu/bits/setjmp.h"
- "/usr/include/x86_64-linux-gnu/bits/sigset.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio2.h"
- "/usr/include/x86_64-linux-gnu/bits/stdio_lim.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib-bsearch.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib-float.h"
- "/usr/include/x86_64-linux-gnu/bits/stdlib.h"
- "/usr/include/x86_64-linux-gnu/bits/string3.h"
- "/usr/include/x86_64-linux-gnu/bits/sys_errlist.h"
- "/usr/include/x86_64-linux-gnu/bits/time.h"
- "/usr/include/x86_64-linux-gnu/bits/timex.h"
- "/usr/include/x86_64-linux-gnu/bits/types.h"
- "/usr/include/x86_64-linux-gnu/bits/typesizes.h"
- "/usr/include/x86_64-linux-gnu/bits/unistd.h"
- "/usr/include/x86_64-linux-gnu/bits/waitflags.h"
- "/usr/include/x86_64-linux-gnu/bits/waitstatus.h"
- "/usr/include/x86_64-linux-gnu/bits/wchar.h"
- "/usr/include/x86_64-linux-gnu/bits/wchar2.h"
- "/usr/include/x86_64-linux-gnu/bits/wordsize.h"
- "/usr/include/x86_64-linux-gnu/bits/xopen_lim.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/atomic_word.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/basic_file.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++allocator.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++config.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++io.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/c++locale.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cpu_defines.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_base.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/ctype_inline.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/cxxabi_tweaks.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr-default.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/gthr.h"
- "/usr/include/x86_64-linux-gnu/c++/4.8/bits/os_defines.h"
- "/usr/include/x86_64-linux-gnu/gnu/stubs-64.h"
- "/usr/include/x86_64-linux-gnu/gnu/stubs.h"
- "/usr/include/x86_64-linux-gnu/sys/cdefs.h"
- "/usr/include/x86_64-linux-gnu/sys/select.h"
- "/usr/include/x86_64-linux-gnu/sys/sysmacros.h"
- "/usr/include/x86_64-linux-gnu/sys/types.h"
- "/usr/include/xlocale.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/limits.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed/syslimits.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/emmintrin.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mm_malloc.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/mmintrin.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdarg.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stddef.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/stdint.h"
- "/usr/lib/gcc/x86_64-linux-gnu/4.8/include/xmmintrin.h"
- "/usr/local/cuda-6.5/include/CL/cl.h"
- "/usr/local/cuda-6.5/include/CL/cl_ext.h"
- "/usr/local/cuda-6.5/include/CL/cl_platform.h"
- "/usr/local/cuda-6.5/include/builtin_types.h"
- "/usr/local/cuda-6.5/include/channel_descriptor.h"
- "/usr/local/cuda-6.5/include/common_functions.h"
- "/usr/local/cuda-6.5/include/cuComplex.h"
- "/usr/local/cuda-6.5/include/cublas_api.h"
- "/usr/local/cuda-6.5/include/cublas_v2.h"
- "/usr/local/cuda-6.5/include/cuda.h"
- "/usr/local/cuda-6.5/include/cuda_device_runtime_api.h"
- "/usr/local/cuda-6.5/include/cuda_runtime.h"
- "/usr/local/cuda-6.5/include/cuda_runtime_api.h"
- "/usr/local/cuda-6.5/include/cuda_surface_types.h"
- "/usr/local/cuda-6.5/include/cuda_texture_types.h"
- "/usr/local/cuda-6.5/include/curand.h"
- "/usr/local/cuda-6.5/include/device_functions.h"
- "/usr/local/cuda-6.5/include/device_launch_parameters.h"
- "/usr/local/cuda-6.5/include/device_types.h"
- "/usr/local/cuda-6.5/include/driver_functions.h"
- "/usr/local/cuda-6.5/include/driver_types.h"
- "/usr/local/cuda-6.5/include/host_config.h"
- "/usr/local/cuda-6.5/include/host_defines.h"
- "/usr/local/cuda-6.5/include/math_functions.h"
- "/usr/local/cuda-6.5/include/math_functions_dbl_ptx3.h"
- "/usr/local/cuda-6.5/include/sm_11_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_12_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_13_double_functions.h"
- "/usr/local/cuda-6.5/include/sm_20_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_20_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_30_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_32_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_32_intrinsics.h"
- "/usr/local/cuda-6.5/include/sm_35_atomic_functions.h"
- "/usr/local/cuda-6.5/include/sm_35_intrinsics.h"
- "/usr/local/cuda-6.5/include/surface_functions.h"
- "/usr/local/cuda-6.5/include/surface_indirect_functions.h"
- "/usr/local/cuda-6.5/include/surface_types.h"
- "/usr/local/cuda-6.5/include/texture_fetch_functions.h"
- "/usr/local/cuda-6.5/include/texture_indirect_functions.h"
- "/usr/local/cuda-6.5/include/texture_types.h"
- "/usr/local/cuda-6.5/include/thrust/advance.h"
- "/usr/local/cuda-6.5/include/thrust/detail/advance.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/allocator_traits.h"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/allocator_traits.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/copy_construct_range.h"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/copy_construct_range.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/default_construct_range.h"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/default_construct_range.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/destroy_range.h"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/destroy_range.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/fill_construct_range.h"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/fill_construct_range.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/no_throw_allocator.h"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/tagged_allocator.h"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/tagged_allocator.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/temporary_allocator.h"
- "/usr/local/cuda-6.5/include/thrust/detail/allocator/temporary_allocator.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/config.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/compiler.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/compiler_fence.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/config.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/debug.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/device_system.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/forceinline.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/global_workarounds.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/hd_warning_disable.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/host_device.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/host_system.h"
- "/usr/local/cuda-6.5/include/thrust/detail/config/simple_defines.h"
- "/usr/local/cuda-6.5/include/thrust/detail/contiguous_storage.h"
- "/usr/local/cuda-6.5/include/thrust/detail/contiguous_storage.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/copy.h"
- "/usr/local/cuda-6.5/include/thrust/detail/copy.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/cstdint.h"
- "/usr/local/cuda-6.5/include/thrust/detail/device_free.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/device_malloc.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/device_ptr.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/device_reference.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/device_vector.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/dispatch/is_trivial_copy.h"
- "/usr/local/cuda-6.5/include/thrust/detail/distance.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/equal.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/execution_policy.h"
- "/usr/local/cuda-6.5/include/thrust/detail/extrema.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/fill.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/find.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/for_each.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/function.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/actor.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/actor.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/argument.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/composite.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/operators.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/arithmetic_operators.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/assignment_operator.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/bitwise_operators.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/compound_assignment_operators.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/logical_operators.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/operator_adaptors.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/operators/relational_operators.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/placeholder.h"
- "/usr/local/cuda-6.5/include/thrust/detail/functional/value.h"
- "/usr/local/cuda-6.5/include/thrust/detail/generate.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/host_vector.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/internal_functional.h"
- "/usr/local/cuda-6.5/include/thrust/detail/malloc_and_free.h"
- "/usr/local/cuda-6.5/include/thrust/detail/minmax.h"
- "/usr/local/cuda-6.5/include/thrust/detail/mismatch.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/numeric_traits.h"
- "/usr/local/cuda-6.5/include/thrust/detail/overlapped_copy.h"
- "/usr/local/cuda-6.5/include/thrust/detail/pair.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/pointer.h"
- "/usr/local/cuda-6.5/include/thrust/detail/pointer.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/raw_pointer_cast.h"
- "/usr/local/cuda-6.5/include/thrust/detail/raw_reference_cast.h"
- "/usr/local/cuda-6.5/include/thrust/detail/raw_reference_cast.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/reduce.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/reference.h"
- "/usr/local/cuda-6.5/include/thrust/detail/reference.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/reference_forward_declaration.h"
- "/usr/local/cuda-6.5/include/thrust/detail/replace.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/scan.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/scatter.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/static_assert.h"
- "/usr/local/cuda-6.5/include/thrust/detail/swap.h"
- "/usr/local/cuda-6.5/include/thrust/detail/swap.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/swap_ranges.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/temporary_array.h"
- "/usr/local/cuda-6.5/include/thrust/detail/temporary_array.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/temporary_buffer.h"
- "/usr/local/cuda-6.5/include/thrust/detail/transform.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/transform_reduce.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/tuple.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/tuple_meta_transform.h"
- "/usr/local/cuda-6.5/include/thrust/detail/tuple_transform.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/function_traits.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_member_function.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_nested_type.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/has_trivial_assign.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/is_call_possible.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/is_metafunction_defined.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/iterator/is_discard_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/iterator/is_output_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/minimum_type.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/pointer_traits.h"
- "/usr/local/cuda-6.5/include/thrust/detail/type_traits/result_of.h"
- "/usr/local/cuda-6.5/include/thrust/detail/uninitialized_fill.inl"
- "/usr/local/cuda-6.5/include/thrust/detail/use_default.h"
- "/usr/local/cuda-6.5/include/thrust/detail/util/align.h"
- "/usr/local/cuda-6.5/include/thrust/detail/util/blocking.h"
- "/usr/local/cuda-6.5/include/thrust/detail/vector_base.h"
- "/usr/local/cuda-6.5/include/thrust/detail/vector_base.inl"
- "/usr/local/cuda-6.5/include/thrust/device_free.h"
- "/usr/local/cuda-6.5/include/thrust/device_malloc.h"
- "/usr/local/cuda-6.5/include/thrust/device_malloc_allocator.h"
- "/usr/local/cuda-6.5/include/thrust/device_ptr.h"
- "/usr/local/cuda-6.5/include/thrust/device_reference.h"
- "/usr/local/cuda-6.5/include/thrust/device_vector.h"
- "/usr/local/cuda-6.5/include/thrust/distance.h"
- "/usr/local/cuda-6.5/include/thrust/equal.h"
- "/usr/local/cuda-6.5/include/thrust/extrema.h"
- "/usr/local/cuda-6.5/include/thrust/fill.h"
- "/usr/local/cuda-6.5/include/thrust/find.h"
- "/usr/local/cuda-6.5/include/thrust/for_each.h"
- "/usr/local/cuda-6.5/include/thrust/functional.h"
- "/usr/local/cuda-6.5/include/thrust/generate.h"
- "/usr/local/cuda-6.5/include/thrust/host_vector.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/counting_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/any_assign.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/any_system_tag.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/counting_iterator.inl"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/device_system_tag.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/discard_iterator_base.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/distance_from_result.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/host_system_tag.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/is_iterator_category.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/is_trivial_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_adaptor_base.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_category_to_system.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_category_to_traversal.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_facade_category.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_traits.inl"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/iterator_traversal_tags.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/minimum_category.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/minimum_system.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/normal_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/permutation_iterator_base.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/reverse_iterator.inl"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/reverse_iterator_base.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/tagged_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/transform_iterator.inl"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/tuple_of_iterator_references.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/universal_categories.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/zip_iterator.inl"
- "/usr/local/cuda-6.5/include/thrust/iterator/detail/zip_iterator_base.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/discard_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/iterator_adaptor.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/iterator_categories.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/iterator_facade.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/iterator_traits.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/permutation_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/reverse_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/transform_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/iterator/zip_iterator.h"
- "/usr/local/cuda-6.5/include/thrust/memory.h"
- "/usr/local/cuda-6.5/include/thrust/mismatch.h"
- "/usr/local/cuda-6.5/include/thrust/pair.h"
- "/usr/local/cuda-6.5/include/thrust/reduce.h"
- "/usr/local/cuda-6.5/include/thrust/replace.h"
- "/usr/local/cuda-6.5/include/thrust/scan.h"
- "/usr/local/cuda-6.5/include/thrust/scatter.h"
- "/usr/local/cuda-6.5/include/thrust/swap.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/assign_value.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/copy.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/execution_policy.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/extrema.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/find.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/for_each.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/generate.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/get_value.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/iter_swap.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/malloc_and_free.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/reduce.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/reduce_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/scan.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/scan_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/swap_ranges.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/temporary_buffer.h"
- "/usr/local/cuda-6.5/include/thrust/system/cpp/detail/transform.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/assign_value.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/block/inclusive_scan.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/block/reduce.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_cross_system.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_cross_system.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_device_to_device.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/copy_device_to_device.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/cuda_launch_config.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/default_decomposition.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/default_decomposition.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/alignment.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/fast_scan.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/fast_scan.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_calculator.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_calculator.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_closure.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/launch_closure.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/detail/uninitialized.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/error.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/execution_policy.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/extern_shared_ptr.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/fill.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/fill.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/for_each.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/for_each.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/get_value.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/iter_swap.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/malloc_and_free.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_by_key.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_intervals.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/reduce_intervals.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/runtime_introspection.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/runtime_introspection.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/scan.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/scan.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/swap_ranges.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/synchronize.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/synchronize.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/transform.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/trivial_copy.h"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/detail/trivial_copy.inl"
- "/usr/local/cuda-6.5/include/thrust/system/cuda/error.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/assign_value.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/copy.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/equal.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/extrema.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/fill.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/find.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/for_each.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/generate.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/get_value.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/iter_swap.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/malloc_and_free.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/mismatch.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/reduce.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/reduce_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/replace.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scan.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scan_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/scatter.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/swap_ranges.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/temporary_buffer.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/transform.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/transform_reduce.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/adl/uninitialized_fill.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/bad_alloc.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/errno.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/error_category.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/error_code.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/error_condition.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/advance.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/advance.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/copy.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/copy.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/distance.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/distance.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/equal.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/equal.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/extrema.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/extrema.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/fill.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/find.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/find.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/for_each.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/generate.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/generate.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/memory.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/memory.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/mismatch.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/mismatch.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/reduce_by_key.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/replace.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/replace.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scan_by_key.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scatter.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/scatter.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/select_system.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/swap_ranges.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/swap_ranges.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/tag.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/temporary_buffer.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/temporary_buffer.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform_reduce.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/transform_reduce.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/type_traits.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/uninitialized_fill.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/generic/uninitialized_fill.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/decompose.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/copy.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/copy.inl"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/extrema.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/find.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/for_each.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/general_copy.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/reduce.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/reduce_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/scan.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/scan_by_key.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/internal/scalar/trivial_copy.h"
- "/usr/local/cuda-6.5/include/thrust/system/detail/system_error.inl"
- "/usr/local/cuda-6.5/include/thrust/system/error_code.h"
- "/usr/local/cuda-6.5/include/thrust/system/system_error.h"
- "/usr/local/cuda-6.5/include/thrust/system_error.h"
- "/usr/local/cuda-6.5/include/thrust/transform.h"
- "/usr/local/cuda-6.5/include/thrust/transform_reduce.h"
- "/usr/local/cuda-6.5/include/thrust/tuple.h"
- "/usr/local/cuda-6.5/include/thrust/uninitialized_fill.h"
- "/usr/local/cuda-6.5/include/vector_functions.h"
- "/usr/local/cuda-6.5/include/vector_types.h"
- "/usr/local/include/boost/assert.hpp"
- "/usr/local/include/boost/checked_delete.hpp"
- "/usr/local/include/boost/config.hpp"
- "/usr/local/include/boost/config/compiler/gcc.hpp"
- "/usr/local/include/boost/config/compiler/nvcc.hpp"
- "/usr/local/include/boost/config/no_tr1/memory.hpp"
- "/usr/local/include/boost/config/no_tr1/utility.hpp"
- "/usr/local/include/boost/config/platform/linux.hpp"
- "/usr/local/include/boost/config/posix_features.hpp"
- "/usr/local/include/boost/config/select_compiler_config.hpp"
- "/usr/local/include/boost/config/select_platform_config.hpp"
- "/usr/local/include/boost/config/select_stdlib_config.hpp"
- "/usr/local/include/boost/config/stdlib/libstdcpp3.hpp"
- "/usr/local/include/boost/config/suffix.hpp"
- "/usr/local/include/boost/config/user.hpp"
- "/usr/local/include/boost/core/checked_delete.hpp"
- "/usr/local/include/boost/core/demangle.hpp"
- "/usr/local/include/boost/core/typeinfo.hpp"
- "/usr/local/include/boost/current_function.hpp"
- "/usr/local/include/boost/detail/sp_typeinfo.hpp"
- "/usr/local/include/boost/detail/workaround.hpp"
- "/usr/local/include/boost/exception/exception.hpp"
- "/usr/local/include/boost/predef.h"
- "/usr/local/include/boost/predef/architecture.h"
- "/usr/local/include/boost/predef/architecture/alpha.h"
- "/usr/local/include/boost/predef/architecture/arm.h"
- "/usr/local/include/boost/predef/architecture/blackfin.h"
- "/usr/local/include/boost/predef/architecture/convex.h"
- "/usr/local/include/boost/predef/architecture/ia64.h"
- "/usr/local/include/boost/predef/architecture/m68k.h"
- "/usr/local/include/boost/predef/architecture/mips.h"
- "/usr/local/include/boost/predef/architecture/parisc.h"
- "/usr/local/include/boost/predef/architecture/ppc.h"
- "/usr/local/include/boost/predef/architecture/pyramid.h"
- "/usr/local/include/boost/predef/architecture/rs6k.h"
- "/usr/local/include/boost/predef/architecture/sparc.h"
- "/usr/local/include/boost/predef/architecture/superh.h"
- "/usr/local/include/boost/predef/architecture/sys370.h"
- "/usr/local/include/boost/predef/architecture/sys390.h"
- "/usr/local/include/boost/predef/architecture/x86.h"
- "/usr/local/include/boost/predef/architecture/x86/32.h"
- "/usr/local/include/boost/predef/architecture/x86/64.h"
- "/usr/local/include/boost/predef/architecture/z.h"
- "/usr/local/include/boost/predef/compiler.h"
- "/usr/local/include/boost/predef/compiler/borland.h"
- "/usr/local/include/boost/predef/compiler/clang.h"
- "/usr/local/include/boost/predef/compiler/comeau.h"
- "/usr/local/include/boost/predef/compiler/compaq.h"
- "/usr/local/include/boost/predef/compiler/diab.h"
- "/usr/local/include/boost/predef/compiler/digitalmars.h"
- "/usr/local/include/boost/predef/compiler/dignus.h"
- "/usr/local/include/boost/predef/compiler/edg.h"
- "/usr/local/include/boost/predef/compiler/ekopath.h"
- "/usr/local/include/boost/predef/compiler/gcc.h"
- "/usr/local/include/boost/predef/compiler/gcc_xml.h"
- "/usr/local/include/boost/predef/compiler/greenhills.h"
- "/usr/local/include/boost/predef/compiler/hp_acc.h"
- "/usr/local/include/boost/predef/compiler/iar.h"
- "/usr/local/include/boost/predef/compiler/ibm.h"
- "/usr/local/include/boost/predef/compiler/intel.h"
- "/usr/local/include/boost/predef/compiler/kai.h"
- "/usr/local/include/boost/predef/compiler/llvm.h"
- "/usr/local/include/boost/predef/compiler/metaware.h"
- "/usr/local/include/boost/predef/compiler/metrowerks.h"
- "/usr/local/include/boost/predef/compiler/microtec.h"
- "/usr/local/include/boost/predef/compiler/mpw.h"
- "/usr/local/include/boost/predef/compiler/palm.h"
- "/usr/local/include/boost/predef/compiler/pgi.h"
- "/usr/local/include/boost/predef/compiler/sgi_mipspro.h"
- "/usr/local/include/boost/predef/compiler/sunpro.h"
- "/usr/local/include/boost/predef/compiler/tendra.h"
- "/usr/local/include/boost/predef/compiler/visualc.h"
- "/usr/local/include/boost/predef/compiler/watcom.h"
- "/usr/local/include/boost/predef/detail/_cassert.h"
- "/usr/local/include/boost/predef/detail/_exception.h"
- "/usr/local/include/boost/predef/detail/comp_detected.h"
- "/usr/local/include/boost/predef/detail/os_detected.h"
- "/usr/local/include/boost/predef/detail/test.h"
- "/usr/local/include/boost/predef/language.h"
- "/usr/local/include/boost/predef/language/objc.h"
- "/usr/local/include/boost/predef/language/stdc.h"
- "/usr/local/include/boost/predef/language/stdcpp.h"
- "/usr/local/include/boost/predef/library.h"
- "/usr/local/include/boost/predef/library/c.h"
- "/usr/local/include/boost/predef/library/c/_prefix.h"
- "/usr/local/include/boost/predef/library/c/gnu.h"
- "/usr/local/include/boost/predef/library/c/uc.h"
- "/usr/local/include/boost/predef/library/c/vms.h"
- "/usr/local/include/boost/predef/library/c/zos.h"
- "/usr/local/include/boost/predef/library/std.h"
- "/usr/local/include/boost/predef/library/std/_prefix.h"
- "/usr/local/include/boost/predef/library/std/cxx.h"
- "/usr/local/include/boost/predef/library/std/dinkumware.h"
- "/usr/local/include/boost/predef/library/std/libcomo.h"
- "/usr/local/include/boost/predef/library/std/modena.h"
- "/usr/local/include/boost/predef/library/std/msl.h"
- "/usr/local/include/boost/predef/library/std/roguewave.h"
- "/usr/local/include/boost/predef/library/std/sgi.h"
- "/usr/local/include/boost/predef/library/std/stdcpp3.h"
- "/usr/local/include/boost/predef/library/std/stlport.h"
- "/usr/local/include/boost/predef/library/std/vacpp.h"
- "/usr/local/include/boost/predef/make.h"
- "/usr/local/include/boost/predef/os.h"
- "/usr/local/include/boost/predef/os/aix.h"
- "/usr/local/include/boost/predef/os/amigaos.h"
- "/usr/local/include/boost/predef/os/android.h"
- "/usr/local/include/boost/predef/os/beos.h"
- "/usr/local/include/boost/predef/os/bsd.h"
- "/usr/local/include/boost/predef/os/bsd/bsdi.h"
- "/usr/local/include/boost/predef/os/bsd/dragonfly.h"
- "/usr/local/include/boost/predef/os/bsd/free.h"
- "/usr/local/include/boost/predef/os/bsd/net.h"
- "/usr/local/include/boost/predef/os/bsd/open.h"
- "/usr/local/include/boost/predef/os/cygwin.h"
- "/usr/local/include/boost/predef/os/hpux.h"
- "/usr/local/include/boost/predef/os/ios.h"
- "/usr/local/include/boost/predef/os/irix.h"
- "/usr/local/include/boost/predef/os/linux.h"
- "/usr/local/include/boost/predef/os/macos.h"
- "/usr/local/include/boost/predef/os/os400.h"
- "/usr/local/include/boost/predef/os/qnxnto.h"
- "/usr/local/include/boost/predef/os/solaris.h"
- "/usr/local/include/boost/predef/os/unix.h"
- "/usr/local/include/boost/predef/os/vms.h"
- "/usr/local/include/boost/predef/os/windows.h"
- "/usr/local/include/boost/predef/other.h"
- "/usr/local/include/boost/predef/other/endian.h"
- "/usr/local/include/boost/predef/platform.h"
- "/usr/local/include/boost/predef/platform/mingw.h"
- "/usr/local/include/boost/predef/platform/windows_desktop.h"
- "/usr/local/include/boost/predef/platform/windows_phone.h"
- "/usr/local/include/boost/predef/platform/windows_runtime.h"
- "/usr/local/include/boost/predef/platform/windows_store.h"
- "/usr/local/include/boost/predef/version_number.h"
- "/usr/local/include/boost/shared_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/bad_weak_ptr.hpp"
- "/usr/local/include/boost/smart_ptr/detail/operator_bool.hpp"
- "/usr/local/include/boost/smart_ptr/detail/shared_count.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_convertible.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_base.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_base_gcc_x86.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_counted_impl.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_has_sync.hpp"
- "/usr/local/include/boost/smart_ptr/detail/sp_nullptr_t.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock_pool.hpp"
- "/usr/local/include/boost/smart_ptr/detail/spinlock_sync.hpp"
- "/usr/local/include/boost/smart_ptr/detail/yield_k.hpp"
- "/usr/local/include/boost/smart_ptr/shared_ptr.hpp"
- "/usr/local/include/boost/throw_exception.hpp"
- "/usr/local/include/gflags/gflags.h"
- "/usr/local/include/gflags/gflags_declare.h"
- "/usr/local/include/glog/log_severity.h"
- "/usr/local/include/glog/logging.h"
- "/usr/local/include/glog/vlog_is_on.h"
-)
-
diff --git a/src/caffe/CMakeFiles/progress.marks b/src/caffe/CMakeFiles/progress.marks
deleted file mode 100644
index abdfb053..00000000
--- a/src/caffe/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-60
diff --git a/src/caffe/CMakeFiles/proto.dir/CXX.includecache b/src/caffe/CMakeFiles/proto.dir/CXX.includecache
deleted file mode 100644
index df68b9a9..00000000
--- a/src/caffe/CMakeFiles/proto.dir/CXX.includecache
+++ /dev/null
@@ -1,48 +0,0 @@
-#IncludeRegexLine: ^[ 	]*#[ 	]*(include|import)[ 	]*[<"]([^">]+)([">])
-
-#IncludeRegexScan: ^.*$
-
-#IncludeRegexComplain: ^$
-
-#IncludeRegexTransform: 
-
-/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc
-caffe.pb.h
-/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h
-algorithm
--
-google/protobuf/stubs/common.h
--
-google/protobuf/stubs/once.h
--
-google/protobuf/io/coded_stream.h
--
-google/protobuf/wire_format_lite_inl.h
--
-google/protobuf/descriptor.h
--
-google/protobuf/generated_message_reflection.h
--
-google/protobuf/reflection_ops.h
--
-google/protobuf/wire_format.h
--
-
-/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h
-string
--
-google/protobuf/stubs/common.h
--
-google/protobuf/generated_message_util.h
--
-google/protobuf/message.h
--
-google/protobuf/repeated_field.h
--
-google/protobuf/extension_set.h
--
-google/protobuf/generated_enum_reflection.h
--
-google/protobuf/unknown_field_set.h
--
-
diff --git a/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake b/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake
deleted file mode 100644
index 44c81e52..00000000
--- a/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake
+++ /dev/null
@@ -1,39 +0,0 @@
-# The set of languages for which implicit dependencies are needed:
-SET(CMAKE_DEPENDS_LANGUAGES
-  "CXX"
-  )
-# The set of files for implicit dependencies of each language:
-SET(CMAKE_DEPENDS_CHECK_CXX
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o"
-  )
-SET(CMAKE_CXX_COMPILER_ID "GNU")
-
-# Preprocessor definitions for this target.
-SET(CMAKE_TARGET_DEFINITIONS
-  "GTEST_USE_OWN_TR1_TUPLE"
-  )
-
-# Pairs of files generated by the same build rule.
-SET(CMAKE_MULTIPLE_OUTPUT_PAIRS
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe_pb2.py" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc"
-  )
-
-
-# Targets to which this target links.
-SET(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# The include file search paths:
-SET(CMAKE_C_TARGET_INCLUDE_PATH
-  "src"
-  "/usr/local/include"
-  "include"
-  "/usr/local/cuda/include"
-  "/usr/local/include/opencv"
-  "/usr/include/atlas"
-  "."
-  )
-SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/caffe/CMakeFiles/proto.dir/build.make b/src/caffe/CMakeFiles/proto.dir/build.make
deleted file mode 100644
index 1467c124..00000000
--- a/src/caffe/CMakeFiles/proto.dir/build.make
+++ /dev/null
@@ -1,119 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Remove some rules from gmake that .SUFFIXES does not remove.
-SUFFIXES =
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E remove -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The program to use to edit the cache.
-CMAKE_EDIT_COMMAND = /usr/bin/ccmake
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# Include any dependencies generated for this target.
-include src/caffe/CMakeFiles/proto.dir/depend.make
-
-# Include the progress variables for this target.
-include src/caffe/CMakeFiles/proto.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include src/caffe/CMakeFiles/proto.dir/flags.make
-
-include/caffe/proto/caffe.pb.cc: src/caffe/proto/caffe.proto
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Running C++/Python protocol buffer compiler on /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/protoc --cpp_out /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto -I /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/protoc --python_out /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto -I /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/proto/caffe.proto
-
-include/caffe/proto/caffe.pb.h: include/caffe/proto/caffe.pb.cc
-
-include/caffe/proto/caffe_pb2.py: include/caffe/proto/caffe.pb.cc
-
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: src/caffe/CMakeFiles/proto.dir/flags.make
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.cc
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc
-
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc > CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i
-
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc -o CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s
-
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires:
-.PHONY : src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires
-
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires
-	$(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides.build
-.PHONY : src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides
-
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.provides.build: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
-
-# Object files for target proto
-proto_OBJECTS = \
-"CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o"
-
-# External object files for target proto
-proto_EXTERNAL_OBJECTS =
-
-lib/libproto.a: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
-lib/libproto.a: src/caffe/CMakeFiles/proto.dir/build.make
-lib/libproto.a: src/caffe/CMakeFiles/proto.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX static library ../../lib/libproto.a"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/proto.dir/cmake_clean_target.cmake
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/proto.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-src/caffe/CMakeFiles/proto.dir/build: lib/libproto.a
-.PHONY : src/caffe/CMakeFiles/proto.dir/build
-
-src/caffe/CMakeFiles/proto.dir/requires: src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o.requires
-.PHONY : src/caffe/CMakeFiles/proto.dir/requires
-
-src/caffe/CMakeFiles/proto.dir/clean:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe && $(CMAKE_COMMAND) -P CMakeFiles/proto.dir/cmake_clean.cmake
-.PHONY : src/caffe/CMakeFiles/proto.dir/clean
-
-src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe.pb.cc
-src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe.pb.h
-src/caffe/CMakeFiles/proto.dir/depend: include/caffe/proto/caffe_pb2.py
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : src/caffe/CMakeFiles/proto.dir/depend
-
diff --git a/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake b/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake
deleted file mode 100644
index 79cb425a..00000000
--- a/src/caffe/CMakeFiles/proto.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,13 +0,0 @@
-FILE(REMOVE_RECURSE
-  "../../include/caffe/proto/caffe.pb.cc"
-  "../../include/caffe/proto/caffe.pb.h"
-  "../../include/caffe/proto/caffe_pb2.py"
-  "CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o"
-  "../../lib/libproto.pdb"
-  "../../lib/libproto.a"
-)
-
-# Per-language clean rules from dependency scanning.
-FOREACH(lang CXX)
-  INCLUDE(CMakeFiles/proto.dir/cmake_clean_${lang}.cmake OPTIONAL)
-ENDFOREACH(lang)
diff --git a/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake b/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake
deleted file mode 100644
index 6172b692..00000000
--- a/src/caffe/CMakeFiles/proto.dir/cmake_clean_target.cmake
+++ /dev/null
@@ -1,3 +0,0 @@
-FILE(REMOVE_RECURSE
-  "../../lib/libproto.a"
-)
diff --git a/src/caffe/CMakeFiles/proto.dir/depend.internal b/src/caffe/CMakeFiles/proto.dir/depend.internal
deleted file mode 100644
index 2f8ec677..00000000
--- a/src/caffe/CMakeFiles/proto.dir/depend.internal
+++ /dev/null
@@ -1,6 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
- /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.cc
- /home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h
diff --git a/src/caffe/CMakeFiles/proto.dir/depend.make b/src/caffe/CMakeFiles/proto.dir/depend.make
deleted file mode 100644
index 239c4242..00000000
--- a/src/caffe/CMakeFiles/proto.dir/depend.make
+++ /dev/null
@@ -1,6 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.cc
-src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o: include/caffe/proto/caffe.pb.h
-
diff --git a/src/caffe/CMakeFiles/proto.dir/flags.make b/src/caffe/CMakeFiles/proto.dir/flags.make
deleted file mode 100644
index 8b4ef992..00000000
--- a/src/caffe/CMakeFiles/proto.dir/flags.make
+++ /dev/null
@@ -1,8 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# compile CXX with /usr/bin/c++
-CXX_FLAGS =  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe   
-
-CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE
-
diff --git a/src/caffe/CMakeFiles/proto.dir/link.txt b/src/caffe/CMakeFiles/proto.dir/link.txt
deleted file mode 100644
index 42f85bda..00000000
--- a/src/caffe/CMakeFiles/proto.dir/link.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-/usr/bin/ar cr ../../lib/libproto.a  CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
-/usr/bin/ranlib ../../lib/libproto.a
diff --git a/src/caffe/CMakeFiles/proto.dir/progress.make b/src/caffe/CMakeFiles/proto.dir/progress.make
deleted file mode 100644
index 25d32761..00000000
--- a/src/caffe/CMakeFiles/proto.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 67
-CMAKE_PROGRESS_2 = 
-
diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
deleted file mode 100644
index 40e6c11f..00000000
--- a/src/caffe/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-# generate protobuf sources
-file(GLOB proto_files proto/*.proto)
-caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files})
-
-# include python files either to force generation
-add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
-set(Caffe_LINKER_LIBS proto ${Caffe_LINKER_LIBS}) # note, crucial to prepend!
-caffe_default_properties(proto)
-
-# --[ Caffe library
-
-# creates 'test_srcs', 'srcs', 'test_cuda', 'cuda' lists
-caffe_pickup_caffe_sources(${PROJECT_SOURCE_DIR})
-
-if(HAVE_CUDA)
-  caffe_cuda_compile(cuda_objs ${cuda})
-  list(APPEND srcs ${cuda_objs} ${cuda})
-endif()
-
-add_library(caffe ${srcs})
-target_link_libraries(caffe proto ${Caffe_LINKER_LIBS})
-caffe_default_properties(caffe)
-
-# ---[ Tests
- add_subdirectory(test)
-
-# ---[ Install
-install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include)
-install(FILES ${proto_hdrs} DESTINATION include/caffe/proto)
-install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib)
-
-file(WRITE ${PROJECT_BINARY_DIR}/__init__.py)
-list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py)
-install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto)
-
-

From 04d42ec6da24f922eee084e0b6b75f3f427db5c9 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 13 Sep 2015 23:44:02 +0800
Subject: [PATCH 101/124] Enable CPU_ONLY flag

---
 include/caffe/common.hpp                      |   5 +-
 include/caffe/device.hpp                      |   5 +-
 include/caffe/syncedmem.hpp                   |  10 +-
 include/caffe/util/im2col.hpp                 |   2 +
 include/caffe/util/math_functions.hpp         |   2 -
 include/caffe/util/ocl_util.hpp               |   3 +-
 include/caffe/util/ocl_wrapper.hpp            |   2 +
 include/caffe/vision_layers.hpp               |   3 +-
 src/caffe/device.cpp                          |   2 +
 src/caffe/layers/absval_layer.cpp             |   3 +-
 src/caffe/layers/base_conv_layer.cpp          |   5 +
 src/caffe/layers/base_data_layer.cpp          |   7 +-
 src/caffe/layers/bnll_layer.cpp               |   3 +-
 src/caffe/layers/concat_layer.cpp             |   3 +-
 src/caffe/layers/contrastive_loss_layer.cpp   |   3 +-
 src/caffe/layers/conv_layer.cpp               |   3 +-
 src/caffe/layers/deconv_layer.cpp             |   4 +-
 src/caffe/layers/dropout_layer.cpp            |   3 +-
 src/caffe/layers/eltwise_layer.cpp            |   3 +-
 src/caffe/layers/euclidean_loss_layer.cpp     |   3 +-
 src/caffe/layers/exp_layer.cpp                |   3 +-
 src/caffe/layers/filter_layer.cpp             |   3 +-
 src/caffe/layers/hdf5_data_layer.cpp          |   3 +-
 src/caffe/layers/hdf5_output_layer.cpp        |   3 +-
 src/caffe/layers/im2col_layer.cpp             |   3 +-
 src/caffe/layers/inner_product_layer.cpp      |   3 +-
 src/caffe/layers/log_layer.cpp                |   3 +-
 src/caffe/layers/lrn_layer.cpp                |   3 +-
 src/caffe/layers/mvn_layer.cpp                |   3 +-
 src/caffe/layers/pooling_layer.cpp            |   3 +-
 src/caffe/layers/power_layer.cpp              |   3 +-
 src/caffe/layers/prelu_layer.cpp              |   3 +-
 src/caffe/layers/reduction_layer.cpp          |   3 +-
 src/caffe/layers/relu_layer.cpp               |   3 +-
 .../sigmoid_cross_entropy_loss_layer.cpp      |   3 +-
 src/caffe/layers/sigmoid_layer.cpp            |   4 +-
 src/caffe/layers/silence_layer.cpp            |   3 +-
 src/caffe/layers/slice_layer.cpp              |   4 +-
 src/caffe/layers/softmax_layer.cpp            |   4 +-
 src/caffe/layers/softmax_loss_layer.cpp       |  12 +-
 src/caffe/layers/split_layer.cpp              |   5 +-
 src/caffe/layers/tanh_layer.cpp               |   3 +-
 src/caffe/layers/threshold_layer.cpp          |   3 +-
 src/caffe/net.cpp                             |   4 +
 src/caffe/solver.cpp                          |  10 -
 src/caffe/syncedmem.cpp                       |   8 +
 src/caffe/util/im2col.cpp                     |   3 +-
 src/caffe/util/math_functions.cpp             | 917 +++++++++---------
 src/caffe/util/ocl_util.cpp                   |   4 +
 src/caffe/util/ocl_wrapper.cpp                |   5 +
 50 files changed, 574 insertions(+), 534 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index f5c65eb9..7aed6007 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -1,7 +1,6 @@
 #ifndef CAFFE_COMMON_HPP_
 #define CAFFE_COMMON_HPP_
 
-#include <CL/cl_ext.h>
 #include <boost/shared_ptr.hpp>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
@@ -16,8 +15,12 @@
 #include <string>
 #include <utility>  // pair
 #include <vector>
+
+#ifndef CPU_ONLY
 #include <clBLAS.h>
 #include <CL/cl.h>
+#include <CL/cl_ext.h>
+#endif
 
 #include "caffe/device.hpp"
 #include "caffe/util/device_alternate.hpp"
diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp
index 1d9fa6fe..b6190f28 100644
--- a/include/caffe/device.hpp
+++ b/include/caffe/device.hpp
@@ -26,12 +26,11 @@
 
 #ifndef CAFFE_DEVICE_HPP
 #define CAFFE_DEVICE_HPP
-#include <CL/cl.h>
 #include <string>
 #include <fstream>
 #include "caffe/common.hpp"
 namespace caffe {
-
+#ifndef CPU_ONLY
 class Device {
   public:
     Device()
@@ -80,7 +79,7 @@ class Device {
 };
 extern std::string buildOption;
 extern Device amdDevice;
-
+#endif
 }  // namespace caffe
 
 #endif //CAFFE_DEVICE_HPP
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 1647b6f3..4092b5ac 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -68,12 +68,16 @@ class SyncedMemory {
     SyncedMemory()
         : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_(
             false), data_layer_(false) {
-      ocl_setup();
+#ifndef CPU_ONLY
+     	ocl_setup();
+#endif
     }
     explicit SyncedMemory(size_t size)
         : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_(
             false), data_layer_(false) {
-      ocl_setup();
+#ifndef CPU_ONLY
+	ocl_setup();
+#endif
     }
 
     ~SyncedMemory();
@@ -95,8 +99,10 @@ class SyncedMemory {
     void set_data_layer() {
       data_layer_ = true;
     }
+#ifndef CPU_ONLY
   private:
     void ocl_setup();
+#endif
   protected:
     cl_kernel oclmem_kernel;
 
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index f962049d..531b11ad 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -39,6 +39,7 @@ void col2im_cpu(const Dtype* data_col, const int channels, const int height,
     const int width, const int patch_h, const int patch_w, const int pad_h,
     const int pad_w, const int stride_h, const int stride_w, Dtype* data_im);
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int col_offset, const int height,
     const int width, const int channels, const int patch_h, const int patch_w,
@@ -97,6 +98,7 @@ template <typename Dtype>
 void im2col_gpu_ocl(cl_mem data_im, const int channels, const int height,
     const int width, const int ksize, const int pad, const int stride,
     Dtype* data_col, cl_kernel Kernel);
+#endif
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_IM2COL_HPP_
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index d7c67673..7178ea74 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -116,8 +116,6 @@ inline void caffe_memset(const size_t N, const int alpha, void* X) {
 inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) {
 #ifndef CPU_ONLY
   ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N);
-#else
-  NO_GPU;
 #endif
 }
 
diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
index 776fec11..00bfa3cf 100644
--- a/include/caffe/util/ocl_util.hpp
+++ b/include/caffe/util/ocl_util.hpp
@@ -28,7 +28,7 @@
 #define _CAFFE_UTIL_OCL_UTIL_HPP_
 
 namespace caffe {
-
+#ifndef CPU_ONLY
 template <typename Dtype>
 void ocl_memset(Dtype* buffer, const Dtype value, const int count);
 
@@ -36,6 +36,7 @@ void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
     const int count);
 
 void eventCallback(cl_event event, cl_int event_status, void * user_data);
+#endif
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 14cf48e9..a1d11d18 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -49,6 +49,7 @@ template <typename dtype> inline std::string get_dtype_suffix() {
   return suffix;
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_,
     const int M_, const int packing_num);
@@ -339,6 +340,7 @@ void MaxForward(const int nthreads, const Dtype* bottom_data_a,
 template <typename Dtype>
 void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx,
     const int* mask, Dtype* bottom_diff);
+#endif
 }
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
 // namespace caffe
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index bc6cd5de..e2a9b190 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -232,7 +232,7 @@ class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
       return false;
     }
     virtual void compute_output_shape();
-
+#ifndef CPU_ONLY
     virtual void Forward_gpu_org(const vector<Blob<Dtype>*>& bottom,
         const vector<Blob<Dtype>*>& top);
     virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
@@ -241,6 +241,7 @@ class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
         const vector<Blob<Dtype>*>& top);
     virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
         const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+#endif
 };
 
 /**
diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp
index bb8f9cb6..fcbffe09 100644
--- a/src/caffe/device.cpp
+++ b/src/caffe/device.cpp
@@ -33,6 +33,7 @@
 #include <dirent.h>
 
 namespace caffe {
+#ifndef CPU_ONLY
 string buildOption = "-x clc++ ";
 std::string oclKernelPath = "./src/caffe/ocl/";
 Device amdDevice;
@@ -420,5 +421,6 @@ void Device::appendBitfield(T info, T value, std::string name,
   }
 }
 
+#endif
 }  // namespace caffe
 
diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index 945162af..6e06b558 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -35,6 +35,7 @@ void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 // begin: code written/modified by AMD
 template <typename Dtype>
 void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -57,7 +58,7 @@ void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(AbsValLayer);
 #endif
 
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index ee0df02f..04cd38dd 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -9,6 +9,7 @@
 
 namespace caffe {
 
+#ifndef CPU_ONLY
 #ifdef use_packing_scheme
 template <typename Dtype> size_t BaseConvolutionLayer<Dtype>::subtop_mem_size = sizeof(Dtype);
 template <typename Dtype> size_t BaseConvolutionLayer<Dtype>::trans_mem_size = sizeof(Dtype);
@@ -46,6 +47,8 @@ void BaseConvolutionLayer<Dtype>::ocl_setup() {
 #endif
 }
 
+#endif
+
 template <typename Dtype>
 BaseConvolutionLayer<Dtype>::~BaseConvolutionLayer() {
 }
@@ -204,8 +207,10 @@ void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     caffe_set(bias_multiplier_.count(), Dtype(1),
         bias_multiplier_.mutable_cpu_data());
   }
+#ifndef CPU_ONLY
   //initializa OpenCL kernels and cl_mem objects
   ocl_setup();
+#endif
 }
 
 template <typename Dtype>
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index d02e92c4..ff4436a7 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -80,6 +80,8 @@ void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
   CreatePrefetchThread();
 }
 
+#ifndef CPU_ONLY
+
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -104,15 +106,12 @@ void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
             0, NULL, NULL));
   }
 
-#ifdef Track_data_transfer
-#endif
-
   // Start a new prefetch thread
   DLOG(INFO) << "CreatePrefetchThread";
   CreatePrefetchThread();
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward);
 #endif
 
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index c2cce9e3..ed9cc1d4 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -38,6 +38,7 @@ void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 // begin: code written/modified by AMD
 template <typename Dtype>
 void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -62,7 +63,7 @@ void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(BNLLLayer);
 #endif
 
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 5a351009..5cceb9ff 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -94,6 +94,7 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 // begin: code written/modified by AMD
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -139,7 +140,7 @@ void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(ConcatLayer);
 #endif
 
diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index a8e6f523..6dda7d61 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -101,6 +101,7 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 // begin: code written/modified by AMD
+#ifndef CPU_ONLY
 template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -156,7 +157,7 @@ void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(ContrastiveLossLayer);
 #endif
 
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 9c250c42..b64eb1aa 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -70,6 +70,7 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 // begin: code written/modified by AMD
+#ifndef CPU_ONLY
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -228,7 +229,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
 }
 // end: code written/modified by AMD
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(ConvolutionLayer);
 #endif
 
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index 8ee81c9f..2504f43a 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -69,6 +69,8 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
+
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -125,7 +127,7 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else 
 STUB_GPU(DeconvolutionLayer);
 #endif
 
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 17196f10..2cb50ead 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -66,6 +66,7 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 // begin: code is written/modified by AMD
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -103,7 +104,7 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(DropoutLayer);
 #endif
 
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index e2e5e1ab..971703f4 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -155,6 +155,7 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 // begin: code written/modified by AMD
+#ifndef CPU_ONLY
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -241,7 +242,7 @@ void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(EltwiseLayer);
 #endif
 
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index fce99953..2130c6f4 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -43,6 +43,7 @@ void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 // begin: code written/modified by AMD
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -72,7 +73,7 @@ void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(EuclideanLossLayer);
 #endif
 
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index 25bcd0a0..3fe7cde4 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -62,6 +62,7 @@ void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 // begin: code written/modified by AMD
+#ifndef CPU_ONLY
 template <typename Dtype>
 void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -95,7 +96,7 @@ void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(ExpLayer);
 #endif
 
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index fc3ca142..2cd9957d 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -118,6 +118,7 @@ void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 // begin: code written/modified by AMD
+#ifndef CPU_ONLY
 template <typename Dtype>
 void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -177,7 +178,7 @@ void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(FilterLayer);
 #endif
 
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 2d7d405e..28eee444 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -159,6 +159,7 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 }
 
 // begin: code written/modified by AMD
+#ifndef CPU_ONLY
 template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -197,7 +198,7 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU_FORWARD(HDF5DataLayer, Forward);
 #endif
 
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index f9215a3d..11d01647 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -67,6 +67,7 @@ void HDF5OutputLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   return;
 }
 
+#ifndef CPU_ONLY
 // begin: code written/modified by AMD
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -103,7 +104,7 @@ void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   return;
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(HDF5OutputLayer);
 #endif
 
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index 886782b9..f51fd7cc 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -87,6 +87,7 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -111,7 +112,7 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(Im2colLayer);
 #endif
 
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index b9ae3370..b40e3e7d 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -119,6 +119,7 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -157,7 +158,7 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(InnerProductLayer);
 #endif
 
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index f6ace662..5dbbca74 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -79,6 +79,7 @@ void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   caffe_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -126,7 +127,7 @@ void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(LogLayer);
 #endif
 
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 00e554bd..da3d1fc3 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -251,6 +251,7 @@ void LRNLayer<Dtype>::WithinChannelBackward(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -309,7 +310,7 @@ void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     LOG(FATAL) << "Unknown normalization region.";
   }
 }
-#ifdef CPU_ONLY
+#else
 STUB_GPU(LRNLayer);
 STUB_GPU_FORWARD(LRNLayer, CrossChannelForward);
 STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward);
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index 64c3063f..2c4acb14 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -122,6 +122,7 @@ void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -228,7 +229,7 @@ void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else 
 STUB_GPU(MVNLayer);
 #endif
 
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 85c57379..0becf164 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -313,6 +313,7 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 // begin: code written/modified by AMD
 template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -409,7 +410,7 @@ void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 }
 
 // end: code written/modified by AMD
-#ifdef CPU_ONLY
+#else
 STUB_GPU(PoolingLayer);
 #endif
 
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index 6b2c5f1d..a0f5ccee 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -96,6 +96,7 @@ void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 // begin: code written/modified by AMD
+#ifndef CPU_ONLY
 template <typename Dtype>
 void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -169,7 +170,7 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 // end: code written/modified by AMD
-#ifdef CPU_ONLY
+#else
 STUB_GPU(PowerLayer);
 #endif
 
diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp
index 8ec6664d..75aa3968 100644
--- a/src/caffe/layers/prelu_layer.cpp
+++ b/src/caffe/layers/prelu_layer.cpp
@@ -128,6 +128,7 @@ void PReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -197,7 +198,7 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(PReLULayer);
 #endif
 
diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp
index 89df6589..9ec057b1 100644
--- a/src/caffe/layers/reduction_layer.cpp
+++ b/src/caffe/layers/reduction_layer.cpp
@@ -124,6 +124,7 @@ void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -207,7 +208,7 @@ void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(ReductionLayer);
 #endif
 
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index b07e6447..132d7b4b 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -34,6 +34,7 @@ void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -57,7 +58,7 @@ void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else 
 STUB_GPU(ReLULayer);
 #endif
 
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
index 4048a8e8..f074ac51 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -71,6 +71,7 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
     const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
@@ -94,7 +95,7 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward);
 #endif
 
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index a4359920..737bff74 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -39,6 +39,8 @@ void SigmoidLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
+
 template <typename Dtype>
 void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -62,7 +64,7 @@ void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(SigmoidLayer);
 #endif
 
diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp
index 1c463499..a6c30fbb 100644
--- a/src/caffe/layers/silence_layer.cpp
+++ b/src/caffe/layers/silence_layer.cpp
@@ -16,6 +16,7 @@ void SilenceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -33,7 +34,7 @@ void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(SilenceLayer);
 #endif
 
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index da4059a0..8263b92b 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -110,7 +110,7 @@ void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     offset_slice_axis += top_slice_axis;
   }
 }
-
+#ifndef CPU_ONLY
 template <typename Dtype>
 void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -121,7 +121,7 @@ void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(SliceLayer);
 #endif
 
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 92162821..366946bd 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -89,6 +89,8 @@ void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   // elementwise multiplication
   caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff);
 }
+
+#ifndef CPU_ONLY
 // begin: code written/modified by AMD
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -147,7 +149,7 @@ void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 
 }
 // end: code written/modified by AMD
-#ifdef CPU_ONLY
+#else
 STUB_GPU(SoftmaxLayer);
 #endif
 
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 62c10e30..2241bd6c 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -27,15 +27,6 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     ignore_label_ = this->layer_param_.loss_param().ignore_label();
   }
   normalize_ = this->layer_param_.loss_param().normalize();
-
-  ocl_setup();
-}
-
-template <typename Dtype>
-void SoftmaxWithLossLayer<Dtype>::ocl_setup() {
-  d_loss = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
-      sizeof(Dtype), NULL, NULL);
-
 }
 
 template <typename Dtype>
@@ -134,6 +125,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 // begin: code written/modified by AMD
+#ifndef CPU_ONLY
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -200,7 +192,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 // end: code written/modified by AMD
-#ifdef CPU_ONLY
+#else
 STUB_GPU(SoftmaxWithLossLayer);
 #endif
 
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 7a40bf8a..57677b5b 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -21,8 +21,6 @@ void SplitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     top[i]->ReshapeLike(*bottom[0]);
     CHECK_EQ(count_, top[i]->count());
   }
-  gpu_add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float",
-      NULL);
 }
 
 template <typename Dtype>
@@ -53,6 +51,7 @@ void SplitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef  CPU_ONLY
 template <typename Dtype>
 void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -82,7 +81,7 @@ void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 // end: code written/modified by AMD
-#ifdef CPU_ONLY
+#else
 STUB_GPU(SplitLayer);
 #endif
 
diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp
index 3e85330c..7a15809d 100644
--- a/src/caffe/layers/tanh_layer.cpp
+++ b/src/caffe/layers/tanh_layer.cpp
@@ -37,6 +37,7 @@ void TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -60,7 +61,7 @@ void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU(TanHLayer);
 #endif
 
diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp
index 16ca8944..a4c543ee 100644
--- a/src/caffe/layers/threshold_layer.cpp
+++ b/src/caffe/layers/threshold_layer.cpp
@@ -24,6 +24,7 @@ void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   }
 }
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -34,7 +35,7 @@ void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   ThresholdForward(count, threshold_, bottom_data, top_data);
 }
 
-#ifdef CPU_ONLY
+#else
 STUB_GPU_FORWARD(ThresholdLayer, Forward);
 #endif
 
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 6911854c..711ec408 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -528,7 +528,9 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
     if (debug_info_) {
       ForwardDebugInfo(i);
     }
+#ifndef CPU_ONLY
     clFinish(amdDevice.CommandQueue);
+#endif
     layer_timer.Stop();
     printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
         layer_timer.MilliSeconds());
@@ -608,7 +610,9 @@ void Net<Dtype>::BackwardFromTo(int start, int end) {
       if (debug_info_) {
         BackwardDebugInfo(i);
       }
+#ifndef CPU_ONLY
       clFinish(amdDevice.CommandQueue);
+#endif
       layer_timer.Start();
       printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
           layer_timer.MilliSeconds());
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 8d7f8238..20af4160 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -19,14 +19,6 @@ Solver<Dtype>::Solver(const SolverParameter& param)
   Init(param);
 }
 
-template <typename Dtype>
-void Solver<Dtype>::ocl_setup() {
-  scalar_kernel = clCreateKernel(amdDevice.Program, "add_scalar_float", NULL);
-  add_kernel = clCreateKernel(amdDevice.Program, "caffe_gpu_add_float", NULL);
-  div_kernel = clCreateKernel(amdDevice.Program, "div_float", NULL);
-  powx_kernel = clCreateKernel(amdDevice.Program, "powx_float", NULL);
-}
-
 template <typename Dtype>
 Solver<Dtype>::Solver(const string& param_file)
     : net_() {
@@ -42,8 +34,6 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
 
-  ocl_setup();
-
   if (param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
   }
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index db470434..a3fa9973 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -36,6 +36,7 @@
 namespace caffe {
 
 SyncedMemory::~SyncedMemory() {
+#ifndef CPU_ONLY
   if (cpu_ptr_ && own_cpu_data_) {
     OCL_CHECK(
         clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
@@ -50,23 +51,30 @@ SyncedMemory::~SyncedMemory() {
   }
 
   clReleaseKernel (oclmem_kernel);
+#endif
 }
 
 //begin: code written/modified by AMD.
+#ifndef CPU_ONLY
 void SyncedMemory::ocl_setup() {
   cl_int err = 0;
   oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
   OCL_CHECK(err);
 }
+#endif
 
 inline void SyncedMemory::to_cpu() {
   switch (head_) {
   case UNINITIALIZED:
+#ifndef CPU_ONLY
     gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
         size_, NULL, NULL);
     cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
         (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_,
         0, NULL, NULL, NULL);
+#else
+    CaffeMallocHost(&cpu_ptr_, size_);
+#endif
     memset(cpu_ptr_, 0, size_);
     head_ = HEAD_AT_CPU;
     own_cpu_data_ = true;
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 25349d26..ab023e70 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -103,6 +103,7 @@ template void col2im_cpu<double>(const double* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     double* data_im);
 
+#ifndef CPU_ONLY
 template <typename Dtype>
 void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
     const int channels, const int height, const int width, const int ksize,
@@ -366,5 +367,5 @@ template void col2im_gpu<float>(const float* data_col, const int col_offset,
 template void col2im_gpu<double>(const double* data_col, const int col_offset,
     const int channels, const int height, const int width, const int psize,
     const int pad, const int stride, double* data_im, const int img_offset);
-
+#endif
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 96ec98b1..d1cfc954 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -62,6 +62,395 @@ void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
       beta, C, N);
 }
 
+template <>
+void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const float alpha, const float* A, const float* x,
+    const float beta, float* y) {
+  cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+}
+
+template <>
+void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
+    const int N, const double alpha, const double* A, const double* x,
+    const double beta, double* y) {
+  cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
+}
+
+template <>
+void caffe_axpy<float>(const int N, const float alpha, const float* X,
+    float* Y) {
+  cblas_saxpy(N, alpha, X, 1, Y, 1);
+}
+
+template <>
+void caffe_axpy<double>(const int N, const double alpha, const double* X,
+    double* Y) {
+  cblas_daxpy(N, alpha, X, 1, Y, 1);
+}
+
+template <>
+void caffe_set(const int N, const float alpha, float* Y) {
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(float) * N);
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
+}
+
+template <>
+void caffe_set(const int N, const double alpha, double* Y) {
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(double) * N);
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
+}
+
+template <>
+void caffe_add_scalar(const int N, const float alpha, float* Y) {
+  for (int i = 0; i < N; ++i) {
+    Y[i] += alpha;
+  }
+}
+
+template <>
+void caffe_add_scalar(const int N, const double alpha, double* Y) {
+  for (int i = 0; i < N; ++i) {
+    Y[i] += alpha;
+  }
+}
+
+template <>
+void caffe_copy<float>(const int N, const float* X, float* Y) {
+  cblas_scopy(N, X, 1, Y, 1);
+}
+
+template <>
+void caffe_copy<double>(const int N, const double* X, double* Y) {
+  cblas_dcopy(N, X, 1, Y, 1);
+}
+
+template <>
+void caffe_scal<float>(const int N, const float alpha, float *X) {
+  cblas_sscal(N, alpha, X, 1);
+}
+
+template <>
+void caffe_scal<double>(const int N, const double alpha, double *X) {
+  cblas_dscal(N, alpha, X, 1);
+}
+
+template <>
+void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
+    const float beta, float* Y) {
+  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
+}
+
+template <>
+void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
+    const double beta, double* Y) {
+  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
+}
+
+template <>
+void caffe_add<float>(const int n, const float* a, const float* b, float* y) {
+  vsAdd(n, a, b, y);
+}
+
+template <>
+void caffe_add<double>(const int n, const double* a, const double* b,
+    double* y) {
+  vdAdd(n, a, b, y);
+}
+
+template <>
+void caffe_sub<float>(const int n, const float* a, const float* b, float* y) {
+  vsSub(n, a, b, y);
+}
+
+template <>
+void caffe_sub<double>(const int n, const double* a, const double* b,
+    double* y) {
+  vdSub(n, a, b, y);
+}
+
+template <>
+void caffe_mul<float>(const int n, const float* a, const float* b, float* y) {
+  vsMul(n, a, b, y);
+}
+
+template <>
+void caffe_mul<double>(const int n, const double* a, const double* b,
+    double* y) {
+  vdMul(n, a, b, y);
+}
+
+template <>
+float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
+    const float* y, const int incy) {
+  return cblas_sdot(n, x, incx, y, incy);
+}
+
+template <>
+double caffe_cpu_strided_dot<double>(const int n, const double* x,
+    const int incx, const double* y, const int incy) {
+  return cblas_ddot(n, x, incx, y, incy);
+}
+
+template <typename Dtype>
+void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
+  if (alpha == 0) {
+    memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    return;
+  }
+  for (int i = 0; i < N; ++i) {
+    Y[i] = alpha;
+  }
+}
+
+template void caffe_set<int>(const int N, const int alpha, int* Y);
+template void caffe_set<float>(const int N, const float alpha, float* Y);
+template void caffe_set<double>(const int N, const double alpha, double* Y);
+
+template <>
+void caffe_log<float>(const int n, const float* a, float* y) {
+  vsLn(n, a, y);
+}
+
+template <>
+void caffe_log<double>(const int n, const double* a, double* y) {
+  vdLn(n, a, y);
+}
+
+template <typename Dtype>
+void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
+  if (X != Y) {
+    if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+      // NOLINT_NEXT_LINE(caffe/alt_fn)
+      //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
+#else
+      NO_GPU;
+#endif
+    } else {
+      memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
+    }
+  }
+}
+
+template void caffe_copy<int>(const int N, const int* X, int* Y);
+template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
+    unsigned int* Y);
+template void caffe_copy<float>(const int N, const float* X, float* Y);
+template void caffe_copy<double>(const int N, const double* X, double* Y);
+
+template <>
+void caffe_abs<float>(const int n, const float* a, float* y) {
+  vsAbs(n, a, y);
+}
+
+template <>
+void caffe_abs<double>(const int n, const double* a, double* y) {
+  vdAbs(n, a, y);
+}
+
+template <>
+void caffe_div<float>(const int n, const float* a, const float* b, float* y) {
+  vsDiv(n, a, b, y);
+}
+
+template <>
+void caffe_div<double>(const int n, const double* a, const double* b,
+    double* y) {
+  vdDiv(n, a, b, y);
+}
+
+template <>
+void caffe_powx<float>(const int n, const float* a, const float b, float* y) {
+  vsPowx(n, a, b, y);
+}
+
+template <>
+void caffe_powx<double>(const int n, const double* a, const double b,
+    double* y) {
+  vdPowx(n, a, b, y);
+}
+
+template <>
+void caffe_sqr<float>(const int n, const float* a, float* y) {
+  vsSqr(n, a, y);
+}
+
+template <>
+void caffe_sqr<double>(const int n, const double* a, double* y) {
+  vdSqr(n, a, y);
+}
+
+template <>
+void caffe_exp<float>(const int n, const float* a, float* y) {
+  vsExp(n, a, y);
+}
+
+template <>
+void caffe_exp<double>(const int n, const double* a, double* y) {
+  vdExp(n, a, y);
+}
+
+unsigned int caffe_rng_rand() {
+  return (*caffe_rng())();
+}
+
+template <typename Dtype>
+Dtype caffe_nextafter(const Dtype b) {
+  return boost::math::nextafter < Dtype
+      > (b, std::numeric_limits < Dtype > ::max());
+}
+template float caffe_nextafter(const float b);
+template double caffe_nextafter(const double b);
+
+template <typename Dtype>
+void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_LE(a, b);
+  boost::uniform_real < Dtype
+      > random_distribution(a, caffe_nextafter<Dtype>(b));
+  boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> > variate_generator(
+      caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
+
+  //LOG(INFO) << "caffe_rng_uniform";
+}
+
+template void caffe_rng_uniform<float>(const int n, const float a, const float b,
+    float* r);
+template void caffe_rng_uniform<double>(const int n, const double a, const double b,
+    double* r);
+
+template <typename Dtype>
+void caffe_rng_gaussian(const int n, const Dtype a, const Dtype sigma,
+    Dtype* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GT(sigma, 0);
+  boost::normal_distribution < Dtype > random_distribution(a, sigma);
+  boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> > variate_generator(
+      caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
+}
+
+template void caffe_rng_gaussian<float>(const int n, const float mu, const float sigma,
+    float* r);
+template void caffe_rng_gaussian<double>(const int n, const double mu,
+    const double sigma, double* r);
+
+template <typename Dtype>
+void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
+  boost::bernoulli_distribution < Dtype > random_distribution(p);
+  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> > variate_generator(
+      caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = variate_generator();
+  }
+}
+
+template void caffe_rng_bernoulli<double>(const int n, const double p, int* r);
+template void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
+
+template <typename Dtype>
+void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
+  CHECK_GE(n, 0);
+  CHECK(r);
+  CHECK_GE(p, 0);
+  CHECK_LE(p, 1);
+  boost::bernoulli_distribution < Dtype > random_distribution(p);
+  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> > variate_generator(
+      caffe_rng(), random_distribution);
+  for (int i = 0; i < n; ++i) {
+    r[i] = static_cast<unsigned int>(variate_generator());
+  }
+}
+
+template void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
+template void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
+//
+template <>
+float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
+  return cblas_sdot(n, x, 1, y, 1);
+}
+
+template <>
+double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
+  return cblas_ddot(n, x, 1, y, 1);
+}
+
+template <>
+int caffe_cpu_hamming_distance<float>(const int n, const float* x,
+    const float* y) {
+  int dist = 0;
+  for (int i = 0; i < n; ++i) {
+    dist += __builtin_popcount(
+        static_cast<uint32_t>(x[i]) ^ static_cast<uint32_t>(y[i]));
+  }
+  return dist;
+}
+
+template <>
+int caffe_cpu_hamming_distance<double>(const int n, const double* x,
+    const double* y) {
+  int dist = 0;
+  for (int i = 0; i < n; ++i) {
+    dist += __builtin_popcountl(
+        static_cast<uint64_t>(x[i]) ^ static_cast<uint64_t>(y[i]));
+  }
+  return dist;
+}
+
+template <>
+float caffe_cpu_asum<float>(const int n, const float* x) {
+  return cblas_sasum(n, x, 1);
+}
+
+template <>
+double caffe_cpu_asum<double>(const int n, const double* x) {
+  return cblas_dasum(n, x, 1);
+}
+
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit);
+INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs);
+
+template <>
+void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
+    float* y) {
+  cblas_scopy(n, x, 1, y, 1);
+  cblas_sscal(n, alpha, y, 1);
+}
+
+template <>
+void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
+    double* y) {
+  cblas_dcopy(n, x, 1, y, 1);
+  cblas_dscal(n, alpha, y, 1);
+}
+
+#ifndef CPU_ONLY
+//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
+//  - (x[index] < Dtype(0)));
+//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
+
 template <>
 void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
@@ -183,20 +572,6 @@ cl_event caffe_gpu_gemm<double>(cl_command_queue *queue,
   return event;
 }
 
-template <>
-void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const float alpha, const float* A, const float* x,
-    const float beta, float* y) {
-  cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
-}
-
-template <>
-void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
-    const int N, const double alpha, const double* A, const double* x,
-    const double beta, double* y) {
-  cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
-}
-
 template <>
 void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
     const int N, const float alpha, const float* A, size_t offA, int lda,
@@ -221,7 +596,6 @@ void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
       clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A,
           offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy,
           incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
-
 }
 
 template <>
@@ -248,18 +622,6 @@ void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
           &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
-template <>
-void caffe_axpy<float>(const int N, const float alpha, const float* X,
-    float* Y) {
-  cblas_saxpy(N, alpha, X, 1, Y, 1);
-}
-
-template <>
-void caffe_axpy<double>(const int N, const double alpha, const double* X,
-    double* Y) {
-  cblas_daxpy(N, alpha, X, 1, Y, 1);
-}
-
 template <>
 void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
     float* Y) {
@@ -277,348 +639,94 @@ void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
 }
 
 template <>
-void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y) {
-}
-
-template <>
-void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y) {
-}
-
-template <>
-void caffe_gpu_abs<float>(const int n, const float* x, float* y) {
-  caffe_gpu_abs_ocl(n, x, y);
-}
-
-template <>
-void caffe_gpu_abs<double>(const int n, const double* x, double* y) {
-  caffe_gpu_abs_ocl(n, x, y);
-}
-
-template <>
-void caffe_set(const int N, const float alpha, float* Y) {
-  if (alpha == 0) {
-    memset(Y, 0, sizeof(float) * N);
-    return;
-  }
-  for (int i = 0; i < N; ++i) {
-    Y[i] = alpha;
-  }
-}
-
-template <>
-void caffe_set(const int N, const double alpha, double* Y) {
-  if (alpha == 0) {
-    memset(Y, 0, sizeof(double) * N);
-    return;
-  }
-  for (int i = 0; i < N; ++i) {
-    Y[i] = alpha;
-  }
-}
-
-template <>
-void caffe_add_scalar(const int N, const float alpha, float* Y) {
-  for (int i = 0; i < N; ++i) {
-    Y[i] += alpha;
-  }
-}
-
-template <>
-void caffe_add_scalar(const int N, const double alpha, double* Y) {
-  for (int i = 0; i < N; ++i) {
-    Y[i] += alpha;
-  }
-}
-
-template <>
-void caffe_copy<float>(const int N, const float* X, float* Y) {
-  cblas_scopy(N, X, 1, Y, 1);
-}
-
-template <>
-void caffe_copy<double>(const int N, const double* X, double* Y) {
-  cblas_dcopy(N, X, 1, Y, 1);
-}
-
-//template <typename Dtype>
-void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) {
-  clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0,
-      NULL, NULL);
-// OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
-}
-/*
- template void caffe_gpu_memcpy<long>(const size_t N, const long* X, long* Y);
- template void caffe_gpu_memcpy<unsigned long>(const size_t N, const unsigned long* X, unsigned long* Y);
- template void caffe_gpu_memcpy<int>(const size_t N, const int* X, int* Y);
- template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
- */
-template <>
-void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y) {
-  OCL_CHECK(
-      clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
-          N, 0, NULL, NULL));
-}
-
-template <>
-void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y) {
-  OCL_CHECK(
-      clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
-          N, 0, NULL, NULL));
-}
-
-template <>
-void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
-  if (X != Y) {
-    CLBLAS_CHECK(
-        clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-            &(amdDevice.CommandQueue), 0, NULL, NULL));
-  }
-}
-
-template <>
-void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
-  if (X != Y) {
-    CLBLAS_CHECK(
-        clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-            &(amdDevice.CommandQueue), 0, NULL, NULL));
-  }
-}
-
-template <>
-void caffe_scal<float>(const int N, const float alpha, float *X) {
-  cblas_sscal(N, alpha, X, 1);
-}
-
-template <>
-void caffe_scal<double>(const int N, const double alpha, double *X) {
-  cblas_dscal(N, alpha, X, 1);
-}
-
-template <>
-void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
-  CLBLAS_CHECK(
-      clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
-          NULL, NULL));
-}
-
-template <>
-void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
-  CLBLAS_CHECK(
-      clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
-          NULL, NULL));
-}
-
-template <>
-void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-  caffe_gpu_scal<float>(N, beta, Y);
-  caffe_gpu_axpy<float>(N, alpha, X, Y);
-}
-
-template <>
-void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-  caffe_gpu_scal<double>(N, beta, Y);
-  caffe_gpu_axpy<double>(N, alpha, X, Y);
-}
-
-template <>
-void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
-    const float beta, float* Y) {
-  cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
-}
-
-template <>
-void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
-    const double beta, double* Y) {
-  cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
-}
-
-template <>
-void caffe_add<float>(const int n, const float* a, const float* b, float* y) {
-  vsAdd(n, a, b, y);
-}
-
-template <>
-void caffe_add<double>(const int n, const double* a, const double* b,
-    double* y) {
-  vdAdd(n, a, b, y);
-}
-
-template <>
-void caffe_sub<float>(const int n, const float* a, const float* b, float* y) {
-  vsSub(n, a, b, y);
-}
-
-template <>
-void caffe_sub<double>(const int n, const double* a, const double* b,
-    double* y) {
-  vdSub(n, a, b, y);
-}
-
-template <>
-void caffe_mul<float>(const int n, const float* a, const float* b, float* y) {
-  vsMul(n, a, b, y);
-}
-
-template <>
-void caffe_mul<double>(const int n, const double* a, const double* b,
-    double* y) {
-  vdMul(n, a, b, y);
-}
-
-template <>
-void caffe_div<float>(const int n, const float* a, const float* b, float* y) {
-  vsDiv(n, a, b, y);
-}
-
-template <>
-void caffe_div<double>(const int n, const double* a, const double* b,
-    double* y) {
-  vdDiv(n, a, b, y);
-}
-
-template <>
-void caffe_powx<float>(const int n, const float* a, const float b, float* y) {
-  vsPowx(n, a, b, y);
-}
-
-template <>
-void caffe_powx<double>(const int n, const double* a, const double b,
-    double* y) {
-  vdPowx(n, a, b, y);
-}
-
-template <>
-void caffe_sqr<float>(const int n, const float* a, float* y) {
-  vsSqr(n, a, y);
-}
-
-template <>
-void caffe_sqr<double>(const int n, const double* a, double* y) {
-  vdSqr(n, a, y);
-}
-
-template <>
-void caffe_exp<float>(const int n, const float* a, float* y) {
-  vsExp(n, a, y);
+void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y) {
 }
 
 template <>
-void caffe_exp<double>(const int n, const double* a, double* y) {
-  vdExp(n, a, y);
+void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y) {
 }
 
-unsigned int caffe_rng_rand() {
-  return (*caffe_rng())();
+template <>
+void caffe_gpu_abs<float>(const int n, const float* x, float* y) {
+  caffe_gpu_abs_ocl(n, x, y);
 }
 
-template <typename Dtype>
-Dtype caffe_nextafter(const Dtype b) {
-  return boost::math::nextafter < Dtype
-      > (b, std::numeric_limits < Dtype > ::max());
+template <>
+void caffe_gpu_abs<double>(const int n, const double* x, double* y) {
+  caffe_gpu_abs_ocl(n, x, y);
 }
 
-template
-float caffe_nextafter(const float b);
 
-template
-double caffe_nextafter(const double b);
-
-template <typename Dtype>
-void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_LE(a, b);
-  boost::uniform_real < Dtype
-      > random_distribution(a, caffe_nextafter<Dtype>(b));
-  boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> > variate_generator(
-      caffe_rng(), random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = variate_generator();
-  }
-
-  //LOG(INFO) << "caffe_rng_uniform";
+//template <typename Dtype>
+void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) {
+  clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0,
+      NULL, NULL);
+// OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
+}
+/*
+ template void caffe_gpu_memcpy<long>(const size_t N, const long* X, long* Y);
+ template void caffe_gpu_memcpy<unsigned long>(const size_t N, const unsigned long* X, unsigned long* Y);
+ template void caffe_gpu_memcpy<int>(const size_t N, const int* X, int* Y);
+ template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
+ */
+template <>
+void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y) {
+  OCL_CHECK(
+      clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
+          N, 0, NULL, NULL));
 }
 
-template
-void caffe_rng_uniform<float>(const int n, const float a, const float b,
-    float* r);
-
-template
-void caffe_rng_uniform<double>(const int n, const double a, const double b,
-    double* r);
+template <>
+void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y) {
+  OCL_CHECK(
+      clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
+          N, 0, NULL, NULL));
+}
 
-template <typename Dtype>
-void caffe_rng_gaussian(const int n, const Dtype a, const Dtype sigma,
-    Dtype* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_GT(sigma, 0);
-  boost::normal_distribution < Dtype > random_distribution(a, sigma);
-  boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> > variate_generator(
-      caffe_rng(), random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = variate_generator();
+template <>
+void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
+  if (X != Y) {
+    CLBLAS_CHECK(
+        clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+            &(amdDevice.CommandQueue), 0, NULL, NULL));
   }
 }
 
-template
-void caffe_rng_gaussian<float>(const int n, const float mu, const float sigma,
-    float* r);
-
-template
-void caffe_rng_gaussian<double>(const int n, const double mu,
-    const double sigma, double* r);
-
-template <typename Dtype>
-void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_GE(p, 0);
-  CHECK_LE(p, 1);
-  boost::bernoulli_distribution < Dtype > random_distribution(p);
-  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> > variate_generator(
-      caffe_rng(), random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = variate_generator();
+template <>
+void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
+  if (X != Y) {
+    CLBLAS_CHECK(
+        clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
+            &(amdDevice.CommandQueue), 0, NULL, NULL));
   }
 }
 
-template
-void caffe_rng_bernoulli<double>(const int n, const double p, int* r);
-
-template
-void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
-
-template <typename Dtype>
-void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
-  CHECK_GE(n, 0);
-  CHECK(r);
-  CHECK_GE(p, 0);
-  CHECK_LE(p, 1);
-  boost::bernoulli_distribution < Dtype > random_distribution(p);
-  boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> > variate_generator(
-      caffe_rng(), random_distribution);
-  for (int i = 0; i < n; ++i) {
-    r[i] = static_cast<unsigned int>(variate_generator());
-  }
+template <>
+void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
+  CLBLAS_CHECK(
+      clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+          NULL, NULL));
 }
 
-template
-void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
+template <>
+void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
+  CLBLAS_CHECK(
+      clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+          NULL, NULL));
+}
 
-template
-void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
-//
 template <>
-float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
-  return cblas_sdot(n, x, 1, y, 1);
+void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
+    const float beta, float* Y) {
+  caffe_gpu_scal<float>(N, beta, Y);
+  caffe_gpu_axpy<float>(N, alpha, X, Y);
 }
 
 template <>
-double caffe_cpu_dot<double>(const int n, const double* x, const double* y) {
-  return cblas_ddot(n, x, 1, y, 1);
+void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
+    const double beta, double* Y) {
+  caffe_gpu_scal<double>(N, beta, Y);
+  caffe_gpu_axpy<double>(N, alpha, X, Y);
 }
 
 template <>
@@ -653,38 +761,6 @@ void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
   clReleaseMemObject(d_out);
 }
 
-template <>
-int caffe_cpu_hamming_distance<float>(const int n, const float* x,
-    const float* y) {
-  int dist = 0;
-  for (int i = 0; i < n; ++i) {
-    dist += __builtin_popcount(
-        static_cast<uint32_t>(x[i]) ^ static_cast<uint32_t>(y[i]));
-  }
-  return dist;
-}
-
-template <>
-int caffe_cpu_hamming_distance<double>(const int n, const double* x,
-    const double* y) {
-  int dist = 0;
-  for (int i = 0; i < n; ++i) {
-    dist += __builtin_popcountl(
-        static_cast<uint64_t>(x[i]) ^ static_cast<uint64_t>(y[i]));
-  }
-  return dist;
-}
-
-template <>
-float caffe_cpu_asum<float>(const int n, const float* x) {
-  return cblas_sasum(n, x, 1);
-}
-
-template <>
-double caffe_cpu_asum<double>(const int n, const double* x) {
-  return cblas_dasum(n, x, 1);
-}
-
 template <>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
   cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
@@ -713,27 +789,7 @@ void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
   clReleaseMemObject(d_y);
 }
 
-//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
-//  - (x[index] < Dtype(0)));
-//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
-
-INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign);
-INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit);
-INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs);
-
-template <>
-void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
-    float* y) {
-  cblas_scopy(n, x, 1, y, 1);
-  cblas_sscal(n, alpha, y, 1);
-}
 
-template <>
-void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
-    double* y) {
-  cblas_dcopy(n, x, 1, y, 1);
-  cblas_dscal(n, alpha, y, 1);
-}
 
 template <>
 void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
@@ -902,48 +958,6 @@ void caffe_gpu_log<double>(const int N, const double* a, double* y) {
   kernel_log(N, a, y);
 }
 
-template <>
-void caffe_log<float>(const int n, const float* a, float* y) {
-  vsLn(n, a, y);
-}
-
-template <>
-void caffe_log<double>(const int n, const double* a, double* y) {
-  vdLn(n, a, y);
-}
-
-template <typename Dtype>
-void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
-  if (X != Y) {
-    if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-      // NOLINT_NEXT_LINE(caffe/alt_fn)
-      //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
-#else
-      NO_GPU;
-#endif
-    } else {
-      memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
-    }
-  }
-}
-
-template void caffe_copy<int>(const int N, const int* X, int* Y);
-template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
-    unsigned int* Y);
-template void caffe_copy<float>(const int N, const float* X, float* Y);
-template void caffe_copy<double>(const int N, const double* X, double* Y);
-
-template <>
-void caffe_abs<float>(const int n, const float* a, float* y) {
-  vsAbs(n, a, y);
-}
-
-template <>
-void caffe_abs<double>(const int n, const double* a, double* y) {
-  vdAbs(n, a, y);
-}
-
 template <>
 void caffe_gpu_add<float>(const int N, const float* a, const float* b,
     float* y) {
@@ -957,32 +971,5 @@ void caffe_gpu_add<double>(const int N, const double* a, const double* b,
   // NOLINT_NEXT_LINE(whitespace/operators)
   kernel_add(N, a, b, y);
 }
-
-template <>
-float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
-    const float* y, const int incy) {
-  return cblas_sdot(n, x, incx, y, incy);
-}
-
-template <>
-double caffe_cpu_strided_dot<double>(const int n, const double* x,
-    const int incx, const double* y, const int incy) {
-  return cblas_ddot(n, x, incx, y, incy);
-}
-
-template <typename Dtype>
-void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
-  if (alpha == 0) {
-    memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
-    return;
-  }
-  for (int i = 0; i < N; ++i) {
-    Y[i] = alpha;
-  }
-}
-
-template void caffe_set<int>(const int N, const int alpha, int* Y);
-template void caffe_set<float>(const int N, const float alpha, float* Y);
-template void caffe_set<double>(const int N, const double alpha, double* Y);
-
+#endif
 }  // namespace caffe
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 6b5045d8..1123e2b3 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -32,6 +32,9 @@
 #include "caffe/common.hpp"
 #include "caffe/util/ocl_util.hpp"
 namespace caffe {
+
+#ifndef CPU_ONLY
+
 template <typename dtype> extern std::string get_dtype_suffix();
 
 template <typename Dtype>
@@ -88,4 +91,5 @@ void eventCallback(cl_event event, cl_int event_status, void* user_data) {
   printf("The kernel's running time is %f s\n", run_time * 1.0e-9);
 }
 
+#endif
 }  // namespace caffe
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index fcc2479e..53ebb751 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -33,6 +33,8 @@
 #include "caffe/util/ocl_util.hpp"
 #include "caffe/util/ocl_wrapper.hpp"
 namespace caffe {
+
+#ifndef CPU_ONLY
 typedef unsigned int uint32_t;
 struct array4x32 {
     uint32_t v[4];
@@ -1929,5 +1931,8 @@ template void ocl_conv<double>(double* bottom_data, double* top_data,
     double* weights, double* bias, int channel_in, int width, int height,
     int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
     int stride, int pad, int batch_sz);
+
+#endif
+
 }  // namespace caffe
 

From fd94a965ea9d478e84e9c63eb66a850b5302b8f0 Mon Sep 17 00:00:00 2001
From: Junli Gu <junli.gu@amd.com>
Date: Sun, 13 Sep 2015 18:36:44 -0700
Subject: [PATCH 102/124] add find path for AMDAPPSDK3.0 and addes
 src/caffe/CMakeLists.txt

---
 cmake/Modules/FindOpenCL.cmake |  2 +-
 src/caffe/CMakeLists.txt       | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 src/caffe/CMakeLists.txt

diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake
index 7c23701d..93abd4f9 100644
--- a/cmake/Modules/FindOpenCL.cmake
+++ b/cmake/Modules/FindOpenCL.cmake
@@ -75,7 +75,7 @@ if( LIB64 )
             $ENV{AMDAPPSDKROOT}/lib
             $ENV{CUDA_PATH}/lib
         DOC "OpenCL dynamic library path"
-        PATH_SUFFIXES x86_64 x64
+        PATH_SUFFIXES x86_64 x64 x86_64/sdk
         PATHS
             /usr/lib
             /usr/local/cuda/lib
diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
new file mode 100644
index 00000000..3e675c20
--- /dev/null
+++ b/src/caffe/CMakeLists.txt
@@ -0,0 +1,34 @@
+# generate protobuf sources
+file(GLOB proto_files proto/*.proto)
+caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files})
+
+# include python files either to force generation
+add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
+set(Caffe_LINKER_LIBS proto ${Caffe_LINKER_LIBS}) # note, crucial to prepend!
+caffe_default_properties(proto)
+
+# --[ Caffe library
+
+# creates 'test_srcs', 'srcs', 'test_cuda', 'cuda' lists
+caffe_pickup_caffe_sources(${PROJECT_SOURCE_DIR})
+
+if(HAVE_CUDA)
+  caffe_cuda_compile(cuda_objs ${cuda})
+  list(APPEND srcs ${cuda_objs} ${cuda})
+endif()
+
+add_library(caffe ${srcs})
+target_link_libraries(caffe proto ${Caffe_LINKER_LIBS})
+caffe_default_properties(caffe)
+
+# ---[ Tests
+ add_subdirectory(test)
+
+# ---[ Install
+install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include)
+install(FILES ${proto_hdrs} DESTINATION include/caffe/proto)
+install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib)
+
+file(WRITE ${PROJECT_BINARY_DIR}/__init__.py)
+list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py)
+install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto)

From 1a2f0222dc7f384800c94d72493f42e62e01b0f8 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Mon, 14 Sep 2015 11:23:08 +0800
Subject: [PATCH 103/124] Add the change in tools/

---
 tools/caffe.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index d7953bdd..79b8e127 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -16,7 +16,9 @@ using caffe::Layer;
 using caffe::shared_ptr;
 using caffe::Timer;
 using caffe::vector;
+#ifndef CPU_ONLY
 using caffe::amdDevice;
+#endif
 
 DEFINE_int32(gpu, -1,
     "Run in GPU mode on given device ID.");
@@ -247,9 +249,9 @@ int time() {
   std::vector<double> backward_time_per_layer(layers.size(), 0.0);
   double forward_time = 0.0;
   double backward_time = 0.0;
-
+#ifndef CPU_ONLY
   clFinish(amdDevice.CommandQueue);
-
+#endif
   for (int j = 0; j < FLAGS_iterations; ++j) {
     Timer iter_timer;
     iter_timer.Start();
@@ -257,9 +259,9 @@ int time() {
     for (int i = 0; i < layers.size(); ++i) {
       timer.Start();
       layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
-
+#ifndef CPU_ONLY
       clFinish(amdDevice.CommandQueue);
-
+#endif
       forward_time_per_layer[i] += timer.MicroSeconds();
     }
     forward_time += forward_timer.MicroSeconds();
@@ -268,9 +270,9 @@ int time() {
       timer.Start();
       layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
                           bottom_vecs[i]);
-      
+#ifndef CPU_ONLY
       clFinish(amdDevice.CommandQueue);
-      
+#endif
       backward_time_per_layer[i] += timer.MicroSeconds();
     }
     backward_time += backward_timer.MicroSeconds();

From a3d5b15e52514d8a21b7dfcd03b4d9c89cc4a00e Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Mon, 14 Sep 2015 11:48:06 +0800
Subject: [PATCH 104/124] Clean test code

---
 src/caffe/test/test_caffe_main.cpp          | 16 ----------------
 src/caffe/test/test_inner_product_layer.cpp |  8 --------
 2 files changed, 24 deletions(-)

diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp
index 278d520c..32643b3b 100644
--- a/src/caffe/test/test_caffe_main.cpp
+++ b/src/caffe/test/test_caffe_main.cpp
@@ -5,16 +5,6 @@
 #include "caffe/common.hpp"
 #include "caffe/test/test_caffe_main.hpp"
 
-namespace caffe {
-#ifndef CPU_ONLY
-  //cudaDeviceProp CAFFE_TEST_CUDA_PROP;
-#endif
-}
-
-#ifndef CPU_ONLY
-//using caffe::CAFFE_TEST_CUDA_PROP;
-
-#endif
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
@@ -22,22 +12,16 @@ int main(int argc, char** argv) {
 #ifndef CPU_ONLY
   // Before starting testing, let's first print out a few cuda defice info.
   int device = 0;
-//  cudaGetDeviceCount(&device);
- // cout << "Cuda number of devices: " << device << endl;
   if (argc > 1) {
     // Use the given device
     device = atoi(argv[1]);
-   // cudaSetDevice(device);
     caffe::amdDevice.Init(device);
     cout << "Setting to use device " << device << endl;
   } else if (OPENCL_TEST_DEVICE >= 0) {
     // Use the device assigned in build configuration; but with a lower priority
     device = OPENCL_TEST_DEVICE;
   }
-//  cudaGetDevice(&device);
   cout << "Current device id: " << device << endl;
- // cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device);
-//  caffe::set_mode(caffe::GPU);
   caffe::amdDevice.Init();
 #endif
   // invoke the test.
diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp
index f0c36b13..7913b49c 100644
--- a/src/caffe/test/test_inner_product_layer.cpp
+++ b/src/caffe/test/test_inner_product_layer.cpp
@@ -57,10 +57,6 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) {
 
 TYPED_TEST(InnerProductLayerTest, TestForward) {
   typedef typename TypeParam::Dtype Dtype;
-  bool IS_VALID_CUDA = false;
-#ifndef CPU_ONLY
- // IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
-#endif
   if (Caffe::mode() == Caffe::CPU ||
       sizeof(Dtype) == 4 ) {
     LayerParameter layer_param;
@@ -87,10 +83,6 @@ TYPED_TEST(InnerProductLayerTest, TestForward) {
 
 TYPED_TEST(InnerProductLayerTest, TestGradient) {
   typedef typename TypeParam::Dtype Dtype;
-  bool IS_VALID_CUDA = false;
-#ifndef CPU_ONLY
- // IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
-#endif
   if (Caffe::mode() == Caffe::CPU ||
       sizeof(Dtype) == 4 ) {
     LayerParameter layer_param;

From aef701ce1d71601c539bbb2d67f8683f6f5d21c7 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Wed, 16 Sep 2015 00:23:50 +0800
Subject: [PATCH 105/124] Passed PReLU layer's unit test

---
 src/caffe/ocl/prelu_layer.cl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl
index 5e8c521f..caff18b9 100644
--- a/src/caffe/ocl/prelu_layer.cl
+++ b/src/caffe/ocl/prelu_layer.cl
@@ -48,13 +48,13 @@ template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLU
 template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor);
 
 template <class T>
-__kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_out, __global T* in_data, const int offset_in, __global T* out_diff) {
+__kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_in_diff, __global T* in_data, const int offset_in_data, __global T* out_diff) {
   int index = get_global_id(0);
   if(index < count) {
-    in_diff += offset_out;
-    out_diff += offset_in;
+    in_diff += offset_in_diff;
+    in_data += offset_in_data;
     out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0);
   }
 }
-template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_out, __global float* in_data, const int offset_in, __global float* out_diff);
-template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_out, __global double* in_data, const int offset_in, __global double* out_diff);
+template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_in_diff, __global float* in_data, const int offset_in_data, __global float* out_diff);
+template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_in_diff, __global double* in_data, const int offset_in_data, __global double* out_diff);

From c1102c3fd0e9df307c2a21a0f9d1864d6a896193 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Wed, 16 Sep 2015 01:10:30 +0800
Subject: [PATCH 106/124] Passed through Slice layer

---
 include/caffe/util/ocl_wrapper.hpp |  6 +++++
 src/caffe/layers/slice_layer.cpp   | 36 ++++++++++++++++++++++++++++--
 src/caffe/ocl/slice_layer.cl       | 28 +++++++++++++++++++++++
 src/caffe/util/ocl_wrapper.cpp     | 35 +++++++++++++++++++++++++++++
 4 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 src/caffe/ocl/slice_layer.cl

diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index a1d11d18..146567ea 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -340,6 +340,12 @@ void MaxForward(const int nthreads, const Dtype* bottom_data_a,
 template <typename Dtype>
 void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx,
     const int* mask, Dtype* bottom_diff);
+
+template <typename Dtype>
+void Slice(const int nthreads, const Dtype* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, Dtype* out_data);
 #endif
 }
 #endif  // CAFFE_UTIL_OCL_UTIL_HPP_
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index 8263b92b..cd19fdb5 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -110,17 +110,49 @@ void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     offset_slice_axis += top_slice_axis;
   }
 }
+
 #ifndef CPU_ONLY
 template <typename Dtype>
 void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
+      const vector<Blob<Dtype>*>& top) {
+  if (top.size() == 1) { return; }
+  int offset_slice_axis = 0;
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+  const bool kForward = true;
+  for (int i = 0; i < top.size(); ++i) {
+    Dtype* top_data = top[i]->mutable_gpu_data();
+    const int top_slice_axis = top[i]->shape(slice_axis_);
+    const int top_slice_size = top_slice_axis * slice_size_;
+    const int nthreads = top_slice_size * num_slices_;
+    Slice<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        (nthreads, bottom_data, kForward, num_slices_, slice_size_,
+        bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data);
+    offset_slice_axis += top_slice_axis;
+  }
 }
 
 template <typename Dtype>
 void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0] || top.size() == 1) { return; }
+  int offset_slice_axis = 0;
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
+  const bool kForward = false;
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+    const int top_slice_axis = top[i]->shape(slice_axis_);
+    const int top_slice_size = top_slice_axis * slice_size_;
+    const int nthreads = top_slice_size * num_slices_;
+    Slice<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        (nthreads, top_diff, kForward, num_slices_, slice_size_,
+        bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff);
+    offset_slice_axis += top_slice_axis;
+  }
 }
 
+
 #else
 STUB_GPU(SliceLayer);
 #endif
diff --git a/src/caffe/ocl/slice_layer.cl b/src/caffe/ocl/slice_layer.cl
new file mode 100644
index 00000000..26c6bb34
--- /dev/null
+++ b/src/caffe/ocl/slice_layer.cl
@@ -0,0 +1,28 @@
+template <class Dtype>
+__kernel void Slice(const int nthreads, __global const Dtype* in_data,
+    const int forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, __global Dtype* out_data) {
+    int index = get_global_id(0);
+    if (index < nthreads) {
+        const int total_slice_size = slice_size * top_slice_axis;
+        const int slice_num = index / total_slice_size;
+        const int slice_index = index % total_slice_size;
+        const int bottom_index = slice_index +
+            (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
+        if (forward) {
+            out_data[index] = in_data[bottom_index];
+        } else {
+            out_data[bottom_index] = in_data[index];
+        }
+  }
+}
+
+template __attribute__ ((mangled_name(Slice_float))) __kernel void Slice(const int nthreads, __global const float* in_data,
+    const int forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, __global float* out_data);
+template __attribute__ ((mangled_name(Slice_double))) __kernel void Slice(const int nthreads, __global const double* in_data,
+    const int forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, __global double* out_data);
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 53ebb751..8c35e719 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -1917,6 +1917,41 @@ template void MaxBackward<float>(const int nthreads, const float* top_diff,
 template void MaxBackward<double>(const int nthreads, const double* top_diff,
     const int blob_idx, const int* mask, double* bottom_diff);
 
+template <typename Dtype>
+void Slice(const int nthreads, const Dtype* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, Dtype* out_data) {
+  std::string kernel_name = "Slice" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+  int k_forward = (forward == true) ? 1 : 0;
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_slices);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &slice_size);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &bottom_slice_axis);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &top_slice_axis);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_slice_axis);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Slice<float>(const int nthreads, const float* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, float* out_data);
+template void Slice<double>(const int nthreads, const double* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, double* out_data);
+
 template <typename Dtype>
 void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias,
     int channel_in, int width, int height, int channel_out, int width_out,

From 8b433d14255da4625a25f3af42ae7b48a79a79d9 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Wed, 16 Sep 2015 14:11:48 +0800
Subject: [PATCH 107/124] Passed through Im2col_layer test

---
 src/caffe/layers/im2col_layer.cpp | 1 +
 src/caffe/util/im2col.cpp         | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index f51fd7cc..38e1fd20 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -103,6 +103,7 @@ void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+
   const Dtype* top_diff = top[0]->gpu_diff();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
   for (int n = 0; n < top[0]->num(); ++n) {
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index ab023e70..89985534 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -196,8 +196,8 @@ template void im2col_gpu<double>(const double* data_im, const int img_offset,
     const int stride_w, double* data_col, const int col_offset);
 
 template <typename Dtype>
-void col2im_gpu(const Dtype* data_col, const int col_offset, const int height,
-    const int width, const int channels, const int patch_h, const int patch_w,
+void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, const int height,
+    const int width,  const int patch_h, const int patch_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     Dtype* data_im, const int img_offset) {
   std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
@@ -233,11 +233,11 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int height,
 }
 
 template void col2im_gpu<float>(const float* data_col, const int col_offset,
-    const int height, const int width, const int channels, const int patch_h,
+    const int channels, const int height, const int width, const int patch_h,
     const int patch_w, const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, float* data_im, const int img_offset);
 template void col2im_gpu<double>(const double* data_col, const int col_offset,
-    const int height, const int width, const int channels, const int patch_h,
+    const int channels, const int height, const int width, const int patch_h,
     const int patch_w, const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im, const int img_offset);
 

From 1ec6e88956d065c504e1eaeff1e5a8ba0c4dcb4d Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Tue, 15 Sep 2015 23:12:05 -0700
Subject: [PATCH 108/124] Update README.md

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index dd3933e6..56786f8f 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,10 @@ We will keep updating the latest performance as we make optimizations. Fury resu
 For more information on how to install, use or contribute to this code base, please visit our wiki page:
  https://github.com/amd/OpenCL-caffe/wiki
 
+#Contributors
+Junli Gu, Yibing Liu, Yuan Gao, Maohua Zhu
+We thank Mauricio Breternitz, Hanjin Chu and Greg Stoner for their technical suggestions and support. 
+
 #Support needed
  As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together.
 

From c5eeb40515d1a9fa8fdaf7d8ab413f0ab7fa97b8 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Wed, 16 Sep 2015 22:23:39 +0800
Subject: [PATCH 109/124] Tested reduction_layer & deconv_layer

---
 include/caffe/util/math_functions.hpp |  19 ++++-
 include/caffe/util/ocl_util.hpp       |   2 +-
 include/caffe/util/ocl_wrapper.hpp    |   3 +
 src/caffe/layers/deconv_layer.cpp     |  18 ++--
 src/caffe/layers/reduction_layer.cpp  |  25 +++---
 src/caffe/ocl/util.cl                 |  21 ++++-
 src/caffe/util/math_functions.cpp     | 117 ++++++++++++++++++++++++--
 src/caffe/util/ocl_util.cpp           |  11 ++-
 src/caffe/util/ocl_wrapper.cpp        |  22 +++++
 9 files changed, 197 insertions(+), 41 deletions(-)

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 7178ea74..4ca1fac0 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -107,7 +107,7 @@ template <typename Dtype>
 void caffe_set(const int N, const Dtype alpha, Dtype *X);
 
 template <typename Dtype>
-void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X);
+void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X, const int offset=0);
 
 inline void caffe_memset(const size_t N, const int alpha, void* X) {
   memset(X, alpha, N);  // NOLINT(caffe/alt_fn)
@@ -127,6 +127,9 @@ void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y);
 template <typename Dtype>
 void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y);
 
+template <typename Dtype>
+void caffe_gpu_copy(const int N, const Dtype* X, const int offx, Dtype* Y, const int offy);
+
 template <typename Dtype>
 void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X);
 
@@ -141,7 +144,7 @@ template <typename Dtype>
 void caffe_scal(const int N, const Dtype alpha, Dtype *X);
 
 template <typename Dtype>
-void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X);
+void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X, const int offx = 0);
 
 template <typename Dtype>
 void caffe_sqr(const int N, const Dtype* a, Dtype* y);
@@ -222,6 +225,9 @@ Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y);
 template <typename Dtype>
 void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out);
 
+template <typename Dtype>
+void caffe_gpu_dot(const int n, const Dtype* x, size_t offx, const Dtype* y, size_t offy, Dtype* out);
+
 template <typename Dtype>
 int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y);
 
@@ -236,6 +242,9 @@ Dtype caffe_cpu_asum(const int n, const Dtype* x);
 template <typename Dtype>
 void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y);
 
+template <typename Dtype>
+void caffe_gpu_asum(const int n, const Dtype* x, size_t offx, Dtype* y);
+
 // the branchless, type-safe version from
 // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c
 template <typename Dtype>
@@ -282,6 +291,9 @@ DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
 template <typename Dtype>
 void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y);
 
+template <typename Dtype>
+void caffe_gpu_sign(const int N, const Dtype *X, const int offx, Dtype *Y, const int offy);
+
 // This returns a nonzero value if the input has its sign bit set.
 // The name sngbit is meant to avoid conflicts with std::signbit in the macro
 using std::signbit;
@@ -301,6 +313,9 @@ void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
 template <typename Dtype>
 void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
 
+template <typename Dtype>
+void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, const int offx, Dtype* y, const int offy);
+
 template <typename Dtype>
 void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
index 00bfa3cf..dcdf1057 100644
--- a/include/caffe/util/ocl_util.hpp
+++ b/include/caffe/util/ocl_util.hpp
@@ -30,7 +30,7 @@
 namespace caffe {
 #ifndef CPU_ONLY
 template <typename Dtype>
-void ocl_memset(Dtype* buffer, const Dtype value, const int count);
+void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset = 0);
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
     const int count);
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 146567ea..61d6162e 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -229,6 +229,9 @@ void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y);
 template <typename Dtype>
 void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y);
 
+template <typename Dtype>
+void caffe_gpu_sign_with_offset_ocl(const int N, const Dtype* X, const int offx,  Dtype * Y, const int offy);
+
 template <typename Dtype>
 void kernel_channel_max(const int num, const int channels,
     const int spatial_dim, const Dtype* data, Dtype* out);
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index 2504f43a..5b0eeb03 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -79,10 +79,11 @@ void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();
     for (int n = 0; n < this->num_; ++n) {
-      this->bottom_offset_ = bottom[i]->offset(n);
-      this->top_offset_ = top[i]->offset(n);
+      this->bottom_offset_ = top[i]->offset(n);
+      this->top_offset_ = bottom[i]->offset(n);
       this->backward_gpu_gemm(bottom_data, weight, top_data);
       if (this->bias_term_) {
+        this->top_offset_ = top[i]->offset(n);
         const Dtype* bias = this->blobs_[1]->gpu_data();
         this->forward_gpu_bias(top_data, bias);
       }
@@ -104,23 +105,20 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
       for (int n = 0; n < this->num_; ++n) {
         this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
-        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
+        this->backward_gpu_bias(bias_diff, top_diff);
       }
     }
     if (this->param_propagate_down_[0] || propagate_down[i]) {
       for (int n = 0; n < this->num_; ++n) {
-        this->top_offset_ = top[i]->offset(n);
-        this->bottom_offset_ = bottom[i]->offset(n);
+        this->top_offset_ = bottom[i]->offset(n);
+        this->bottom_offset_ = top[i]->offset(n);
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(top_diff + top[i]->offset(n),
-              bottom_data + bottom[i]->offset(n), weight_diff);
+          this->weight_gpu_gemm(top_diff, bottom_data, weight_diff);
         }
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
-          this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
+          this->forward_gpu_gemm(top_diff, weight, bottom_diff);
         }
       }
     }
diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp
index 9ec057b1..0358d83a 100644
--- a/src/caffe/layers/reduction_layer.cpp
+++ b/src/caffe/layers/reduction_layer.cpp
@@ -128,29 +128,32 @@ void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
+ //Forward_cpu(bottom, top);
+//return;
   const Dtype* bottom_data = bottom[0]->gpu_data();
   const Dtype* mult_data = NULL;
   if (sum_multiplier_.count() > 0) {
     mult_data = sum_multiplier_.gpu_data();
   }
   Dtype* top_data = top[0]->mutable_cpu_data();
+  size_t bottom_offset = 0;
   for (int i = 0; i < num_; ++i) {
     switch (op_) {
     case ReductionParameter_ReductionOp_SUM:
     case ReductionParameter_ReductionOp_MEAN:
-      caffe_gpu_dot(dim_, mult_data, bottom_data, top_data);
+    caffe_gpu_dot(dim_, mult_data, 0, bottom_data, bottom_offset, top_data);
       break;
     case ReductionParameter_ReductionOp_ASUM:
-      caffe_gpu_asum(dim_, bottom_data, top_data);
+      caffe_gpu_asum(dim_, bottom_data, bottom_offset, top_data);
       break;
     case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data);
+      caffe_gpu_dot(dim_, bottom_data, bottom_offset, bottom_data, bottom_offset, top_data);
       break;
     default:
       LOG(FATAL) << "Unknown reduction op: "
           << ReductionParameter_ReductionOp_Name(op_);
     }
-    bottom_data += dim_;
+    bottom_offset += dim_;
     ++top_data;
   }
   if (coeff_ != Dtype(1)) {
@@ -184,26 +187,28 @@ void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
   const Dtype* top_diff = top[0]->cpu_diff();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  int bottom_data_offset = 0;
+  int bottom_diff_offset = 0;
   for (int i = 0; i < num_; ++i) {
     const Dtype bottom_coeff = (*top_diff) * coeff_;
     switch (op_) {
     case ReductionParameter_ReductionOp_SUM:
     case ReductionParameter_ReductionOp_MEAN:
-      caffe_gpu_set(dim_, bottom_coeff, bottom_diff);
+      caffe_gpu_set(dim_, bottom_coeff, bottom_diff, bottom_diff_offset);
       break;
     case ReductionParameter_ReductionOp_ASUM:
-      caffe_gpu_sign(dim_, bottom_data, bottom_diff);
-      caffe_gpu_scal(dim_, bottom_coeff, bottom_diff);
+      caffe_gpu_sign(dim_, bottom_data, bottom_data_offset, bottom_diff, bottom_diff_offset);
+      caffe_gpu_scal(dim_, bottom_coeff, bottom_diff, bottom_diff_offset);
       break;
     case ReductionParameter_ReductionOp_SUMSQ:
-      caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff);
+      caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_data_offset, bottom_diff, bottom_diff_offset);
       break;
     default:
       LOG(FATAL) << "Unknown reduction op: "
           << ReductionParameter_ReductionOp_Name(op_);
     }
-    bottom_data += dim_;
-    bottom_diff += dim_;
+    bottom_data_offset += dim_;
+    bottom_diff_offset += dim_;
     ++top_diff;
   }
 }
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index 576a6e98..eced284b 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -27,16 +27,17 @@
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 
 template <class T>
-__kernel void OCL_memset(__global T* buffer, const T value, const int size) {
+__kernel void OCL_memset(__global T* buffer, const T value, const int size, const int buf_offset) {
   int gdx = get_global_id(0);
+  buffer += buf_offset;
   if(gdx < size) {
     buffer[gdx] = value;
   }
 }
 
-template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size);
-template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size);
-template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size);
+template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size, const int buf_offset);
+template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size, const int buf_offset);
+template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size, const int buf_offset);
 
 __kernel void OCL_memset2(__global int* buffer, const int value, const int size) {
   int gdx = get_global_id(0);
@@ -56,6 +57,18 @@ __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) {
 template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
 template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y);
 
+template <class T>
+__kernel void caffe_gpu_sign_with_offset(const int N, __global T* X, const int offx,  __global T* Y, const int offy) {
+  X += offx;
+  Y += offy;
+  int gdx = get_global_id(0);
+  if(gdx < N) {
+    Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0));
+  }
+}
+template __attribute__((mangled_name(caffe_gpu_sign_with_offset_float))) __kernel void caffe_gpu_sign_with_offset(const int N, __global float* X, const int offx,  __global float* Y, const int offy);
+template __attribute__((mangled_name(caffe_gpu_sign_with_offset_double))) __kernel void caffe_gpu_sign_with_offset(const int N, __global double* X, const int offx,  __global double* Y, const int offy);
+
 template <class T>
 __kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) {
   int index = get_global_id(0);
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index d1cfc954..aebeb5ed 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -702,16 +702,34 @@ void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
 }
 
 template <>
-void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
+void caffe_gpu_copy<float>(const int N, const float* X, const int offx, float* Y, const int offy) {
+  if (X != Y) {
+    CLBLAS_CHECK(
+        clblasScopy(N, (cl_mem) X, offx, 1, (cl_mem) Y, offy, 1, 1,
+            &(amdDevice.CommandQueue), 0, NULL, NULL));
+  }
+}
+
+template <>
+void caffe_gpu_copy<double>(const int N, const double* X, const int offx, double* Y, const int offy) {
+  if (X != Y) {
+    CLBLAS_CHECK(
+        clblasDcopy(N, (cl_mem) X, offx, 1, (cl_mem) Y, offy, 1, 1,
+            &(amdDevice.CommandQueue), 0, NULL, NULL));
+  }
+}
+
+template <>
+void caffe_gpu_scal<float>(const int N, const float alpha, float *X, const int offx) {
   CLBLAS_CHECK(
-      clblasSscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+      clblasSscal(N, alpha, (cl_mem) X, offx, 1, 1, &(amdDevice.CommandQueue), 0,
           NULL, NULL));
 }
 
 template <>
-void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
+void caffe_gpu_scal<double>(const int N, const double alpha, double *X, const int offx) {
   CLBLAS_CHECK(
-      clblasDscal(N, alpha, (cl_mem) X, 0, 1, 1, &(amdDevice.CommandQueue), 0,
+      clblasDscal(N, alpha, (cl_mem) X, offx, 1, 1, &(amdDevice.CommandQueue), 0,
           NULL, NULL));
 }
 
@@ -761,6 +779,36 @@ void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
   clReleaseMemObject(d_out);
 }
 
+template <>
+void caffe_gpu_dot<float>(const int n, const float* x, size_t offx, const float* y, size_t offy, float* out) {
+  cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (n * sizeof(float)), NULL, NULL);
+  cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (1 * sizeof(float)), NULL, NULL);
+  clblasSdot(n, d_out, 0, (cl_mem) x, offx, 1, (cl_mem) y, offy, 1, scratchBuff, 1,
+      &(amdDevice.CommandQueue), 0, NULL, NULL);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float),
+      out, 0, NULL, NULL);
+  clReleaseMemObject(scratchBuff);
+  clReleaseMemObject(d_out);
+}
+
+template <>
+void caffe_gpu_dot<double>(const int n, const double* x, size_t offx, const double* y, size_t offy, double * out) {
+  //need to pass in scratchBuff
+  //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
+  cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (n * sizeof(double)), NULL, NULL);
+  cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (1 * sizeof(double)), NULL, NULL);
+  clblasDdot(n, d_out, 0, (cl_mem) x, offx, 1, (cl_mem) y, offy, 1, scratchBuff, 1,
+      &(amdDevice.CommandQueue), 0, NULL, NULL);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double),
+      out, 0, NULL, NULL);
+  clReleaseMemObject(scratchBuff);
+  clReleaseMemObject(d_out);
+}
+
 template <>
 void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
   cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
@@ -789,6 +837,33 @@ void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
   clReleaseMemObject(d_y);
 }
 
+template <>
+void caffe_gpu_asum<float>(const int n, const float* x, size_t offx, float* y) {
+  cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (n * sizeof(cl_float)), NULL, NULL);
+  cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (1 * sizeof(cl_float)), NULL, NULL);
+  clblasSasum(n, d_y, 0, (cl_mem) x, offx, 1, scratchBuff, 1,
+      &(amdDevice.CommandQueue), 0, NULL, NULL);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y,
+      0, NULL, NULL);
+  clReleaseMemObject(scratchBuff);
+  clReleaseMemObject(d_y);
+}
+
+template <>
+void caffe_gpu_asum<double>(const int n, const double* x, size_t offx, double* y) {
+  cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (n * sizeof(cl_double)), NULL, NULL);
+  cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+      (1 * sizeof(cl_double)), NULL, NULL);
+  clblasDasum(n, d_y, 0, (cl_mem) x, offx, 1, scratchBuff, 1,
+      &(amdDevice.CommandQueue), 0, NULL, NULL);
+  clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double),
+      y, 0, NULL, NULL);
+  clReleaseMemObject(scratchBuff);
+  clReleaseMemObject(d_y);
+}
 
 
 template <>
@@ -805,18 +880,32 @@ void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
   caffe_gpu_scal(n, alpha, y);
 }
 
+template <>
+void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
+    const int offx, float* y, const int offy) {
+  caffe_gpu_copy(n, x, offx, y, offy);
+  caffe_gpu_scal(n, alpha, y, offy);
+}
+
+template <>
+void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
+    const int offx, double* y, const int offy) {
+  caffe_gpu_copy(n, x, offx, y, offy);
+  caffe_gpu_scal(n, alpha, y, offy);
+}
+
 template <typename Dtype>
 void set_kernel(const int n, const Dtype alpha, Dtype* y) {
 }
 
 template <>
-void caffe_gpu_set<float>(const int N, const float alpha, float* Y) {
-  ocl_memset(Y, alpha, N);
+void caffe_gpu_set<float>(const int N, const float alpha, float* Y, const int offy) {
+  ocl_memset(Y, alpha, N, offy);
 }
 
 template <>
-void caffe_gpu_set<double>(const int N, const double alpha, double* Y) {
-  ocl_memset(Y, alpha, N);
+void caffe_gpu_set<double>(const int N, const double alpha, double* Y, const int offy) {
+  ocl_memset(Y, alpha, N, offy);
 }
 
 template <>
@@ -844,11 +933,23 @@ void caffe_gpu_sign<float>(const int N, const float *X, float *Y) {
   caffe_gpu_sign_ocl(N, X, Y);
 }
 
+
 template <>
 void caffe_gpu_sign<double>(const int N, const double *X, double *Y) {
   caffe_gpu_sign_ocl(N, X, Y);
 }
 
+template <>
+void caffe_gpu_sign<float>(const int N, const float *X, const int offx, float *Y, const int offy) {
+  caffe_gpu_sign_with_offset_ocl(N, X, offx, Y, offy);
+}
+
+
+template <>
+void caffe_gpu_sign<double>(const int N, const double *X, const int offx, double *Y, const int offy) {
+  caffe_gpu_sign_with_offset_ocl(N, X, offx, Y, offy);
+}
+
 template <>
 void caffe_gpu_sub<float>(const int N, const float* a, const float* b,
     float* y) {
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 1123e2b3..0b151e5a 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -38,13 +38,14 @@ namespace caffe {
 template <typename dtype> extern std::string get_dtype_suffix();
 
 template <typename Dtype>
-void ocl_memset(Dtype* buffer, const Dtype value, const int count) {
+void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset) {
   std::string kernel_name = std::string("oclmem") + get_dtype_suffix<Dtype>();
   cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
   cl_int err = 0;
   err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
   err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value);
   err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count);
+  err |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &buf_offset);
   OCL_CHECK(err);
 
   size_t Global_Work_Size[1] = { (size_t) count };
@@ -55,11 +56,9 @@ void ocl_memset(Dtype* buffer, const Dtype value, const int count) {
 
 }
 
-template void ocl_memset<int>(int* buffer, const int value, const int count);
-template void ocl_memset<float>(float* buffer, const float value,
-    const int count);
-template void ocl_memset<double>(double* buffer, const double value,
-    const int count);
+template void ocl_memset<int>(int* buffer, const int value, const int count, const int buf_offset);
+template void ocl_memset<float>(float* buffer, const float value, const int count, const int buf_offset);
+template void ocl_memset<double>(double* buffer, const double value, const int count, const int buf_offset);
 
 void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
     const int count) {
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 8c35e719..7ffadc72 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -1584,6 +1584,28 @@ template void caffe_gpu_sign_ocl<float>(const int N, const float* X, float* Y);
 template void caffe_gpu_sign_ocl<double>(const int N, const double* X,
     double* Y);
 
+template <typename Dtype>
+void caffe_gpu_sign_with_offset_ocl(const int N, const Dtype* X, const int offx,  Dtype * Y, const int offy) {
+  std::string kernel_name = "caffe_gpu_sign_with_offset" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offx);
+  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &Y);
+  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offy);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) N };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+
+template void caffe_gpu_sign_with_offset_ocl<float>(const int N, const float* X, const int offx, float* Y, const int offy);
+template void caffe_gpu_sign_with_offset_ocl<double>(const int N, const double* X, const int offx, double* Y, const int offy);
+
+
 template <typename Dtype>
 void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) {
   std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix<Dtype>();

From 6a46781617065b3be1c5ed7aa404c0898d15d436 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Wed, 16 Sep 2015 10:25:27 -0700
Subject: [PATCH 110/124] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 56786f8f..a1bf49d6 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ For more information on how to install, use or contribute to this code base, ple
 
 #Contributors
 Junli Gu, Yibing Liu, Yuan Gao, Maohua Zhu
+
 We thank Mauricio Breternitz, Hanjin Chu and Greg Stoner for their technical suggestions and support. 
 
 #Support needed

From b0cb051c6944b9e4e5fe8c61c85c646aa1f82d3b Mon Sep 17 00:00:00 2001
From: Junli Gu <junli.gu@amd.com>
Date: Wed, 16 Sep 2015 11:17:14 -0700
Subject: [PATCH 111/124] update gitignore

---
 .gitignore | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 434c7112..5a2ad423 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,5 +92,6 @@ LOG*
 CURRENT
 MANIFEST-*
 
-#log files
-log
+#cmakefiles
+src/caffe/test/CMakeFiles
+src/caffe/CMakeFiles

From e7db7b1cd7939f12af265f45b8b88aaa03672319 Mon Sep 17 00:00:00 2001
From: Junli Gu <junli.gu@amd.com>
Date: Wed, 16 Sep 2015 11:20:53 -0700
Subject: [PATCH 112/124] untrack auto generated src/caffe/test/CMakeFiles

---
 .../CMakeDirectoryInformation.cmake           |   16 -
 ...le_generated_test_im2col_kernel.cu.o.cmake |  296 ---
 ...e_generated_test_im2col_kernel.cu.o.depend |    1 -
 src/caffe/test/CMakeFiles/progress.marks      |    1 -
 .../CMakeFiles/runtest.dir/DependInfo.cmake   |   27 -
 .../test/CMakeFiles/runtest.dir/build.make    |   69 -
 .../CMakeFiles/runtest.dir/cmake_clean.cmake  |    8 -
 .../test/CMakeFiles/runtest.dir/progress.make |    1 -
 .../test.testbin.dir/DependInfo.cmake         |   92 -
 .../CMakeFiles/test.testbin.dir/build.make    | 1623 -----------------
 .../test.testbin.dir/cmake_clean.cmake        |   68 -
 .../CMakeFiles/test.testbin.dir/depend.make   |    2 -
 .../CMakeFiles/test.testbin.dir/flags.make    |    8 -
 .../test/CMakeFiles/test.testbin.dir/link.txt |    1 -
 .../CMakeFiles/test.testbin.dir/progress.make |   60 -
 15 files changed, 2273 deletions(-)
 delete mode 100644 src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake
 delete mode 100644 src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend
 delete mode 100644 src/caffe/test/CMakeFiles/progress.marks
 delete mode 100644 src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake
 delete mode 100644 src/caffe/test/CMakeFiles/runtest.dir/build.make
 delete mode 100644 src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake
 delete mode 100644 src/caffe/test/CMakeFiles/runtest.dir/progress.make
 delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake
 delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/build.make
 delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake
 delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/depend.make
 delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
 delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/link.txt
 delete mode 100644 src/caffe/test/CMakeFiles/test.testbin.dir/progress.make

diff --git a/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake b/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 7bb0014c..00000000
--- a/src/caffe/test/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# Relative path conversion top directories.
-SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
-SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
-
-# Force unix paths in dependencies.
-SET(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake
deleted file mode 100644
index 895d9fca..00000000
--- a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake
+++ /dev/null
@@ -1,296 +0,0 @@
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
-
-# The MIT License
-#
-# License for the specific language governing rights and limitations under
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-
-
-##########################################################################
-# This file runs the nvcc commands to produce the desired output file along with
-# the dependency file needed by CMake to compute dependencies.  In addition the
-# file checks the output of each command and if the command fails it deletes the
-# output files.
-
-# Input variables
-#
-# verbose:BOOL=<>          OFF: Be as quiet as possible (default)
-#                          ON : Describe each step
-#
-# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
-#                               RelWithDebInfo, but it should match one of the
-#                               entries in CUDA_HOST_FLAGS. This is the build
-#                               configuration used when compiling the code.  If
-#                               blank or unspecified Debug is assumed as this is
-#                               what CMake does.
-#
-# generated_file:STRING=<> File to generate.  This argument must be passed in.
-#
-# generated_cubin_file:STRING=<> File to generate.  This argument must be passed
-#                                                   in if build_cubin is true.
-
-if(NOT generated_file)
-  message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(CMAKE_COMMAND "/usr/bin/cmake") # path
-set(source_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_kernel.cu") # path
-set(NVCC_generated_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.NVCC-depend") # path
-set(cmake_dependency_file "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.depend") # path
-set(CUDA_make2cmake "/usr/share/cmake-2.8/Modules/FindCUDA/make2cmake.cmake") # path
-set(CUDA_parse_cubin "/usr/share/cmake-2.8/Modules/FindCUDA/parse_cubin.cmake") # path
-set(build_cubin OFF) # bool
-set(CUDA_HOST_COMPILER "/usr/bin/cc") # bool
-# We won't actually use these variables for now, but we need to set this, in
-# order to force this file to be run again if it changes.
-set(generated_file_path "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//.") # path
-set(generated_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o") # path
-set(generated_cubin_file_internal "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o.cubin.txt") # path
-
-set(CUDA_NVCC_EXECUTABLE "/usr/local/cuda/bin/nvcc") # path
-set(CUDA_NVCC_FLAGS -gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-Xcudafe;--diag_suppress=cc_clobber_ignored;-Xcudafe;--diag_suppress=integer_sign_change;-Xcudafe;--diag_suppress=useless_using_declaration;-Xcudafe;--diag_suppress=set_but_not_used;-Xcompiler;-fPIC;-Xcompiler;-fPIC ;; ) # list
-# Build specific configuration flags
-set(CUDA_NVCC_FLAGS_DEBUG  ; )
-set(CUDA_NVCC_FLAGS_RELEASE  ; )
-set(CUDA_NVCC_FLAGS_MINSIZEREL  ; )
-set(CUDA_NVCC_FLAGS_RELWITHDEBINFO  ; )
-set(nvcc_flags -m64;-DGTEST_USE_OWN_TR1_TUPLE) # list
-set(CUDA_NVCC_INCLUDE_ARGS "-I/usr/local/cuda/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src;-I/usr/local/include;-I/usr/include;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe/include;-I/usr/local/cuda/include;-I/usr/local/include/opencv;-I/usr/include/atlas;-I/home/yugao/caffe-merge-junli/caffe-yb/caffe") # list (needs to be in quotes to handle spaces properly).
-set(format_flag "-c") # string
-
-if(build_cubin AND NOT generated_cubin_file)
-  message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
-endif()
-
-# This is the list of host compilation flags.  It C or CXX should already have
-# been chosen by FindCUDA.cmake.
-set(CMAKE_HOST_FLAGS  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized )
-set(CMAKE_HOST_FLAGS_DEBUG -g)
-set(CMAKE_HOST_FLAGS_RELEASE -O3 -DNDEBUG)
-set(CMAKE_HOST_FLAGS_MINSIZEREL -Os -DNDEBUG)
-set(CMAKE_HOST_FLAGS_RELWITHDEBINFO -O2 -g -DNDEBUG)
-
-# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
-set(nvcc_host_compiler_flags "")
-# If we weren't given a build_configuration, use Debug.
-if(NOT build_configuration)
-  set(build_configuration Debug)
-endif()
-string(TOUPPER "${build_configuration}" build_configuration)
-#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
-foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-  # Extra quotes are added around each flag to help nvcc parse out flags with spaces.
-  set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
-endforeach()
-if (nvcc_host_compiler_flags)
-  set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
-endif()
-#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
-# Add the build specific configuration flags
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
-
-# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
-list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
-list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
-    set(CCBIN -ccbin "${CCBIN}")
-  else()
-    set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
-  endif()
-endif()
-
-# cuda_execute_process - Executes a command with optional command echo and status message.
-#
-#   status  - Status message to print if verbose is true
-#   command - COMMAND argument from the usual execute_process argument structure
-#   ARGN    - Remaining arguments are the command with arguments
-#
-#   CUDA_result - return value from running the command
-#
-# Make this a macro instead of a function, so that things like RESULT_VARIABLE
-# and other return variables are present after executing the process.
-macro(cuda_execute_process status command)
-  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
-    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-  endif()
-  if(verbose)
-    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-    # Now we need to build up our command string.  We are accounting for quotes
-    # and spaces, anything else is left up to the user to fix if they want to
-    # copy and paste a runnable command line.
-    set(cuda_execute_process_string)
-    foreach(arg ${ARGN})
-      # If there are quotes, excape them, so they come through.
-      string(REPLACE "\"" "\\\"" arg ${arg})
-      # Args with spaces need quotes around them to get them to be parsed as a single argument.
-      if(arg MATCHES " ")
-        list(APPEND cuda_execute_process_string "\"${arg}\"")
-      else()
-        list(APPEND cuda_execute_process_string ${arg})
-      endif()
-    endforeach()
-    # Echo the command
-    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
-  endif()
-  # Run the command
-  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
-endmacro()
-
-# Delete the target file
-cuda_execute_process(
-  "Removing ${generated_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-  )
-
-# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
-# for dependency generation and hope for the best.
-set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
-set(CUDA_VERSION 6.5)
-if(CUDA_VERSION VERSION_LESS "3.0")
-  cmake_policy(PUSH)
-  # CMake policy 0007 NEW states that empty list elements are not
-  # ignored.  I'm just setting it to avoid the warning that's printed.
-  cmake_policy(SET CMP0007 NEW)
-  # Note that this will remove all occurances of -G.
-  list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
-  cmake_policy(POP)
-endif()
-
-# nvcc doesn't define __CUDACC__ for some reason when generating dependency files.  This
-# can cause incorrect dependencies when #including files based on this macro which is
-# defined in the generating passes of nvcc invokation.  We will go ahead and manually
-# define this for now until a future version fixes this bug.
-set(CUDACC_DEFINE -D__CUDACC__)
-
-# Generate the dependency file
-cuda_execute_process(
-  "Generating dependency file: ${NVCC_generated_dependency_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  -M
-  ${CUDACC_DEFINE}
-  "${source_file}"
-  -o "${NVCC_generated_dependency_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${depends_CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file.  Don't put the
-# quotes just around the filenames for the input_file and output_file variables.
-# CMake will pass the quotes through and not be able to find the file.
-cuda_execute_process(
-  "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-  COMMAND "${CMAKE_COMMAND}"
-  -D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
-  -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-  -P "${CUDA_make2cmake}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-cuda_execute_process(
-  "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-cuda_execute_process(
-  "Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
-  COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
-  )
-
-if(CUDA_result)
-  message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the code
-cuda_execute_process(
-  "Generating ${generated_file}"
-  COMMAND "${CUDA_NVCC_EXECUTABLE}"
-  "${source_file}"
-  ${format_flag} -o "${generated_file}"
-  ${CCBIN}
-  ${nvcc_flags}
-  ${nvcc_host_compiler_flags}
-  ${CUDA_NVCC_FLAGS}
-  -DNVCC
-  ${CUDA_NVCC_INCLUDE_ARGS}
-  )
-
-if(CUDA_result)
-  # Since nvcc can sometimes leave half done files make sure that we delete the output file.
-  cuda_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-  message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-  if(verbose)
-    message("Generated ${generated_file} successfully.")
-  endif()
-endif()
-
-# Cubin resource report commands.
-if( build_cubin )
-  # Run with -cubin to produce resource usage report.
-  cuda_execute_process(
-    "Generating ${generated_cubin_file}"
-    COMMAND "${CUDA_NVCC_EXECUTABLE}"
-    "${source_file}"
-    ${CUDA_NVCC_FLAGS}
-    ${nvcc_flags}
-    ${CCBIN}
-    ${nvcc_host_compiler_flags}
-    -DNVCC
-    -cubin
-    -o "${generated_cubin_file}"
-    ${CUDA_NVCC_INCLUDE_ARGS}
-    )
-
-  # Execute the parser script.
-  cuda_execute_process(
-    "Executing the parser script"
-    COMMAND  "${CMAKE_COMMAND}"
-    -D "input_file:STRING=${generated_cubin_file}"
-    -P "${CUDA_parse_cubin}"
-    )
-
-endif()
diff --git a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend b/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend
deleted file mode 100644
index 8e3a0be1..00000000
--- a/src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend
+++ /dev/null
@@ -1 +0,0 @@
-#FindCUDA.cmake generated file.  Do not edit.
diff --git a/src/caffe/test/CMakeFiles/progress.marks b/src/caffe/test/CMakeFiles/progress.marks
deleted file mode 100644
index 573541ac..00000000
--- a/src/caffe/test/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-0
diff --git a/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake b/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake
deleted file mode 100644
index f660fadf..00000000
--- a/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake
+++ /dev/null
@@ -1,27 +0,0 @@
-# The set of languages for which implicit dependencies are needed:
-SET(CMAKE_DEPENDS_LANGUAGES
-  )
-# The set of files for implicit dependencies of each language:
-
-# Preprocessor definitions for this target.
-SET(CMAKE_TARGET_DEFINITIONS
-  "GTEST_USE_OWN_TR1_TUPLE"
-  )
-
-# Targets to which this target links.
-SET(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# The include file search paths:
-SET(CMAKE_C_TARGET_INCLUDE_PATH
-  "src"
-  "/usr/local/include"
-  "include"
-  "/usr/local/cuda/include"
-  "/usr/local/include/opencv"
-  "/usr/include/atlas"
-  "."
-  )
-SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/caffe/test/CMakeFiles/runtest.dir/build.make b/src/caffe/test/CMakeFiles/runtest.dir/build.make
deleted file mode 100644
index 7ccc5279..00000000
--- a/src/caffe/test/CMakeFiles/runtest.dir/build.make
+++ /dev/null
@@ -1,69 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Remove some rules from gmake that .SUFFIXES does not remove.
-SUFFIXES =
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E remove -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The program to use to edit the cache.
-CMAKE_EDIT_COMMAND = /usr/bin/ccmake
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# Utility rule file for runtest.
-
-# Include the progress variables for this target.
-include src/caffe/test/CMakeFiles/runtest.dir/progress.make
-
-src/caffe/test/CMakeFiles/runtest:
-	/home/yugao/caffe-merge-junli/caffe-yb/caffe/test/test.testbin --gtest_shuffle
-
-runtest: src/caffe/test/CMakeFiles/runtest
-runtest: src/caffe/test/CMakeFiles/runtest.dir/build.make
-.PHONY : runtest
-
-# Rule to build all files generated by this target.
-src/caffe/test/CMakeFiles/runtest.dir/build: runtest
-.PHONY : src/caffe/test/CMakeFiles/runtest.dir/build
-
-src/caffe/test/CMakeFiles/runtest.dir/clean:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -P CMakeFiles/runtest.dir/cmake_clean.cmake
-.PHONY : src/caffe/test/CMakeFiles/runtest.dir/clean
-
-src/caffe/test/CMakeFiles/runtest.dir/depend:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/runtest.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : src/caffe/test/CMakeFiles/runtest.dir/depend
-
diff --git a/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake b/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake
deleted file mode 100644
index ed560e60..00000000
--- a/src/caffe/test/CMakeFiles/runtest.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(REMOVE_RECURSE
-  "CMakeFiles/runtest"
-)
-
-# Per-language clean rules from dependency scanning.
-FOREACH(lang)
-  INCLUDE(CMakeFiles/runtest.dir/cmake_clean_${lang}.cmake OPTIONAL)
-ENDFOREACH(lang)
diff --git a/src/caffe/test/CMakeFiles/runtest.dir/progress.make b/src/caffe/test/CMakeFiles/runtest.dir/progress.make
deleted file mode 100644
index 8b137891..00000000
--- a/src/caffe/test/CMakeFiles/runtest.dir/progress.make
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake b/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake
deleted file mode 100644
index d4748b21..00000000
--- a/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake
+++ /dev/null
@@ -1,92 +0,0 @@
-# The set of languages for which implicit dependencies are needed:
-SET(CMAKE_DEPENDS_LANGUAGES
-  "CXX"
-  )
-# The set of files for implicit dependencies of each language:
-SET(CMAKE_DEPENDS_CHECK_CXX
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o"
-  )
-SET(CMAKE_CXX_COMPILER_ID "GNU")
-
-# Preprocessor definitions for this target.
-SET(CMAKE_TARGET_DEFINITIONS
-  "GTEST_USE_OWN_TR1_TUPLE"
-  )
-
-# Targets to which this target links.
-SET(CMAKE_TARGET_LINKED_INFO_FILES
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/caffe.dir/DependInfo.cmake"
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/proto.dir/DependInfo.cmake"
-  )
-
-# The include file search paths:
-SET(CMAKE_C_TARGET_INCLUDE_PATH
-  "src"
-  "/usr/local/include"
-  "include"
-  "/usr/local/cuda/include"
-  "/usr/local/include/opencv"
-  "/usr/include/atlas"
-  "."
-  )
-SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/build.make b/src/caffe/test/CMakeFiles/test.testbin.dir/build.make
deleted file mode 100644
index c67def36..00000000
--- a/src/caffe/test/CMakeFiles/test.testbin.dir/build.make
+++ /dev/null
@@ -1,1623 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Remove some rules from gmake that .SUFFIXES does not remove.
-SUFFIXES =
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E remove -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The program to use to edit the cache.
-CMAKE_EDIT_COMMAND = /usr/bin/ccmake
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# Include any dependencies generated for this target.
-include src/caffe/test/CMakeFiles/test.testbin.dir/depend.make
-
-# Include the progress variables for this target.
-include src/caffe/test/CMakeFiles/test.testbin.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-
-src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.depend
-src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/CMakeFiles/cuda_compile.dir/cuda_compile_generated_test_im2col_kernel.cu.o.cmake
-src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o: src/caffe/test/test_im2col_kernel.cu
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --blue --bold "Building NVCC (Device) object src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir && /usr/bin/cmake -E make_directory /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//.
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir && /usr/bin/cmake -D verbose:BOOL=$(VERBOSE) -D build_configuration:STRING=Release -D generated_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o -D generated_cubin_file:STRING=/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//./cuda_compile_generated_test_im2col_kernel.cu.o.cubin.txt -P /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir//cuda_compile_generated_test_im2col_kernel.cu.o.cmake
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o: src/caffe/test/test_spp_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_2)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp > CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_spp_layer.cpp -o CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o: src/caffe/test/test_filler.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_3)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_filler.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_filler.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp > CMakeFiles/test.testbin.dir/test_filler.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_filler.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filler.cpp -o CMakeFiles/test.testbin.dir/test_filler.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o: src/caffe/test/test_im2col_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_4)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp > CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_im2col_layer.cpp -o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o: src/caffe/test/test_common.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_5)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_common.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_common.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp > CMakeFiles/test.testbin.dir/test_common.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_common.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_common.cpp -o CMakeFiles/test.testbin.dir/test_common.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o: src/caffe/test/test_infogain_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_6)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_infogain_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o: src/caffe/test/test_math_functions.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_7)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_math_functions.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_math_functions.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp > CMakeFiles/test.testbin.dir/test_math_functions.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_math_functions.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_math_functions.cpp -o CMakeFiles/test.testbin.dir/test_math_functions.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o: src/caffe/test/test_euclidean_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_8)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_euclidean_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o: src/caffe/test/test_split_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_9)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_split_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_split_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp > CMakeFiles/test.testbin.dir/test_split_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_split_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_split_layer.cpp -o CMakeFiles/test.testbin.dir/test_split_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o: src/caffe/test/test_reshape_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_10)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp > CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reshape_layer.cpp -o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o: src/caffe/test/test_random_number_generator.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_11)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp > CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_random_number_generator.cpp -o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o: src/caffe/test/test_lrn_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_12)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp > CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_lrn_layer.cpp -o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o: src/caffe/test/test_gradient_based_solver.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_13)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp > CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_gradient_based_solver.cpp -o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o: src/caffe/test/test_upgrade_proto.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_14)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp > CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_upgrade_proto.cpp -o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o: src/caffe/test/test_io.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_15)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_io.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_io.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp > CMakeFiles/test.testbin.dir/test_io.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_io.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_io.cpp -o CMakeFiles/test.testbin.dir/test_io.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o: src/caffe/test/test_accuracy_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_16)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp > CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_accuracy_layer.cpp -o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o: src/caffe/test/test_caffe_main.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_17)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp > CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_caffe_main.cpp -o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o: src/caffe/test/test_net.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_18)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_net.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_net.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp > CMakeFiles/test.testbin.dir/test_net.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_net.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_net.cpp -o CMakeFiles/test.testbin.dir/test_net.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o: src/caffe/test/test_filter_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_19)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp > CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_filter_layer.cpp -o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o: src/caffe/test/test_power_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_20)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_power_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_power_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp > CMakeFiles/test.testbin.dir/test_power_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_power_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_power_layer.cpp -o CMakeFiles/test.testbin.dir/test_power_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o: src/caffe/test/test_softmax_with_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_21)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_with_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o: src/caffe/test/test_argmax_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_22)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp > CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_argmax_layer.cpp -o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o: src/caffe/test/test_solver.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_23)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_solver.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_solver.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp > CMakeFiles/test.testbin.dir/test_solver.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_solver.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_solver.cpp -o CMakeFiles/test.testbin.dir/test_solver.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o: src/caffe/test/test_blob.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_24)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_blob.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_blob.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp > CMakeFiles/test.testbin.dir/test_blob.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_blob.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_blob.cpp -o CMakeFiles/test.testbin.dir/test_blob.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o: src/caffe/test/test_benchmark.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_25)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_benchmark.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_benchmark.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp > CMakeFiles/test.testbin.dir/test_benchmark.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_benchmark.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_benchmark.cpp -o CMakeFiles/test.testbin.dir/test_benchmark.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o: src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_26)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o: src/caffe/test/test_multinomial_logistic_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_27)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_multinomial_logistic_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o: src/caffe/test/test_util_blas.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_28)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_util_blas.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_util_blas.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp > CMakeFiles/test.testbin.dir/test_util_blas.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_util_blas.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_util_blas.cpp -o CMakeFiles/test.testbin.dir/test_util_blas.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o: src/caffe/test/test_internal_thread.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_29)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp > CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_internal_thread.cpp -o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o: src/caffe/test/test_reduction_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_30)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp > CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_reduction_layer.cpp -o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o: src/caffe/test/test_contrastive_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_31)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_contrastive_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o: src/caffe/test/test_eltwise_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_32)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp > CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_eltwise_layer.cpp -o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o: src/caffe/test/test_maxpool_dropout_layers.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_33)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp > CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_maxpool_dropout_layers.cpp -o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o: src/caffe/test/test_threshold_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_34)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp > CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_threshold_layer.cpp -o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o: src/caffe/test/test_pooling_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_35)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp > CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_pooling_layer.cpp -o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o: src/caffe/test/test_softmax_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_36)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp > CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_softmax_layer.cpp -o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o: src/caffe/test/test_inner_product_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_37)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp > CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_inner_product_layer.cpp -o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o: src/caffe/test/test_flatten_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_38)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp > CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_flatten_layer.cpp -o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o: src/caffe/test/test_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_39)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp > CMakeFiles/test.testbin.dir/test_data_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_data_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o: src/caffe/test/test_syncedmem.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_40)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp > CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_syncedmem.cpp -o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o: src/caffe/test/test_hdf5data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_41)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp > CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5data_layer.cpp -o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o: src/caffe/test/test_deconvolution_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_42)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp > CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_deconvolution_layer.cpp -o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o: src/caffe/test/test_neuron_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_43)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp > CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_neuron_layer.cpp -o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o: src/caffe/test/test_concat_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_44)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp > CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_concat_layer.cpp -o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o: src/caffe/test/test_protobuf.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_45)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_protobuf.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_protobuf.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp > CMakeFiles/test.testbin.dir/test_protobuf.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_protobuf.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_protobuf.cpp -o CMakeFiles/test.testbin.dir/test_protobuf.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o: src/caffe/test/test_hdf5_output_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_46)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp > CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hdf5_output_layer.cpp -o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o: src/caffe/test/test_memory_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_47)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp > CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_memory_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o: src/caffe/test/test_tanh_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_48)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp > CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_tanh_layer.cpp -o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o: src/caffe/test/test_stochastic_pooling.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_49)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp > CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_stochastic_pooling.cpp -o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o: src/caffe/test/test_dummy_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_50)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp > CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_dummy_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o: src/caffe/test/test_layer_factory.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_51)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp > CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_layer_factory.cpp -o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o: src/caffe/test/test_db.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_52)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_db.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_db.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp > CMakeFiles/test.testbin.dir/test_db.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_db.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_db.cpp -o CMakeFiles/test.testbin.dir/test_db.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o: src/caffe/test/test_mvn_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_53)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp > CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_mvn_layer.cpp -o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o: src/caffe/test/test_convolution_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_54)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp > CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_convolution_layer.cpp -o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o: src/caffe/test/test_slice_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_55)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp > CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_slice_layer.cpp -o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o: src/caffe/test/test_hinge_loss_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_56)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp > CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_hinge_loss_layer.cpp -o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o: src/caffe/test/test_image_data_layer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_57)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp > CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_image_data_layer.cpp -o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o: src/caffe/test/test_platform.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_58)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_platform.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_platform.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp > CMakeFiles/test.testbin.dir/test_platform.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_platform.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_platform.cpp -o CMakeFiles/test.testbin.dir/test_platform.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o: src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o: src/caffe/test/test_data_transformer.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_59)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp > CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/test_data_transformer.cpp -o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires:
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires
-	$(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides.build
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides
-
-src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.provides.build: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o
-
-# Object files for target test.testbin
-test_testbin_OBJECTS = \
-"CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_filler.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_common.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_math_functions.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_split_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_io.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_net.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_power_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_solver.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_blob.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_benchmark.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_util_blas.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_data_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_protobuf.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_db.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_platform.cpp.o" \
-"CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o"
-
-# External object files for target test.testbin
-test_testbin_EXTERNAL_OBJECTS = \
-"/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o"
-
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o
-test/test.testbin: src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/build.make
-test/test.testbin: lib/libgtest.a
-test/test.testbin: lib/libcaffe.so
-test/test.testbin: lib/libproto.a
-test/test.testbin: /usr/local/lib/libboost_system.so
-test/test.testbin: /usr/local/lib/libboost_thread.so
-test/test.testbin: /usr/lib/x86_64-linux-gnu/libpthread.so
-test/test.testbin: /usr/local/lib/libglog.so
-test/test.testbin: /usr/local/lib/libgflags.a
-test/test.testbin: /usr/lib/x86_64-linux-gnu/libprotobuf.so
-test/test.testbin: /usr/local/lib/libglog.so
-test/test.testbin: /usr/local/lib/libgflags.a
-test/test.testbin: /usr/lib/x86_64-linux-gnu/libprotobuf.so
-test/test.testbin: /usr/lib/x86_64-linux-gnu/libhdf5_hl.so
-test/test.testbin: /usr/lib/x86_64-linux-gnu/libhdf5.so
-test/test.testbin: /usr/local/lib/liblmdb.so
-test/test.testbin: /usr/lib/x86_64-linux-gnu/libleveldb.so
-test/test.testbin: /usr/lib/libsnappy.so
-test/test.testbin: /usr/local/cuda/lib64/libcudart.so
-test/test.testbin: /usr/local/cuda/lib64/libcurand.so
-test/test.testbin: /usr/local/cuda/lib64/libcublas.so
-test/test.testbin: /usr/local/lib/libopencv_highgui.so.2.4.10
-test/test.testbin: /usr/local/lib/libopencv_imgproc.so.2.4.10
-test/test.testbin: /usr/local/lib/libopencv_core.so.2.4.10
-test/test.testbin: /usr/lib/liblapack_atlas.so
-test/test.testbin: /usr/lib/libcblas.so
-test/test.testbin: /usr/lib/libatlas.so
-test/test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX executable ../../../test/test.testbin"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/test.testbin.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-src/caffe/test/CMakeFiles/test.testbin.dir/build: test/test.testbin
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/build
-
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o.requires
-src/caffe/test/CMakeFiles/test.testbin.dir/requires: src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o.requires
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/requires
-
-src/caffe/test/CMakeFiles/test.testbin.dir/clean:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test && $(CMAKE_COMMAND) -P CMakeFiles/test.testbin.dir/cmake_clean.cmake
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/clean
-
-src/caffe/test/CMakeFiles/test.testbin.dir/depend: src/caffe/test/CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/test.testbin.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/depend
-
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake b/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake
deleted file mode 100644
index 3270b673..00000000
--- a/src/caffe/test/CMakeFiles/test.testbin.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,68 +0,0 @@
-FILE(REMOVE_RECURSE
-  "CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o"
-  "CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_filler.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_common.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_math_functions.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_split_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_io.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_net.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_power_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_solver.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_blob.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_benchmark.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_util_blas.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_data_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_protobuf.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_db.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_platform.cpp.o"
-  "CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o"
-  "../../../test/test.testbin.pdb"
-  "../../../test/test.testbin"
-)
-
-# Per-language clean rules from dependency scanning.
-FOREACH(lang CXX)
-  INCLUDE(CMakeFiles/test.testbin.dir/cmake_clean_${lang}.cmake OPTIONAL)
-ENDFOREACH(lang)
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make b/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make
deleted file mode 100644
index e3607644..00000000
--- a/src/caffe/test/CMakeFiles/test.testbin.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for test.testbin.
-# This may be replaced when dependencies are built.
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make b/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
deleted file mode 100644
index 8b4ef992..00000000
--- a/src/caffe/test/CMakeFiles/test.testbin.dir/flags.make
+++ /dev/null
@@ -1,8 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# compile CXX with /usr/bin/c++
-CXX_FLAGS =  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe   
-
-CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE
-
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt b/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt
deleted file mode 100644
index 35426fa4..00000000
--- a/src/caffe/test/CMakeFiles/test.testbin.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/usr/bin/c++    -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG    CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o CMakeFiles/test.testbin.dir/test_filler.cpp.o CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o CMakeFiles/test.testbin.dir/test_common.cpp.o CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_math_functions.cpp.o CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_split_layer.cpp.o CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o CMakeFiles/test.testbin.dir/test_io.cpp.o CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o CMakeFiles/test.testbin.dir/test_net.cpp.o CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o CMakeFiles/test.testbin.dir/test_power_layer.cpp.o CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o CMakeFiles/test.testbin.dir/test_solver.cpp.o CMakeFiles/test.testbin.dir/test_blob.cpp.o CMakeFiles/test.testbin.dir/test_benchmark.cpp.o CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_util_blas.cpp.o CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o CMakeFiles/test.testbin.dir/test_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o CMakeFiles/test.testbin.dir/test_protobuf.cpp.o CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o CMakeFiles/test.testbin.dir/test_db.cpp.o CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o CMakeFiles/test.testbin.dir/test_platform.cpp.o CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o CMakeFiles/cuda_compile.dir/./cuda_compile_generated_test_im2col_kernel.cu.o  -o ../../../test/test.testbin  -L/usr/local/cuda/lib64  -L/usr/local/lib -rdynamic ../../../lib/libgtest.a -Wl,--whole-archive ../../../lib/libcaffe.so -Wl,--no-whole-archive ../../../lib/libproto.a /usr/local/lib/libboost_system.so /usr/local/lib/libboost_thread.so -lpthread -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lpthread /usr/local/lib/libglog.so /usr/local/lib/libgflags.a -lprotobuf -lhdf5_hl -lhdf5 -llmdb -lleveldb -lsnappy /usr/local/cuda/lib64/libcudart.so /usr/local/cuda/lib64/libcurand.so /usr/local/cuda/lib64/libcublas.so /usr/local/lib/libopencv_highgui.so.2.4.10 /usr/local/lib/libopencv_imgproc.so.2.4.10 /usr/local/lib/libopencv_core.so.2.4.10 -llapack_atlas -lcblas -latlas -Wl,-rpath,/usr/local/cuda/lib64:/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib:/usr/local/lib 
diff --git a/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make b/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make
deleted file mode 100644
index 9de70a55..00000000
--- a/src/caffe/test/CMakeFiles/test.testbin.dir/progress.make
+++ /dev/null
@@ -1,60 +0,0 @@
-CMAKE_PROGRESS_1 = 
-CMAKE_PROGRESS_2 = 69
-CMAKE_PROGRESS_3 = 
-CMAKE_PROGRESS_4 = 70
-CMAKE_PROGRESS_5 = 
-CMAKE_PROGRESS_6 = 71
-CMAKE_PROGRESS_7 = 
-CMAKE_PROGRESS_8 = 72
-CMAKE_PROGRESS_9 = 
-CMAKE_PROGRESS_10 = 73
-CMAKE_PROGRESS_11 = 
-CMAKE_PROGRESS_12 = 74
-CMAKE_PROGRESS_13 = 
-CMAKE_PROGRESS_14 = 75
-CMAKE_PROGRESS_15 = 
-CMAKE_PROGRESS_16 = 76
-CMAKE_PROGRESS_17 = 
-CMAKE_PROGRESS_18 = 77
-CMAKE_PROGRESS_19 = 
-CMAKE_PROGRESS_20 = 78
-CMAKE_PROGRESS_21 = 
-CMAKE_PROGRESS_22 = 79
-CMAKE_PROGRESS_23 = 
-CMAKE_PROGRESS_24 = 80
-CMAKE_PROGRESS_25 = 
-CMAKE_PROGRESS_26 = 81
-CMAKE_PROGRESS_27 = 
-CMAKE_PROGRESS_28 = 82
-CMAKE_PROGRESS_29 = 
-CMAKE_PROGRESS_30 = 83
-CMAKE_PROGRESS_31 = 
-CMAKE_PROGRESS_32 = 84
-CMAKE_PROGRESS_33 = 
-CMAKE_PROGRESS_34 = 85
-CMAKE_PROGRESS_35 = 
-CMAKE_PROGRESS_36 = 86
-CMAKE_PROGRESS_37 = 
-CMAKE_PROGRESS_38 = 87
-CMAKE_PROGRESS_39 = 
-CMAKE_PROGRESS_40 = 88
-CMAKE_PROGRESS_41 = 
-CMAKE_PROGRESS_42 = 89
-CMAKE_PROGRESS_43 = 
-CMAKE_PROGRESS_44 = 90
-CMAKE_PROGRESS_45 = 
-CMAKE_PROGRESS_46 = 91
-CMAKE_PROGRESS_47 = 
-CMAKE_PROGRESS_48 = 92
-CMAKE_PROGRESS_49 = 
-CMAKE_PROGRESS_50 = 93
-CMAKE_PROGRESS_51 = 
-CMAKE_PROGRESS_52 = 94
-CMAKE_PROGRESS_53 = 
-CMAKE_PROGRESS_54 = 95
-CMAKE_PROGRESS_55 = 
-CMAKE_PROGRESS_56 = 96
-CMAKE_PROGRESS_57 = 
-CMAKE_PROGRESS_58 = 97
-CMAKE_PROGRESS_59 = 
-

From df57731ace57e2f46ec5b16942694a3f23489f82 Mon Sep 17 00:00:00 2001
From: Junli Gu <junli.gu@amd.com>
Date: Wed, 16 Sep 2015 16:44:49 -0700
Subject: [PATCH 113/124] removed unnecessary cmake files

---
 src/caffe/.OCL_kernel.cl.swo                  |  Bin 98304 -> 0 bytes
 src/caffe/Makefile                            | 2279 -----------------
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/gtest.dir/DependInfo.cmake     |   32 -
 src/gtest/CMakeFiles/gtest.dir/build.make     |  106 -
 .../CMakeFiles/gtest.dir/cmake_clean.cmake    |   10 -
 .../gtest.dir/cmake_clean_target.cmake        |    3 -
 src/gtest/CMakeFiles/gtest.dir/depend.make    |    2 -
 src/gtest/CMakeFiles/gtest.dir/flags.make     |    8 -
 src/gtest/CMakeFiles/gtest.dir/link.txt       |    2 -
 src/gtest/CMakeFiles/gtest.dir/progress.make  |    2 -
 src/gtest/CMakeFiles/progress.marks           |    1 -
 src/gtest/Makefile                            |  212 --
 src/gtest/cmake_install.cmake                 |   34 -
 14 files changed, 2707 deletions(-)
 delete mode 100644 src/caffe/.OCL_kernel.cl.swo
 delete mode 100644 src/caffe/Makefile
 delete mode 100644 src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake
 delete mode 100644 src/gtest/CMakeFiles/gtest.dir/build.make
 delete mode 100644 src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake
 delete mode 100644 src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake
 delete mode 100644 src/gtest/CMakeFiles/gtest.dir/depend.make
 delete mode 100644 src/gtest/CMakeFiles/gtest.dir/flags.make
 delete mode 100644 src/gtest/CMakeFiles/gtest.dir/link.txt
 delete mode 100644 src/gtest/CMakeFiles/gtest.dir/progress.make
 delete mode 100644 src/gtest/CMakeFiles/progress.marks
 delete mode 100644 src/gtest/Makefile
 delete mode 100644 src/gtest/cmake_install.cmake

diff --git a/src/caffe/.OCL_kernel.cl.swo b/src/caffe/.OCL_kernel.cl.swo
deleted file mode 100644
index 62349bbdafdf55d217551523bd623664e7967190..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 98304
zcmeI534B~tz4!yDSQHci;UU5W1(P-<&C&(h0Hw5umKLF`v5wPZk_=5UAv5Vh1)mBo
z$mWLno(pdHM8O4pATCc)d?<>jsJI}DiVKMPN7N@E|L^bY_s+eOnWSwHl25;#nS0MY
z_nhC^e!t&2_uO^!PC4%QXyKt<P5eE(sp-+%t~>eIXI;JH<*#aL+B`mxFUj!<{+1qp
z%-j=~pR|1GiOXJGE9ki@!nWRge}7@_aA9nqF!#*yk)fhFvU#2U(pDbpHQ!b*TbWy5
z7#k@J9ojooE9#kZGZL7Qz=RUm_2kZu_IVQQ1qU4%?eUzGC%j@Nz>EZDBrqd^841it
zU`7Hn5}1*|j09#R@WhcoW%RjCuO>TtnJisoK6glb?r45rXg)7aoPSE<{Jtgv8_!b{
z=dVbdf0{Xesrl?MH?-I9Y>vOoeBO{a|8#SFiTT`PC;i1<Kg%5NHlM2#=XWv34>O+^
zCeA-2asI}{`DdE*3(e<06X$m|$B#Cj^Nql4`gSwN+s)_3#QA5L<42j#|4N)Uw`=M#
z=ac1wYe)U<X+D>k0<rght~ox}e4d&(zqdJV(|3O2{O;zwwJ1N5IR6}T{s{B=kHq;s
z%<<LcbCIbKHhnhT3(Wa9B+lRWnErdSIiIY5zfYV`*1zAG^Ou<GllAYn=KKlfe6s%i
z#++Yl&L`{Nz2>~FpUL|7Yjb{)xjtF{ew8?%tbe~W=dFB__3s{YeyNE+S^w@f=aqAS
zFv<FNmpOl=xjtF{?lk95G3S4qsGq+u=WYEsGEx3_nDaJ$??{~gxjDbw#Q)>O`Jb8d
z)&cqaME&@wIdAJzf8xA}cIIP70y7erk-&@uW+X5pff)(RNMJ?+PiP6``<j|W6+Thy
zX^cc`-~S$i^G9$s{0F=qUI8zGPB;MmhSB;<_%3`D-UhFNK{yIpVITMt#_!MJM)(Z8
zA1;K=&=0F&G3)`4VMyNxUxQD=yWl){IlLIU;F<7u4D!3-JMa;B3!DQd!%~<H&xHpt
z?0*X1fcL=#P=;5+@o+f22=;_WaT5Fvz6w{tyWuQY3oD=lc7ea*Y`6!mhb!O`*a9cP
zkuV2#f<NN4xE;O@?}0Z$1y;f$I1qM&hbe&H!VlpF_&B^1O3(wx!U6CM_zQN=U2q+I
z7%qiDSOeYA0z1GTu)lr+pN99s>tPID0t?_E_y<mdd*QqAMfez82CsurI31S5^I;Y|
zL>>7Kd>%ds7eha&y5m~OtXZcWx}m3YZS;!nNWRTmyST|6>yAz=RaWLYJDPQ%*62ij
zZ0gFj=jOH8aP!tibM&>Ha_u=$u|H~FHC7rQ=_^M^L>=>6qAfm3bDNI3ZZj{}?rzf&
zzl}<EdrGo77EgBjT9@p3smX3nO?H=0c2_hvB3g2}z-AhO@pFB-r^{tTCy+x*3^si_
zl+Uh~7NOGb+Uf7=Gv}|amAQqf!cw=HR2eT!6;^r~2g&wT$tRm*@iOW3WwIbO*)bVb
zRn1yrSdg0RbZK`?UfT0hlRbH9&reNuy0qJUvKy5)O?RudsIJsxr%QXDPquGS^Q&5v
zFM8Lad`;5L&^8}cuU(5$U01UEX+O;0wb2&UnJTQwmvLvRuu{u7Om?GX+>x4WFIF!G
z-`Uq{Q5~ttPA%j1MU$6ydup;LFYN}#sP=_E*^Nqj8po*i1wPrnMRirRDBpZsi}GT1
zEy`C0*P_(mR@I_{0o_+6e~(&KX&T3<_8{4fmhm)>QSE^q(x|kjag1u8f<`cnV^n*f
z5%?C>S=FL^^KmW8Hy_ucyl&`PRC;G04CsxP@pO(+L9!cdQPVj_1$s!M(w@dKO6f6T
z|2Jd1eM%<Au>bA+@nhKc_ru%aEI1vO!G7=nHvRYDoAAHz4QPXRV86c|UISv!?*$KI
ztKSCShws4&@NMk!o8dY*9Tvc|;kVe}pMiHk1<r&)7=RnGyCXONo(UIWU%wFsVL2QH
z2f^R5r>}s^;au1Z1JD7_hR3j>Z-(pP^)L=gp&kB--TVN20X`0!U;w&ccX$Tu0#ApX
z;V5kA3U+ZBUJ3ibZ?T8J0b(D&23`rr!v63hWPB@J3)jG>LFC>{y7q+oNyF_RK7dcc
zd*Ds50eazhXga4Uii9XCjaPb#BYlNUVhauDH*LrdEmkM=>Bt<@C!727m3&Y0NHja@
z9n6pLL`6B86ZN+6yF8k&6!Sy5zTz;4${12QGSqsxO@C=DYA%jcqF!Q}x0sJ5E{es`
zoH@j?MFm{rZ#Z|w$~C#;m!76$kc-P)tP&d?9Ny3e7;ypGQUdgo0&>c&N5+TaX|}hQ
zyXZjO(LkY+8yG6BBUi<~X7MaYPI^ltWeG!Se3o0y;We`(5m9Tz7bGjO=FGDv(S}m7
zFVfGU<c#!k$f(&zNUCf^N>oO~txc7}@aRy!QizuH4&}?`XblS>+_79PU#W~0*Ns;S
zxm?uTJe(gH7%KGTM)Jdj=5ndO!cB60rSWw`g_f3<D4rg3K5EXHtVTNI>}YhXv@WMJ
zQ*~~rR4%s?la`LVWGKI`Fx0AKqp~8Ipgu^*p;B*t$ORlLl*flEC-siyE4_orwXwYP
z50&zjda|Mdrsu?*N@T<yohTnBTrwLpD^*2w)0ria<G7+q#sc;X6#MYbNL`v$eM0;<
zRYz%2GPKJj#r|flrwX^XY*7aqh(na&SR5TKjYc85LoAA}hND=NjABv4QM4~e#(~Yz
zXjVF_#L?NfJQh?5WPy>$y8PH!u`t%WY~@RHD_1XDx-xhC^5b(Wmakg2-1H<3lw5m9
zmDD(-v#jzx@!d?Ntg2heV>oxHG|=o?d{2?i(RAb$y7k^%Z@yevLUTT%S+((=;@XzA
zzGqh*dy#rh&0HK6b^j_~sLSQz*#%QmQ*O;zaOgZ;qja^BZj~z}F5mmg5!e4pry?yN
z79!}M6CJMU;C%l^&+IZ9^kE!MbOv%{qK<$tv0RZ;*OyJcuP$zwsJ~{I-4JDu9H!Gv
zbkAkF;6jjP+tW)Y-*6{P0&=Pw%<PC8N~7sflSwp(O$wFTBE64QUgVpcvl>hr^-VI_
zGh+G&2_O|F)C&-fu5oe_y+kVoR9CMX%$6|b0ChO3lq&flQ;Ths>R7xM#_FjJS6iEq
zP;G6aV<fV_`5-r17dO0o<RHo<b+B$ou+-3`P&uRoI+eOw*fg41M>$tjH>-37NkK|o
z6s4(NeL&KXQ3Y!zA*fVA3d~WRfS{(u6lCJnI<P8L+XbqkW_Bs4K`l{}9F;$*f7z8F
zUcxogZOSc3Ge;!7Qz=O4W@{;k=6s`l%~mL0o3e}|ZZKhu6uLEiGOVLn+OnbTi~1gP
zK+<iXsui8_6PYnms)4hBWT3<Xl3`K?S-#Dg)6z66)9_Ifm$9m~rAZ=esA4rJtp=-_
z*#DTZO($TVhV%bo`+pri1@DIQVJ#d7^I=!`KWzSc;1;+7-U{c!DR4B*0h#}QD*Qiu
z0e8Z;;dAf-koo@tybM}E{00BSKk$3F0j`GsfH8Ol91ETB4EP)Vf?vWH;G=LUyb9L9
zQP2ucfj{9p_zB4T|21$qTm)xBFRTWc|K9^1!>{mT_!@iy-UXxZa(FR_|3Q2Y58-FH
z8$J&ofww>rPKG03HtYxwVAKB;J_GNAH-OCl=V28r2C@BjhexsPW&Z!Ga21>nXTce;
z0``Yp;IH@{Zh`CJLvRU*kK!bdHPw5=PH;bkdOLgy-UDxhSHen=wcj%T|15X}pT+Os
zhwx>%5?%`<a0Yb1{;)GVNFBKuu7wZ578n3kcc?z{H!JDf?+P5C|3c2Ot|_=8r*Q(O
zD{w`6cXK<6x?6&QRuo%O;QY<82^Ss8d!?PfIdx97a|&~!(>Q|@Cc9C8^EA%jbWUMT
zbQ))HI;SuvI*l_povCx8hgG#G-+Ww)^3BK1;P^=$*P>GAL_34Y#YVf$X`I0clig^G
zn#LKN&MC}^PU8$t=M?5dr*V+NS?3n;`9W$?Rg3b?$F(Tmd|ZpFcTTi37||PTQPVj_
z1<7u-MNQ`z6(}N&N_!f|sE$C9X;j+NI7Z=6PH9mKt6G$AKCVUi=HptFU)bfwC^fhx
zRr`)$KyS2+r*Vu5lig?;PvaQX5$GX}N;`Xus>l9+54L^<UI|J2|JT^|UxJT;*#EDF
zAvhgYKnFYzb_TKkZ-#5(64(NLZ~`>Lp3nrhW8Z%ru7Y>NS+Ew4g9Bk#_$#*kJ#al-
z0dIvf;S@L$#Qy&`cp5y2ZGR`+1fPWqVH5Ph5@?3qLDv8O5bXN@^Z738|IYwf|Gz)j
z_5ZhE-^=>{4}q-zAAplU*8lGfJHh?f^*@1cz%?Li5-x;s5Ffx{a4?Ad|2u4ZvH!me
zSHf#y1jPRD0I~mfh6l0jZ-#3@?Eftw_Wud61ong`5c~h@Aol;eLDv75L3{wm!(kvk
zfZafz0r)<M{V(hP#r|Ipr$QS%55)ez6I);G|IfnZunGDgX8)@dw7PfEOZ;so<m;^O
zg*X;p#vLrJZ(xz={Gj{uxA6*jt=nWb4QDY|trn-(h$lN(ecr$#(fL9D*=R>Jjq2O{
zpd<3?Tl=<NB|3i!7SS~7Z(*_<E#zs`-{w!jBAQ11ZT=K2qG{CM=BHXj?b~{p=={L8
zXtdv)Mg=ZRcB4&d8Wp(tQ!t69ag^$sf=M)uqf}R*69l7FhgC<YINHbUwUMo16Lke+
zdZSHh8b_%x*^M@-X&j}x0#&3@X;0%Q)fK2RjY@kON2#trCkUEUdsUP2J_XmLyib90
z(hbk+=g!TbJbp<xLsu}SH(JQkI7)@dZnTiM-zX*a|ACl{=U~Ul{QsqV$lvYQ`ZE82
z70CR*JOg0o{}1H5%>Vxtn_iy(zZJv>a3x#@;saO@%RrtB5c~f@?0T{PZ-UQ)JpaE5
z#Qt9bV*l?Bk7C>Z5WWgx|DO+H|DORXV1L*J#Qwhp#Qy&f$o#)N|1Um(Q(+m14`3gV
zH2^;cvHxZMU+n*J$irc9Fo^vx&;E=3|7ExmUJD~2_P;#)FZTb=@F2GR&2TM<{V&h{
zi~WBBbc4+Qix1!_@F#40@d4Zj?}ayk*#F0a*#8HC>;Z5uw*B`(?ElL^?Em#}Dzw4#
zK<xiJvF*kF|14Y%@{B?si2dITV*fviP5*oNF^CVq?g1cvfQ#YHZ~@33Z>PXK5I?|U
z*!X{fpMmTzcQt$*WIq7$2@Jx?&<V0G;3;qq_2zE)FvyyKb3xT5U5zIAUoD2=zslfP
zq0m3JxocBrN3ORrma9b3Sbn6hG>lc*WJ8UwFKkvD3SfPk@AL=mJAIA&PJ?5WY2T*c
z>exNxa&96Yennv{gh_?mo3F$s2a1a$pB%L=DUwsodZ80*icV(9n2*pLm^;@W8kaR?
z<~Ua}G4HY`YY-*lBwG^i7txv}vcW1w6$=aXwTxU0`mo+8q17E-qr_VtEDiOA2g~E5
z;m_hozyFyc36*^x!mSFFdv-;$bxKImy24m%V)5e%C$2g+7e?8fy43cZsEKvemGQBW
zhzCbz$xr1>3Wl0(Jx8=~3I@?MYHwk(8y#w<QF~iB1%qfBwYLRRFo>p6ds~oh5N+#8
zq6Jehh^A3{3zOYwliGgT8=Yi)Z=3~FFo>qn$FN`u2GKP77<gbPW!klUTkq+!U<wA&
zG>%eXvKwtu(>O{k2z;}RnxE4-N(HJ+qtc$vQ7X_00>y7zZ|XBY7}Faq<Y^qG!elpE
z$a<6#`~QVlY3K1Ay4e4AfBn0$?|%$p`=0}wU~l+uZ26PnFWBmT1liN?3-Eck4rFfs
z7ue@_KodNM{rxTYCfo#{gF$#H91aJ;0q}g-58j3yzB4=xK8vk=DQtq(upE}b(a;9>
zVqf0~*T8vj4u~y&Hptw251awVf;>Av7iPgd*vQ`ndB$JX&##B0U^kF`%Wr_quow=3
zr-RIqe*><7ewYWl!OyXGZ-KAFm*I=>1yJ%t1oiy6=MH1fgC`ybFAjI~mWDE(sL?0G
zCw+oPje_TAa`H&Ge`<Ybcpz8m?=KfB;nBfDabPfZWMi?fG8hI~FScrUU^L$s{w(uU
zdtWSojx<LW!f+~Leb5<{VR93}`DxPIB8Iqr*vCKe<G*3FNAX{BM4oJto8f=-$%$sW
zM0hJ*Ldze!Bow0A_PTEKysJLv|Mvk@$l`F1JcX;G<(Z;0g9jSzH9QJF7#)H6A02W?
zw9$PV<iR4HGJSlBjzm8!Y_FA1aUyKQNw6_j9GP=wb-LyJAm<0=d@+7ro|`<Atem+-
zmEz(#bIuGr3+ky#&KBL-VsKXCEq2dqmFt^qf=s57YO~!Op(y)In4HW}sbHfCD-cO^
z;%Z$e?sA@ti)IH!BvHuIazR-k*o|_`N2Fvz)JiNW23-s1Qa9tFL@G)+HAC6F5UAUJ
zmwYLynU8q9L8xy1++dr}L_Da<Pq%tB?BMh;vfBj@#imqFol!}aezff1L>MZUd*;lP
zovp#-I%oy-1hzft)Y+2k)`9dRajuWhVN?~6=WV;IJyh{0B5QU^<}tNIhg5Vo3wsGI
zpzw&wad==`c1Sj<NGrUh7al@(UHF8LjwibHUU?aAZeVmgS1t~WWImW14;FOvtAdW+
zJ{5ji%CXZkpUthEpo{^huJo9mJWP%`B&o3v>W2OF={-0<aosjIv$?>M)6}n)xy|~^
z5{|Xl$1-Iz4g91_qE)tgl8Tw-I*nJ%x)M*RhY9EB=^5qhGvL43zH!|+#XDwl6|qCl
zcww=eYq9}cK4e6-S>a^;xsi7y4;O|R#5?>}30yuq(l-z5tV%$5z}(f>-&!h2sdtDQ
zW$j1f)8Uh0Yh*kn`lyyhRbsLpAx~6OotBK+nU)k&^WrH{2Qw2w4V}fmS!snht7%rU
z=w?l(%)G3qVe!PR$w*jwZ`0J)Rv77vO8t2I3v7s?|3(HP>07jSJ~vPpDX?)58#?LD
zjL@N6$5pG~@!ucKTewT(Lqo-5GPWbCcJe!oE^Ue+ZOTVaF_^7}XaW7L_Pfkg*zYot
zpeHk;*nEfiWhO;NhNX*XICm@3Hz?YAnbS2+<xPK8S<_z|$(ijv=_;ywop3V7_C5V$
zBd9^M_;Y=a?Stghp`uPUvj>%#%hFMFshzxAqPA!Z<tu6?vFuz;>RaSt`H}vph5fi)
zvX6|~qr+uN%SLQJ)eX+hx<SX<t#gCTSvP3kN;i;cxl|F&Yy1YO1)&>?E-5QJ`)LY<
zB!n|*xf7d5%zU3rP`0WWO*=+%>6RvYz4(dM_^7r})3HS@O=ADcJMWeo``^BwO!oA<
z5k3PKW7A&*>)}jT0TCPkk6`0}3*HE)!1G{VcnDkmR#*?ug}vZDZ2B+3f51jK7H-Fm
z7aRWwm<xZycK-wX27V2{f>&d|?~UF5T-XbCguh^ue;+;p@@)G!^ubct1Mb5%zaBQg
zQSb;h`Hdj^^1Tdpft$(mn?U8g9z#7VjFqw5C}N{BGsQj=EwNn5k5v}yA80K4Af2`B
zo~M3@7g@*QVi+_rs81V%K8TH|qBSw7A3g>dgnCPb{{EazL~}cBR@*|(DpRo=8&$r$
zD&sHLH>1YKE;rVXG49l**4C9Dw(Uza+Or^Srk5c0Lj8$F=Z;~?N$29tptK0v+M>c)
z<3$vhg4m#Js#h518(UnJMvUy_o=!5{cDOi#mCbH~${9h2)!rK1B$i`+nJOnquh`)#
zoh{Lk*yxALk-7Fr7-5)QSFGmN8xyy7R$A=N`YLzlja7HvSo_Y%rMEOZ$_|2R+}9DY
zav}<p0Zc^#1X7Zn9JPtq7B#DMIr+yDCoTkIOSTzT1lu?2f<m5wVJ)(k5yV)e&iN4p
zQR)mq%=nOHrDAl}GCu=L)_r9k%Y-2-$<awkTS$?x21-L6#o@rD8VSrNlrnppP*e5y
z3y;=ery3UlTQGVroT_QzfvxCFv+0lf{k)l-Vbav6IB5evR~oHk*#tgBhJ3AoU_1|@
z6w*ZH6CQ@FL+;yB#<4t5Gs2{$R}>`1N*VEC)}^3&DJng&dfPT1AVbXRI#ZS7RUu8v
z|GIABQ1o8NMpyQFvj3(>UGEg@(Sw#69!0~&&n4ewHCaNh71yxoQf(x4MPq8Zy^;2@
zsrwX%6P{wa$jvy&oO@XP2tCA`!n5i}s>j&%hALZh>Faah=(|3r#$PQjHxQ_>frdbT
zzgZlv;o!`%O>rlzh{E4a_2nwn-W@K<w8N$roP<|_y5(bpNJ(FH5S71fZfJsR^g)!<
znpnt*Dk$~E4B?CA2itUQX_CA7kTSNhmzRhl2sx-is-TiAQyWx1io@(BOzv%qagwqU
zA>rnnB+)7~_N-2A5GmD)B-ksoWG5~In~GZ6lH5dzNu;GCHagH;KV}=LizD2YYdnw>
zg1)64+l?Zisz+O(PoeEt$@r{>x$}8d($RXL(F?1uAWTyCsJdkf%1HG+b#BrwmhfI~
z{+?P$HiHL;K6fPqs{xQA_qAH*Fb&%V(hb}Bv&}kBk$Sdbl2|u6VYi+>x?7Qio#Eqh
z;IzqaWXw!=el;+R)>c(A#?ROgA0I#;my(U8H&eW#fQbEnD0b};#{Ta%HtwgH>;EXc
zA6^Hq23ZFn&-4EZ`~H`3EBq6?{^ua;@ZSk#koEVS@O1bQHvXqT-n+LHc7g}6^KXT#
z;bM?A_J_jXvGsohvZnqW@Mbt4UI(v*5&QtM9{v<q1_v--FKgXpo%=O#Ia~rma0EO9
zeuC})U(g4K!tb&BuYpTpAGnEp|2wGsXW93e#~D%hn=YfBp~WgWO)4R3=+(x5z1F*r
z!wyu^&Fa2U59M3MjCZ$Tkfdj{gJV+3u=ky@_QfUY9T$OI8n^bf_M)~(HiI{D>v6DZ
z(AGs>%Rf*x=ITqHM6Loccr(U{hDNkH%+|D_d|R5bHvRPsai;6pN&|9k^LTzF)3TM5
zF_F8urPxRpPj(&h-IlV9rHnqcL33Z3?30yBw!J^iLJ1RQWHNzuo^7kkW28DedKhsI
zR@+<ZF;V`XS;sYJ&dfTlf3bC3+E;-GYO%5ymyGDoIo*}vQRB`KbCO@!oWWu&S<?}F
z=SH@{8TW)6do#PUO&=bnxl6H$lI~;*gAtcA4WK*i>TyV)jSWFj`WQsW(D3Hx<d)cO
z@1Z|7T*U`cZ{pTHhG}`J-o8O$CPs*VNm+(@s3l4n+wAye{lcoTioe;t&drAjzeN&2
zhCu6*G9G0NEN%PYsZ`Uz`j;%1rBJZEg2`e0&0_x#Vc!lJ`(M@q$ln*S^@pJsdf-S{
z42QvO%=-^v`>%&H;Z!&aTYojo1=)9B_T0M_J_xec{&DbB_%3$-n_&&K!(G_*?}jsA
zAGi%0|9sd2WwH0+BzPILVDtYIo8CU(E^G4-hMnLJY<rpS|0rAl>p=FddlVaA=Kkf`
z^o?*9ltATIXLY;%Uze8(wYtLUjB-ZXv8OY-qFOa?s{N4b@4}mqp0J7XGvziMHS=A%
zw#Vu`Cwt^FSMlR5!76hy)VW|Z4_Q;7M;`B6@UsjuzoGBS7d^vm91r#f(+9LZ`BOew
znRV9V<n-9cVK%EK()G!@XUt6r=&O`85e4~kU*{-EYR8U=yAi)Mv8gJWQ3vgSob35h
zDn=cugC7T7{j)b99d*~et)v-b7-V|5Qzv6WaY>Giu7nQL-H;Hr&$5A7a*OrPR0qTU
zWF-}LE{}gfg{jqYdK`@Dx?2nU(sm_Su-Is3IT}_HV}qWCoLP=$#=)S5Fkm!4d6uJj
zW80_!ERzl1Be<<h?g<ADJJvn^yF!~%y213pw!XzqMk0*uwQcblUijGPl@J_1r>cic
z%MctY4Bj3Ok0P-~7L|E9t1&FgO0Oa2aP0e6m2#Meq`sQ=Y_u2h6J(qh`+sjN;Jp*}
z|38@Lm-qYq9b}LHv#|MB!5sJp_Wt+bU9bk81HZ=RzYflY<?sU70e*$ee=~dz#1F6%
z_Jez|{l5yA!G&-hoC~jn6QB!T1hU`W4)7?xfrsHJ`~a_J-v3;X=k^c6_HTk;VC#Ph
z-UKJWt}ud~-v)msZ=ZlSz&cn0D!<v5zg3~+bE7P!*2~G|YpYoaZ7nt(bkGNquXd%m
zTTZP*ZL|x$<@J;LM{8tXPp^YS(O?)z|ELb+JE75%JYpc)3<I`G##|GHdSpkXer}|1
zmPn3RzG3e4jah%k!(N4<+(wgCTuLlSW|d$(%W`#??Znk=7+~ykeYw0X-eyrQl?7hb
z;+8z{-g|AEw?*csxja}{X0Mk|b$ctSQ8D=Sd^rpH)G{~L5GI2=@sDow!DWHH%3eHq
zG6(L~zSl}$^(r)*Iuex;hg=iOWc=PF&&tMyQ?k*)(sUJhC@9}oHY&@un<H4dp>P6)
zwW2h(F+bK98oMS1@x}A&N|j1!I47QmaI9jW3V(VV-&gB8tm2pi!}!AD3Xhw59pB2r
z)pdVwTam(R%`w`94=2?o7MZF?t<LI4C90Ho+cLS&(244D2_B!{6v#YFEsBS#T{2cf
z`d2ERK<-m4pX!+#TS`IYrF)spMxEvL!7(c~2`;DT(I!!V8TwWu<!98E$3gM=#;~0l
zmoGdcggVPE2yKeaYjD;dZOhfZlwQ3ix+uHIN|%0)UPiD*{fJcENDnBFTB>`P$NFpv
zW1-#jj=j14j8F1#UvveT>gkSx;_l%f2}?XMAVE1wP#F#bPq!rB{7K@%h+-kv@;I-)
zoqwQEu4BZJ!O(0vqXu0ZNb)Qrx{i8`F%0iQ%lM}in8l<>o?OtO2gA@oVo)@iW#zmb
z5BS~d9}Tow@?&LqX?K&!pi&H+Am*|=xjm8no^A=@lmo6!*UctY%zSI8L#+z*p`dT8
zN23a_RHfq7pxSy(Jvx(<mI^!>B`qGq6!j>Tib*L_`pVjxlzARR)>sgn4=U+MT$Pe3
zR`2~$k=9;up@QhhV#_kjF3#~K7stnj_S%Cs+C)&{Lp|h)txowKKhURwQ8{g-HX&6n
zJQZ&g_P(0x%2K-Q`LN`Iq*;YQ&AcW33sdz&IW;qO7g7{f4SuHFhoi<<B=-N#*u#&3
z?EkmW*u?iT&;M<B6PyY!fIr{|xC6cm?}fL(rEoFqfL}n?0F1&~*c%)FA?)@az(?R!
zuoND_hW|Xg4PF6@K=#T1Gj{za;4Lr!i(yxIDm=!#|99a1a0wLPFn9*sj}PDjP=r(A
zcvucc!eOuwWKF;x_yL{+DxcYQ|C8;EQ(qh1x9X%yDX>Y_jOB|X$7JjblM*y+F4cCU
zt<O)y-ma?AqOTeAxaySE8|yVQQUsl0Yovs%T_lemgCkItwvNT&#BTDKQXqltpltcY
zu{Fz9akrmw1zx|=z$>A=?83MNyg3k5^q#mr9$qJVw(tn*CVyRXEp3@OP0^iGtN<|q
zddG*$<Lc33X@BKVAvdScdnQhoQ}sp9(i(JzVc<eeXcU<HyC<%B80e$|nspy;x^nq&
zS?X$c#*&RL*!WEAZigzxQ>);r#PrTYBrGX8EDV)>EnQn~_LCQWXyHto1jzA<;#j#7
zWSGKI*$aw<jD^1%u$|V}93taGwGbv_xdhjw8X(D-8!&Br)XwONxp<RNPufoLH=2T>
z;v;F;QVA}tZK}|Iv^zS}QrUJ+--d^aY*VFoXuBFK+itWVi?&+{6#l?MnTp3Z?q%2d
zQ1uLLJXGxg{E4hn1#M^ohrE@79fIrCtm2YZgUi&4!)O#i{~a1_+qH>!>1`c-%Evd+
z^hlao<()@vWqhRmY>|2GC%>T}eo;MLsAfR*sG!5gwJ<s6$9EgT#8B5bZ?AEB%!C?G
z)fE$JKK2Uv45~(>ZqelB-A!`D{;i2jwv~b<_WyoZy+;}Q|7c?Y--b;ud;f`jUx3BX
z1hUt^ytnT}cs~4%x&N=h2+00_N5NmP|9=9XgxA5zuroY{PvHOH+wd(o6@S28*!OqB
zx8XdHz51UCU&FQ^gC+29@E7d(uY%b6{|-6mgh!bB|6lkdtb_gFDR3<|{}{Xi_5^tb
z;C{Fj-VZ}C7ak<vzkxg8hamd_d>G20WHNzAz)ICs3pIB;YE@x=Er~|0mwc3scD6L>
zOZ){=Ek>=I90h_4tfPE1@RUmf5Pue--B%WJtL^)B#ZOF7`1<a@C0oJ3SiY|)JL-v-
zQ|yvre}9kCw8A-5SpcvxHrHUMII_Y<-HAB}pYdQeVVlXy$Vq*~Wn>d>(_=F&0tjX4
zdojQNLE6-W5tCV}Fx$@R2eb8SA}3^L()id_-JnQivqF|z+LadW>3?=KQX3bB#Syyy
z{Wa7Z(}7dV%(jItLfJ&nOmzF51$5j=Roc|uNcS2+N(1$s{SfV<4xdsv#uke)iNuoX
zF7ipGe!CqOO^iONoV1%Du&9H`tCDJs1J$N{QbTJ%+rOC`9{7u5`hWbY>LnijGHJbT
zort>3Z!?rkZ>Zr1H3W07p9zCVwrbLwH}^8G>3XMfmL5ROG3ef8V&gzJ@Je2uk&NU|
z=8Y;P4v_w=hes!v%8+QhAoSXYn6qA=v?Cc^i*0rODqAZ<^{27v{iZ}R^irbg<s)GW
z3h(B$o<u};5;6>19jhuv{aQ@tNSa_mB8*Tpy{6PDp`-J?>+4!#V|%bxzZPMN&fG#d
zbM0-y5+vl4*X(fRJz1-wT9_G@MRma1Qh(xF7f$ABwu5v8sx`3Q%9{x|^Dq;}3uOiq
z<My*!kkyOqr}F%nz%`8m{HnHBlY3?PZ2x8ah$1=uf;=(Y)^aWzzp@_SlsnOg&3hCB
zC*z8hYjVdQw@Qa*FgvxNytYO>l~S~>z&6W#<~NksW3jKefwxD^UAKAe*@dxEG&)vV
z$8OJMNnuu`en6>z;YD!GjQdi2==hss+DtfNW~^BCGR$7H8Z}ADrKl*qrE%Vtt)2JV
z&3PFgvQ!e{{=1+Ngd_D6$#H2rNpwN6|BuGTJqUYO=KtG`z56BR`(@w1jUf8~6d(sn
zU=b{Ue(e8uVb{yPf9F9bHvO~VKJ5DI;35zke}9nu{AJF6H9U-ce;vFB<hlNR;cjgF
z&%n9x5|DS@--vzxT37_~Zu^7aF>L#Pz~A9-Fo11;8)^Rmd=cITJ)rV4f$d)3X{o%v
zn%D1|Ymy7ycy&x=^JpQ}Wo*uPHBR+`X2`Fl1y<>MDyAdqeCvsEY1$i1;A6JIYU>>_
zMF@32Y(Mo2E|9j?u7d+H<1&ms?5o1#PP-2xnYzspM3yvrtPE^7B34hG%Y+j}lA#PH
zLe(yjP)&N07Dz%F*5#om@U`9-l$6&Rt<Og5EM7sBMs3wc{aycW!`I_TLlKqgn&PsS
zxYRgoPm6Ow`CaOMrpHE6haO~nILYUFYz%?F$M{*?lR>qf?UBpnPx9f`)S?}@=W|jX
zxY1YC^-NEb4QoG@b7WwAtl<17BHSWxzLJNc62ZOKReUjZxM}rW{AG1IYxQNePnSxO
zs!hkbGb9xDV!GzpT&8&u?KwX>h2BK}{*j4%h{O{{VU=P1^tbaGI+x-^uEdmNg;gox
z)2rbyjHk=``poNDbh<+Sp-WeW-_WK@DrN&7!tk<&e1zeJF}*tBB{a=2Wy||2f1z!$
zrf#I@)M=}i#ml!9-F|U*RYSv(ogROZ4n<^NH?&~LnaVN&vwGYrXIVO2B3NyW!Rjxz
z?+T0~8jj6?%&Y`w>_s|va;XmByOkOBZz#_@nrgjeF;Pqz>R%#M?ZPy=k1rOP-wVnu
zyF{z5e6kj3s`@DUV8hCzS0ep48;u<+8c`!d1wp1YZL@1=PFC!H%;=^Fn_2Atql{5}
zJ@)<CunZ0W*$Y742Po_QEAUd-8*ax3@Skuj>;#`={$KV6TnY2xH~0nQJ%F!;m%?-4
z2lxp79Y$dtEQAHn4sCEP{({YL6#N5!!S!%1JQZ%lS8yS$fTzJX@e|wxp96U>z(RN-
z+=H*+F1QmGz+1@VZ}AoUM*IbEDlCHi;ft&TxE?+ae?$&S7Df(kfqbmx@}A7${K$Yj
zvp$j^E;OH5Sb6e9y>J*NXthZXl<i83ybL<tSi>{wd1~ieuxFS+G#BY3(MVx{H{@+7
z<jO;((L$yjS}Qvl-B6vmF|DUMGr@^?Ud#!f6?LFGCs=T^X4z2QZZmhX2Y6?x90?`_
zj$prqImv@da9hh_>A1Xn0}Hm^Q{~md+kep_>wU_rPGO-QR13-+6d}3t=9tZS>zYKO
z8`3Az3Vwxtv|rbt>Q!!%7ZaqeS{#fvK{a)6^NmZqp&DvZq3XwE)9I>VkW60#TceE1
z*Va~lzp&Mk*(KE@nNAK;6*3cQTZwHCvaN*uPD?n_|2*8Cm>n+@UK+d=s{h1QMpg76
zd%BXhGUXFbPeuio5!9=jezykKXl(7%fkF*jw{w4S!#+c_f$-*&^jaojsb6DO7gpsn
zWp#2~1Wv>Ej^vtyT*4)t)j=eJEz#kQ2s#UEqv~Z7UhXM{X4>Ktz06EDQ;IAmfGL)k
ziYwMrMrm^mOH{b@0%On>l~vU^>b&|3nv(5WSW;vb7Wy$%!iI`(+HVz~T~5UkLR?Xm
zVsEZDU#=`+z&)bbN+?UmZ3!2g>DXLS>T=D_ZZcPykAA!JLA`4g^>zNTh8%`+u`T4R
z8Tk5rhAyV=CY>t(zm?<EZ{lK#-$cztdC|L-k2VRu)zA1WWv{IXe3o8Zlki;nizns3
z^q1H2V#XV~zoyEE84sAgv@uxwXe&&5z+eqz+6n_Z(%5IF)GFt|8;bf<$0|q_DqGi=
z6#D$83}R-6H|Qrb*Fna1ld3sS*%ZI+WOWX)|9^_jy#l*j?EgKD-7B`g?D>BkHvSek
z8wOw=%!2P=>wg^H1_Q7Hc86Wz8L$h?#{NH+cqQ(Qa2hOy{b66&2lfVeU%*#L!+FpL
zOCf^Cu>bFeufYW%dj$5wI*@$=bFdQjg$MBsTnqBvK=Bov2<`9)K7zl)U*RFR2i^yH
zcquf)&Tub2gCE1yP=-a&0eix;VK>+no(cblKS7>FconRMeL>bF+zwxYx55DY8{97b
z1$ZA^1Pfsg_&dIa{|66%JeP1EoDD7T2tJ7a1$hoZ{-mV!pHuRjTEGgLa%{|7$K+Qu
zm@nsw10$ueLT)I(j_##Z*WuWqCL61scT@VNw1+;~s6xJ!M;U$dL%0-Jr0N4F6Cvxf
zl8>;P^r@SByVV{omCM0u)tX9Yxzt}7&Tq;MaZRp&G^e77=`dP_(ZN;{%2*o5o^Uc&
zJ~KUbe*xZKNhY)gqW_mHDRaK;^_F0}AfIFTwx-#vV&mqqE=*G9ub(?q8o)QSso6=T
zM;;qMgbcO1EYwOljs@c7TzRdRapFM^m5^jY<FBKRCn7i}j=`$h5FK(zEG=kSvD#w0
zC%Lij*}~MPsAlT4%4{-YfxLWdW#(UXpL%d82`^e7G#v7RxOLFLHnl}<lP|?zyW3Wh
z^^0^cIklK=4Ep?bY*M7aRh+mjoM=h}lrm2)9zQ9y9R#9ut-(1kg2m!N3o)zTbA6?X
zuiX65Lig<_lFE&9D%a7II>Q~5m#XBhE)bWR7FQRy*LQJ~l!sHTMVa9Z-n~3cV%O@+
zR%GlLwnYV86UjZC=BawLT9r=q7A^o!p7`!h<vOfsY@HBm!0w*ZpuzlzXms4%9`*7M
z;<T%jC`m1Y<bjT4;TnA)xIt)CXKbnIjr*XWeNPy=M%(-=y4h|ZwJDo6Cxe5NR7(3z
ziqe*k%>8FeRl8PA=}p{tCz7g_6M4alq@`S4B3W*h#8Vkb*%vnivR4TYMzmB(2g&g*
zinPUB6bGbPsA8$BLFvouZWP)sle=ushB5WR6oIHx48$mf8DET$C?&EPlv1*!lhd(D
zXLwa)>(~mcx00rU_(oP!naPSV%axW9KFE`<dBGr^%%tv&%C>Kjf(TlmJ?tEVrtd<5
zu^xKUZnBmzPC;n0)PK9>&??Mf2F)?4S`16U;1_m^bA6|XRJK#67zJa^HtH2s5pU&Q
zu{^GuNL2;(?IY+KWiYSVhud1I+rIt9bO)v7)R#sY9C>C~9_MM2Y6~J!6A2pRBCAy?
zJ)>qGf;wg<Y1D+xRC17E0!=OhV*ksU{||_*iT!`F7@YiFjO~9o><YJI?|&Aqfy?3L
zuov8ejei|n2xr1-I2azr?!OwY0@)YfNZ0`$!p^@PZiWj$_6uAO15kiI=!KtS`(FfS
zz+7mB4`SoL4CGmYKVjGZ5gve_!Rufx%!dO(Z2cd>Rq!GB09+2TufRC=dj;~aAN&>D
z{T6r+oCO0=fIhef`+Y6E9FBuOVz=K4pNF@=C2%o3gw1{moD4@B`~64A^M8yD|3N;L
ze6?sB{NL-^o2}k}f{-5?9n9;opJqgVVZG*TVA|3ohvcvJxEmik^L>4}G7lc-+3MF<
zX(k-?54MevTgHu&%^8z%_0Es-q*rEx<Xj>l&hemAWIKc~P4P49g4Ej3o9k|RlDLev
zHeuRKv#(B?IT5DJ9Fl~oF-RAkD{RWzmU~;Y&V0=*`G(ddOqdPEhw3bFFibu_v#LRT
zowQWC%vvBPW7WW$Il4q`we(iJEon2ZH;{;o(yoyPbwza&Op%6~g)UX!m_r@I==;i7
zu@=KJ(YR6W^$r8pjZGWJZZN(LIs@j6LZ`NF_Y*$c-(dexLkg*VgUWpR-n;b`i7bE0
ztQT@Jqh3g-r+k>MzX)qYN|&HqD-CoDBrBt)G?c2Kk_3q|M{Syd9wDvni1$Ug7WItD
z+6Z-3xiDw>ZA*|xi+1<9&STc(s+5j1=zEa#)~;p^iIBcWs#zdDj%d-P*Ssd(4CQ|{
zHpU#{_=`k0bWT8@Q5`yFIAu5mgBw&;zl00V-Yu9H(tdZPZQ5E~r=}b8ldoeVAE)Nh
zF>2M&O!j*Zp<S_9<tnf4OPNY;4JRT}gl8(QGFReL5)yc_vJ0C-)kMNJk(|0?wS5qH
zR_l<PNwyYSU|HHWtfgefb<<|0Rt8zBd*Z+~dZstx@2HR%leY>c2kdl+1k-=%x0iP<
z2L-4cx!!-IudLexMO&KeoJ5mJrB_c?lgjBCl}XM?_{eIyG`Y18S!+fjFx}nclW{w$
zR-;}sfb@b*cg(8+BzOKE&S#2tKo?)_IW@vaX^M1$rWv~J#w74m+M(@NeH-LRP+QV^
zuBmi9#I72TQxy`91D*M*E;gn2$>^w4`u`>!sr)aZigm+P!V3adr6O$#$M*IagMgmQ
zak;JO)rj@=%!&HGK2zc=^rGIXo?bowk7@4K|MU2``rCxvFZ=!958nrw|6c>$@NC!(
zHe>7G#l?5Rx8Y1U1`dU%!Bb!-I0KvhYHawc;1lp}I0p8Io#75_`cK0r;cD0iWDfrv
zI1^;v{s-_CxB%V=vhM%sAo~P74Iac^KN)0CzzFWbUjGF=gnhmX%!0pSo4*Kl2HErP
zI(R2+fR{i7$v>$KzD<VhUah#GS?WEZw65SoPP|mchs{_blfL4`EsaT2;mqt_U>eA)
zOH?NyHM-V|OH?<a`lht5DPFNwp15j7acJlyY_me>E3lJ`UVoq&%bI1M*DOz35F~4p
ztssaz*=u7p#z(HSp)jT%srDy$L@kySF<}ZK4%g0tUd0`YwY|iH9><K8MsYw|4Ir!K
zC4z-A3UpQ5B~&l^ck;5YAkzoNN6O687y49ZB*HEamWFuOq&};oi&Q34?qZ)9$TW0n
zXEj4DsxPNT9da4<r823Us_3dRYHs5583o3ZJ?d31lNfsd4M%DRnvOi_<}4*?|0&M3
zMp{dKB3cv8cH(gZijz(>JNf*bs|!{2U`>@|h}-5r!(r!JL>;x?3?>s)eiLfKx*t?k
zm)&@^pSWT-Sf@?WYFFvn>320c>|!#~KT&mUS)8b;woZmmYi#rSSv$X~dQMzaS#>X?
z+NJlo$vMT*(#Gt08#x(s1XU%{cC)w%bOD+Al=NrYa5!S4jP{V4vW#jpal%X;a_Q3b
zB$+G|N~J33(CC)}7cn<$wr=89?LVYVph5V*AhZpMA(wD9rgU`@LnGbY#Q8=kaCuI3
z9cKBH>nxQ5yT!4mac+1#+pipiiPxS)-)k#H=wk|NQQQ}9L)Sma5Tql^sTc%_Cp9^W
z`udc}ggFs<?R~A9Xwrck#zsS_4pX0PsykD<vLrIZ!7wxY%&Ic=^)S8p{{BL4V01hu
zvtYUut6e~`Q1NjsUL}en?1P*<)ZtfD)L%Vfx6G;_u|R3*GKC!^%N-$AK~=EssaW5_
zWp+TCLr>6Qe|Y!kR;UjX=D{qY5AxuSkO%WUUJGQ)HF-Q+pUvDDE!nC2iO|fN;DaPu
zWLW5STFBMi#(M{UpRBU?AyQMK%C;&YL5a)!|9p(zi^TS2{(rSGfWLzM|DSL^6hYnr
z*akboec1l87r-Y#o()(A^Pvr%1AoCU@I`nZjKE^(gx}%|_$k~5{{xr7CU`OI1-rrx
z_zFHN{sOoX{tG?|7vm>53%|f9tc5%93w#OQ3niEbhk~r@mv;nS3|l~a0`p-nxEFuG
zjqq+LK^yD>GOzz*xB;#Nnb$uDR>6TF{)0c`Gx!Q@fwiz2j)PX%4Zex|J`PHbNhd<R
zzx7}(t!a>#xw8?v*tyXWtkg3OL>)lu_dO~MYGI`?>h|W(_p+&KZj5#NrP$+cVOUHY
z^<)?mC$Tu9wkC;x<6sdpNT<NBthEV1rNHYq7*X#c_4X~Z<VRg<yN@|@<bJ+?h_+{n
zM@!{Oj$Qd=k-N%vY9^GR#?_uIdgoe|0kn}ofNq7PZq=d$xyW%gOWDk2xpqO_ZEni;
z^yZWc+pMl(BYWA{zC+7LVpYj>cMH6#QikEAn<_V%`Ai(wrfcG$E#zci8$acrJmoVg
zwF)P-_Qg<Rl(#GC<nnWju~#F^wwI;9g3jM$JG#^QQ|8dOpSNsTX?S$JQaFa93&-bF
ze_2pGfhmKEW6LvsH0kcK0lAgcL8OXp3%{FtE`MD?=3sTXvO{yecd(%Qpm<Vk&Pdvn
zoN%}wjX+nASCDI#zLpGETl98xj_t8v4;m@!Eb4=PTLPW^s$m|NOTB)HWJ=WlU9Es7
zgVoNE8RcX7F~?%rQO7cKW3JGd(H%VBtqvIXP>2<(qBf^n<FO)*)P11sb2~}5HKC>I
zZbh$ktlHF;9_=V5Iey2a7HmtAD1SnVw5H-_rl6gJy+-YJVPd*pb+2M;@wRrwn}(r$
z3f2C2b2e??I3?Kz8jH)gs-39~#RZu_Hl}`WYtiUNmLVB^GB#Ubl4S*xvV<V6Wa(yS
z^+v0v$eu<@KV&FhFWw%d-KemfNu<W=YgUVkl7G@7dh9Jkm2uJAOQYBwSc{Z(0Zm)}
zYN@*EdMKIjncLDNk)V>=h38ReWN5QGXfyFdmr`8fXn;sOq3kD)=(0+e?i95Auxd><
z8Mb66qq?iv(xD;s0wrC+^OdNFxyQp>Bc7_?xK>pQtKjPpPUZG0sY`-W+UrvAwPueE
zP*bzFch#SX7n5manED+m{61mZNFf*d|ApAE`(WdW{lCE2syAWhp9>q{#qd1%3AX(O
z@G95<c~}Jdf;<cGF1Qrd!F+fSd;Y^9cK)SsF<c05hTX8~--I3iMi__PLDuwt2mTu_
zgCTes><{0;uD=<c2ZPx41F!~UkH0U&)$kfP77l^u!b8~hAA^s=-Q-W@v0htzCft)+
z0_lF^#1uKt1CErjEQhmnG#8-RSqGsB!QedB8`wxzPwOkQqnlEF*-2M-2iCljqg8u-
zRIivQjaIPuvL~{#GNROuYJDm*A689?Wy2iYnz^vUIm12C_UPsCs4&JqVLQzh|GxX;
zT67-7+cQw?+hiRW^sYtgD^nf;!pw`XJ9vg%dpM%voH^$C84-0+0vPQ^emjqcinT4$
zay9`-8NucMIm%c!9l0ndBc+Ye#saE~M8d$-JR57irDsHsN|be9VH4Lj>-f668AsS}
zop;?!;t!7BMx{$G2-2m(O}ua?vmBY;jaNmCnAGx<o16M4Uv15k@21X8VRem49wTs}
zW_ywVlW0$VD9va;Mcw9X)5(}@OlvukqFOB{EyWYIoC(t#Pj0w`K(!H5Rq??UiUWfc
znt-Y#bL3;QZ5%F;l8Fwpnd&P+Z9`!}tG?8<5z)>hj?FY07q^yLWSWkgraG&SgH52I
zwz^iN>%DEXwmAnOZNpG8#n;-Y4pOH>-zm7v-oH@E4Gfjm@q7_WY}=LpfgVIED<P!8
zRWHJguXJ-YGFjnjHkfVJK)eG;T!~q&Bp#}^o5Kz{);AY_n!=t<aXs2oN|#L;!_+0)
z+zF{u5qC4$q0*d7cSqXf+kA^pa&di4J}%yjb{`K^y9`WMV3QIr0&_B@1Z)o?%hrMd
z2rt;?@>e%Y7r#E0E`5_mn`13}d>ne1w}XA$cBkPo-4et{Z)GEpsVwW5ptDu;Kh8Np
zxL{Vt^b!&qfAY!|>$A*^AnC?zQkSa#Y%R~ey9sPlN>;POM%^Z~m4LpHEqixi%=aQs
z21pf1KkKs_$Uq(#sY-b~Qo9f4_V&<7j%P~QP5$~yyp2j(?El@d^z!%s#Qwj?Sbeu*
z?|&EG4ljr0a14kaU|)C?oBv^uy#YQ4Z-o-9gcf)j{0bky-EbFN1gFEPAbSM701kpb
z;|ur-h;QH`*by$nH}G~i2l_#tA6O1cp&1T_J>W6c1l$kz!Y|=_@FlnsJ_hfE*MYoe
zpag5-IFNS@%!e1iv)~bY5B~?hfgghGHz>~|ybt~p-T<=K;7eg4JPRJiC-H0e9()Ym
z0k476VL7zH^WZ`J6?Yonh3rLe32cS}bVCd51%JSo@e}wOd<xzR7r;j7gH>=iJO}=X
zZ{zoH1Bl<_U2q<}99|4v@Jx_*B+4EJx4;!3`x2fDr@+zB3j4sH@QwT&Zi36<Lf8Pk
zZ~}D0Y}gxifIm=IZiG+6dtn?@{Ymo}%$jxDp&NQS*G8}Cj^tYhYNb2Y9i3RJtju+G
zH0waE(TV)n)Rmi;6Kze0>sTw()H;AY*6!k)=i=j7Jihj|L45OEeDhp<?Q-oo$_u?}
zjF*x2m7^n~MdAe2ckZGqB3ACsWW7r?Ki8Lg{C#!$IMfv;MThEihxA<ur38??=|X=~
zpEWP4T3K6|Dk_z=gy?;eDUP_PI=!eC=F&u^GEyVejTZ2N)Kq(Mc`5i**OJ15)KsSy
z@Pf(9dVXrECok*ysi{tt^?aY|MrGZVn(E2Rn&w$=W9ss$_Kk@L)Dw-#w;b1)V#(8u
z$+rjBm{JS4)8EurBY$zNYSfu3s>v5{XR4@D3phx1qXpcNn(CMYP48P@W9mpvb*ij8
zCNJyu)KpJi*6pdOPL(z8HQUEGD(meyGLb}4zx~j}GQcL=rw8MU9#&<TvcroUp_=O>
zN}_VawHj3m<o!8n(2|jPy&9u~b}18y?ij%klMj`=(zh*|uenkmQwcIx%8}%izO+@V
zRQJ@!xKbUGl*y6gmA)pEEptzOj4Rb4xl$cLj_1s7Lv-={iYHk2L&{gI?yZjzgE}NJ
zs3XLHyO=GOxD#8Bio3Xr4{;UGYMb3;;^o|?_M?w!howY~n`xH(75P5SkIs&q93b*8
zJZvhLvQ(qcKeoAR6DNBsV+1&DanqKj;s`G=EijjvJRLuDVzYY+Onh3VqO`^Xsc0$9
z#NqXY&ACdnzGZvfEAebuerRKUGaFT_%*w9HO|oLUuh7q$g|YG8N;KBq(U~6`%PaZB
zb2YBgkH@C&D;mB)2~O?FG;6F-86O*onm0vDme3%hL!!?1mX_#Ma^#33SU}#{fvUKw
z#heI=&=$N?blm}VSd5pRD+(KAmQ{}JfvEbL2}ODiwa+NkXm-@!_C-!BMs?&s$2=DT
zElLi|cOg*I<dS)<E(j`%99igspcBb~b{_=&fFscY7X$@_106vujEi!l(*<GRm22kv
zSQvyj((Z#W%E>hgeGqzl3Btn`CWj0u5+q2=JRM}73qt=bK^C;q%nHnrsY;fUZ#moC
z!~%)f20pD4ZgEXH+8_#lgnV5klyZt}VaelHO!<tLr5*EfVxC2a%BAtKUY_5zcY0Ad
z;*tEQI6SH!S(0_zBV4zxFi;#Bk#`uE`qkA#CH=<{c}hiIx=|X=4HQNSto5d6GCRCr
z(~gei2Zr;}>Jyf)TDCG;e%hMlt4=y@^{S|MD3>4Z%Z-ka#{TH=h$kc1>qb5Q|1<2*
zk7KKf{Xf^(aDTwYzY^XI6?hpO4=;t6z&Eh(m%>pn7k-B=e-((GFM9zV2hFe-+>cFv
z4ak0gmw?RwzX{$5V()(mTYdrj8C(2Ma0h%G<b41yhC@K?^j~6k-vAeZtQ`=${PnOS
zNcwZ|3-WRYd<$L=$3Z7N8~%vxd<*;!Tn0T5!M^Z_*vlaM0X_tOfxF>jAbSBWgdIWb
z>94>?;A~h8JHS`4i$4snhjBOn9>y-d3w{beg4^H+unu;IU-7)aC*eYnKke<4|J`h{
zag<K)wiYH4X}emOQ;AJ2>`}ErdHPIx@}ZmLo)#*K>WS!+{O(07vbMBHSu0n)3(m|w
z|7=k%TClBje!6ih?5Shcf=hQe(WW{<(C~49V7nph!Vb38vFn9x^5r&eqA)h)Cw8jM
zV9=_V+)mpcXFKi`LkiNG6NeK`Nsi3y^FOoCe`$neB~sC|9^~+Im%*MUQulqswmVAM
z2~yQO{dWBJu(S@dO*R{-`>pe{rbm-qdAJerq!Q3KRLZBMVN!9Kp(vIibIk0{S>N~+
zW=dY}ki9o-b;w}$S9Lp9(5^W3BiMS<FWeV&=7kIuTNSI`bW(?y>u3_eFk?%_t_;u{
zOWI#S1!7E@RGpzQWj;`x$W84R7uY@ho8m2Ed*>xXMVnk!0;MewwSUYAiRhu+PK|Rw
z{9CHI%)BVW^zwAwy4uG3a8Z=v>zXE4NUqukLyS8X^F(Bf358AByVaQ_5hSa>4Ek-i
zYeB7t>}n*UcHNy+8iE=edq7Si3p#bJkp|kV8WNN$TU(D!Ex;Nn4eLVo^Jp^v{{Xh}
zEyn(rcLB)X7Hs^3;6T_7zJ-l{5o`q62e1?5S^u5jr`Y)Khdy`(90g0@a5xMW!9rL7
zGVgyMHvbi{8Dzh}7lQZ!euEF-i|`g$20Mbx?cW5~LLR!|AlMJ~f#<?r@L_xaC&3}`
zXY!)*qq8#e|BM7?Brqd^841itU`7H@XbFfxr2l3-=ou>Rj0Zh@U9Ww%i%O$bIw#+o
zJoBd984tR%a09ECUQ`?r+8JRT7QFG*J1qDScUVZjq|Ni0cjY2F-l*+;ypQL1<w{Z;
j_MxY^4gL0y;tiOr6Hy$`I`Znt(S%cnpjn<2?|J_J-{&HZ

diff --git a/src/caffe/Makefile b/src/caffe/Makefile
deleted file mode 100644
index fff490de..00000000
--- a/src/caffe/Makefile
+++ /dev/null
@@ -1,2279 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Remove some rules from gmake that .SUFFIXES does not remove.
-SUFFIXES =
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E remove -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The program to use to edit the cache.
-CMAKE_EDIT_COMMAND = /usr/bin/ccmake
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
-	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: install/local
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: install/strip
-.PHONY : install/strip/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/CMakeFiles/progress.marks
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-src/caffe/CMakeFiles/caffe.dir/rule:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/CMakeFiles/caffe.dir/rule
-.PHONY : src/caffe/CMakeFiles/caffe.dir/rule
-
-# Convenience name for target.
-caffe: src/caffe/CMakeFiles/caffe.dir/rule
-.PHONY : caffe
-
-# fast build rule for target.
-caffe/fast:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/build
-.PHONY : caffe/fast
-
-# Convenience name for target.
-src/caffe/CMakeFiles/proto.dir/rule:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/CMakeFiles/proto.dir/rule
-.PHONY : src/caffe/CMakeFiles/proto.dir/rule
-
-# Convenience name for target.
-proto: src/caffe/CMakeFiles/proto.dir/rule
-.PHONY : proto
-
-# fast build rule for target.
-proto/fast:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/build
-.PHONY : proto/fast
-
-__/__/include/caffe/proto/caffe.pb.o: __/__/include/caffe/proto/caffe.pb.cc.o
-.PHONY : __/__/include/caffe/proto/caffe.pb.o
-
-# target to build an object file
-__/__/include/caffe/proto/caffe.pb.cc.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.o
-.PHONY : __/__/include/caffe/proto/caffe.pb.cc.o
-
-__/__/include/caffe/proto/caffe.pb.i: __/__/include/caffe/proto/caffe.pb.cc.i
-.PHONY : __/__/include/caffe/proto/caffe.pb.i
-
-# target to preprocess a source file
-__/__/include/caffe/proto/caffe.pb.cc.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.i
-.PHONY : __/__/include/caffe/proto/caffe.pb.cc.i
-
-__/__/include/caffe/proto/caffe.pb.s: __/__/include/caffe/proto/caffe.pb.cc.s
-.PHONY : __/__/include/caffe/proto/caffe.pb.s
-
-# target to generate assembly for a file
-__/__/include/caffe/proto/caffe.pb.cc.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/proto.dir/build.make src/caffe/CMakeFiles/proto.dir/__/__/include/caffe/proto/caffe.pb.cc.s
-.PHONY : __/__/include/caffe/proto/caffe.pb.cc.s
-
-blob.o: blob.cpp.o
-.PHONY : blob.o
-
-# target to build an object file
-blob.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.o
-.PHONY : blob.cpp.o
-
-blob.i: blob.cpp.i
-.PHONY : blob.i
-
-# target to preprocess a source file
-blob.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.i
-.PHONY : blob.cpp.i
-
-blob.s: blob.cpp.s
-.PHONY : blob.s
-
-# target to generate assembly for a file
-blob.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/blob.cpp.s
-.PHONY : blob.cpp.s
-
-common.o: common.cpp.o
-.PHONY : common.o
-
-# target to build an object file
-common.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.o
-.PHONY : common.cpp.o
-
-common.i: common.cpp.i
-.PHONY : common.i
-
-# target to preprocess a source file
-common.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.i
-.PHONY : common.cpp.i
-
-common.s: common.cpp.s
-.PHONY : common.s
-
-# target to generate assembly for a file
-common.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/common.cpp.s
-.PHONY : common.cpp.s
-
-data_transformer.o: data_transformer.cpp.o
-.PHONY : data_transformer.o
-
-# target to build an object file
-data_transformer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.o
-.PHONY : data_transformer.cpp.o
-
-data_transformer.i: data_transformer.cpp.i
-.PHONY : data_transformer.i
-
-# target to preprocess a source file
-data_transformer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.i
-.PHONY : data_transformer.cpp.i
-
-data_transformer.s: data_transformer.cpp.s
-.PHONY : data_transformer.s
-
-# target to generate assembly for a file
-data_transformer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/data_transformer.cpp.s
-.PHONY : data_transformer.cpp.s
-
-device.o: device.cpp.o
-.PHONY : device.o
-
-# target to build an object file
-device.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.o
-.PHONY : device.cpp.o
-
-device.i: device.cpp.i
-.PHONY : device.i
-
-# target to preprocess a source file
-device.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.i
-.PHONY : device.cpp.i
-
-device.s: device.cpp.s
-.PHONY : device.s
-
-# target to generate assembly for a file
-device.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/device.cpp.s
-.PHONY : device.cpp.s
-
-internal_thread.o: internal_thread.cpp.o
-.PHONY : internal_thread.o
-
-# target to build an object file
-internal_thread.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.o
-.PHONY : internal_thread.cpp.o
-
-internal_thread.i: internal_thread.cpp.i
-.PHONY : internal_thread.i
-
-# target to preprocess a source file
-internal_thread.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.i
-.PHONY : internal_thread.cpp.i
-
-internal_thread.s: internal_thread.cpp.s
-.PHONY : internal_thread.s
-
-# target to generate assembly for a file
-internal_thread.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/internal_thread.cpp.s
-.PHONY : internal_thread.cpp.s
-
-layer_factory.o: layer_factory.cpp.o
-.PHONY : layer_factory.o
-
-# target to build an object file
-layer_factory.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.o
-.PHONY : layer_factory.cpp.o
-
-layer_factory.i: layer_factory.cpp.i
-.PHONY : layer_factory.i
-
-# target to preprocess a source file
-layer_factory.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.i
-.PHONY : layer_factory.cpp.i
-
-layer_factory.s: layer_factory.cpp.s
-.PHONY : layer_factory.s
-
-# target to generate assembly for a file
-layer_factory.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layer_factory.cpp.s
-.PHONY : layer_factory.cpp.s
-
-layers/absval_layer.o: layers/absval_layer.cpp.o
-.PHONY : layers/absval_layer.o
-
-# target to build an object file
-layers/absval_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.o
-.PHONY : layers/absval_layer.cpp.o
-
-layers/absval_layer.i: layers/absval_layer.cpp.i
-.PHONY : layers/absval_layer.i
-
-# target to preprocess a source file
-layers/absval_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.i
-.PHONY : layers/absval_layer.cpp.i
-
-layers/absval_layer.s: layers/absval_layer.cpp.s
-.PHONY : layers/absval_layer.s
-
-# target to generate assembly for a file
-layers/absval_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/absval_layer.cpp.s
-.PHONY : layers/absval_layer.cpp.s
-
-layers/accuracy_layer.o: layers/accuracy_layer.cpp.o
-.PHONY : layers/accuracy_layer.o
-
-# target to build an object file
-layers/accuracy_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.o
-.PHONY : layers/accuracy_layer.cpp.o
-
-layers/accuracy_layer.i: layers/accuracy_layer.cpp.i
-.PHONY : layers/accuracy_layer.i
-
-# target to preprocess a source file
-layers/accuracy_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.i
-.PHONY : layers/accuracy_layer.cpp.i
-
-layers/accuracy_layer.s: layers/accuracy_layer.cpp.s
-.PHONY : layers/accuracy_layer.s
-
-# target to generate assembly for a file
-layers/accuracy_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/accuracy_layer.cpp.s
-.PHONY : layers/accuracy_layer.cpp.s
-
-layers/argmax_layer.o: layers/argmax_layer.cpp.o
-.PHONY : layers/argmax_layer.o
-
-# target to build an object file
-layers/argmax_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.o
-.PHONY : layers/argmax_layer.cpp.o
-
-layers/argmax_layer.i: layers/argmax_layer.cpp.i
-.PHONY : layers/argmax_layer.i
-
-# target to preprocess a source file
-layers/argmax_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.i
-.PHONY : layers/argmax_layer.cpp.i
-
-layers/argmax_layer.s: layers/argmax_layer.cpp.s
-.PHONY : layers/argmax_layer.s
-
-# target to generate assembly for a file
-layers/argmax_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/argmax_layer.cpp.s
-.PHONY : layers/argmax_layer.cpp.s
-
-layers/base_conv_layer.o: layers/base_conv_layer.cpp.o
-.PHONY : layers/base_conv_layer.o
-
-# target to build an object file
-layers/base_conv_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.o
-.PHONY : layers/base_conv_layer.cpp.o
-
-layers/base_conv_layer.i: layers/base_conv_layer.cpp.i
-.PHONY : layers/base_conv_layer.i
-
-# target to preprocess a source file
-layers/base_conv_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.i
-.PHONY : layers/base_conv_layer.cpp.i
-
-layers/base_conv_layer.s: layers/base_conv_layer.cpp.s
-.PHONY : layers/base_conv_layer.s
-
-# target to generate assembly for a file
-layers/base_conv_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_conv_layer.cpp.s
-.PHONY : layers/base_conv_layer.cpp.s
-
-layers/base_data_layer.o: layers/base_data_layer.cpp.o
-.PHONY : layers/base_data_layer.o
-
-# target to build an object file
-layers/base_data_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.o
-.PHONY : layers/base_data_layer.cpp.o
-
-layers/base_data_layer.i: layers/base_data_layer.cpp.i
-.PHONY : layers/base_data_layer.i
-
-# target to preprocess a source file
-layers/base_data_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.i
-.PHONY : layers/base_data_layer.cpp.i
-
-layers/base_data_layer.s: layers/base_data_layer.cpp.s
-.PHONY : layers/base_data_layer.s
-
-# target to generate assembly for a file
-layers/base_data_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/base_data_layer.cpp.s
-.PHONY : layers/base_data_layer.cpp.s
-
-layers/bnll_layer.o: layers/bnll_layer.cpp.o
-.PHONY : layers/bnll_layer.o
-
-# target to build an object file
-layers/bnll_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.o
-.PHONY : layers/bnll_layer.cpp.o
-
-layers/bnll_layer.i: layers/bnll_layer.cpp.i
-.PHONY : layers/bnll_layer.i
-
-# target to preprocess a source file
-layers/bnll_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.i
-.PHONY : layers/bnll_layer.cpp.i
-
-layers/bnll_layer.s: layers/bnll_layer.cpp.s
-.PHONY : layers/bnll_layer.s
-
-# target to generate assembly for a file
-layers/bnll_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/bnll_layer.cpp.s
-.PHONY : layers/bnll_layer.cpp.s
-
-layers/concat_layer.o: layers/concat_layer.cpp.o
-.PHONY : layers/concat_layer.o
-
-# target to build an object file
-layers/concat_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.o
-.PHONY : layers/concat_layer.cpp.o
-
-layers/concat_layer.i: layers/concat_layer.cpp.i
-.PHONY : layers/concat_layer.i
-
-# target to preprocess a source file
-layers/concat_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.i
-.PHONY : layers/concat_layer.cpp.i
-
-layers/concat_layer.s: layers/concat_layer.cpp.s
-.PHONY : layers/concat_layer.s
-
-# target to generate assembly for a file
-layers/concat_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/concat_layer.cpp.s
-.PHONY : layers/concat_layer.cpp.s
-
-layers/contrastive_loss_layer.o: layers/contrastive_loss_layer.cpp.o
-.PHONY : layers/contrastive_loss_layer.o
-
-# target to build an object file
-layers/contrastive_loss_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.o
-.PHONY : layers/contrastive_loss_layer.cpp.o
-
-layers/contrastive_loss_layer.i: layers/contrastive_loss_layer.cpp.i
-.PHONY : layers/contrastive_loss_layer.i
-
-# target to preprocess a source file
-layers/contrastive_loss_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.i
-.PHONY : layers/contrastive_loss_layer.cpp.i
-
-layers/contrastive_loss_layer.s: layers/contrastive_loss_layer.cpp.s
-.PHONY : layers/contrastive_loss_layer.s
-
-# target to generate assembly for a file
-layers/contrastive_loss_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/contrastive_loss_layer.cpp.s
-.PHONY : layers/contrastive_loss_layer.cpp.s
-
-layers/conv_layer.o: layers/conv_layer.cpp.o
-.PHONY : layers/conv_layer.o
-
-# target to build an object file
-layers/conv_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.o
-.PHONY : layers/conv_layer.cpp.o
-
-layers/conv_layer.i: layers/conv_layer.cpp.i
-.PHONY : layers/conv_layer.i
-
-# target to preprocess a source file
-layers/conv_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.i
-.PHONY : layers/conv_layer.cpp.i
-
-layers/conv_layer.s: layers/conv_layer.cpp.s
-.PHONY : layers/conv_layer.s
-
-# target to generate assembly for a file
-layers/conv_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/conv_layer.cpp.s
-.PHONY : layers/conv_layer.cpp.s
-
-layers/cudnn_conv_layer.o: layers/cudnn_conv_layer.cpp.o
-.PHONY : layers/cudnn_conv_layer.o
-
-# target to build an object file
-layers/cudnn_conv_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.o
-.PHONY : layers/cudnn_conv_layer.cpp.o
-
-layers/cudnn_conv_layer.i: layers/cudnn_conv_layer.cpp.i
-.PHONY : layers/cudnn_conv_layer.i
-
-# target to preprocess a source file
-layers/cudnn_conv_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.i
-.PHONY : layers/cudnn_conv_layer.cpp.i
-
-layers/cudnn_conv_layer.s: layers/cudnn_conv_layer.cpp.s
-.PHONY : layers/cudnn_conv_layer.s
-
-# target to generate assembly for a file
-layers/cudnn_conv_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_conv_layer.cpp.s
-.PHONY : layers/cudnn_conv_layer.cpp.s
-
-layers/cudnn_pooling_layer.o: layers/cudnn_pooling_layer.cpp.o
-.PHONY : layers/cudnn_pooling_layer.o
-
-# target to build an object file
-layers/cudnn_pooling_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.o
-.PHONY : layers/cudnn_pooling_layer.cpp.o
-
-layers/cudnn_pooling_layer.i: layers/cudnn_pooling_layer.cpp.i
-.PHONY : layers/cudnn_pooling_layer.i
-
-# target to preprocess a source file
-layers/cudnn_pooling_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.i
-.PHONY : layers/cudnn_pooling_layer.cpp.i
-
-layers/cudnn_pooling_layer.s: layers/cudnn_pooling_layer.cpp.s
-.PHONY : layers/cudnn_pooling_layer.s
-
-# target to generate assembly for a file
-layers/cudnn_pooling_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_pooling_layer.cpp.s
-.PHONY : layers/cudnn_pooling_layer.cpp.s
-
-layers/cudnn_relu_layer.o: layers/cudnn_relu_layer.cpp.o
-.PHONY : layers/cudnn_relu_layer.o
-
-# target to build an object file
-layers/cudnn_relu_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.o
-.PHONY : layers/cudnn_relu_layer.cpp.o
-
-layers/cudnn_relu_layer.i: layers/cudnn_relu_layer.cpp.i
-.PHONY : layers/cudnn_relu_layer.i
-
-# target to preprocess a source file
-layers/cudnn_relu_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.i
-.PHONY : layers/cudnn_relu_layer.cpp.i
-
-layers/cudnn_relu_layer.s: layers/cudnn_relu_layer.cpp.s
-.PHONY : layers/cudnn_relu_layer.s
-
-# target to generate assembly for a file
-layers/cudnn_relu_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_relu_layer.cpp.s
-.PHONY : layers/cudnn_relu_layer.cpp.s
-
-layers/cudnn_sigmoid_layer.o: layers/cudnn_sigmoid_layer.cpp.o
-.PHONY : layers/cudnn_sigmoid_layer.o
-
-# target to build an object file
-layers/cudnn_sigmoid_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.o
-.PHONY : layers/cudnn_sigmoid_layer.cpp.o
-
-layers/cudnn_sigmoid_layer.i: layers/cudnn_sigmoid_layer.cpp.i
-.PHONY : layers/cudnn_sigmoid_layer.i
-
-# target to preprocess a source file
-layers/cudnn_sigmoid_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.i
-.PHONY : layers/cudnn_sigmoid_layer.cpp.i
-
-layers/cudnn_sigmoid_layer.s: layers/cudnn_sigmoid_layer.cpp.s
-.PHONY : layers/cudnn_sigmoid_layer.s
-
-# target to generate assembly for a file
-layers/cudnn_sigmoid_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_sigmoid_layer.cpp.s
-.PHONY : layers/cudnn_sigmoid_layer.cpp.s
-
-layers/cudnn_softmax_layer.o: layers/cudnn_softmax_layer.cpp.o
-.PHONY : layers/cudnn_softmax_layer.o
-
-# target to build an object file
-layers/cudnn_softmax_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.o
-.PHONY : layers/cudnn_softmax_layer.cpp.o
-
-layers/cudnn_softmax_layer.i: layers/cudnn_softmax_layer.cpp.i
-.PHONY : layers/cudnn_softmax_layer.i
-
-# target to preprocess a source file
-layers/cudnn_softmax_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.i
-.PHONY : layers/cudnn_softmax_layer.cpp.i
-
-layers/cudnn_softmax_layer.s: layers/cudnn_softmax_layer.cpp.s
-.PHONY : layers/cudnn_softmax_layer.s
-
-# target to generate assembly for a file
-layers/cudnn_softmax_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_softmax_layer.cpp.s
-.PHONY : layers/cudnn_softmax_layer.cpp.s
-
-layers/cudnn_tanh_layer.o: layers/cudnn_tanh_layer.cpp.o
-.PHONY : layers/cudnn_tanh_layer.o
-
-# target to build an object file
-layers/cudnn_tanh_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.o
-.PHONY : layers/cudnn_tanh_layer.cpp.o
-
-layers/cudnn_tanh_layer.i: layers/cudnn_tanh_layer.cpp.i
-.PHONY : layers/cudnn_tanh_layer.i
-
-# target to preprocess a source file
-layers/cudnn_tanh_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.i
-.PHONY : layers/cudnn_tanh_layer.cpp.i
-
-layers/cudnn_tanh_layer.s: layers/cudnn_tanh_layer.cpp.s
-.PHONY : layers/cudnn_tanh_layer.s
-
-# target to generate assembly for a file
-layers/cudnn_tanh_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/cudnn_tanh_layer.cpp.s
-.PHONY : layers/cudnn_tanh_layer.cpp.s
-
-layers/data_layer.o: layers/data_layer.cpp.o
-.PHONY : layers/data_layer.o
-
-# target to build an object file
-layers/data_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.o
-.PHONY : layers/data_layer.cpp.o
-
-layers/data_layer.i: layers/data_layer.cpp.i
-.PHONY : layers/data_layer.i
-
-# target to preprocess a source file
-layers/data_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.i
-.PHONY : layers/data_layer.cpp.i
-
-layers/data_layer.s: layers/data_layer.cpp.s
-.PHONY : layers/data_layer.s
-
-# target to generate assembly for a file
-layers/data_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/data_layer.cpp.s
-.PHONY : layers/data_layer.cpp.s
-
-layers/deconv_layer.o: layers/deconv_layer.cpp.o
-.PHONY : layers/deconv_layer.o
-
-# target to build an object file
-layers/deconv_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.o
-.PHONY : layers/deconv_layer.cpp.o
-
-layers/deconv_layer.i: layers/deconv_layer.cpp.i
-.PHONY : layers/deconv_layer.i
-
-# target to preprocess a source file
-layers/deconv_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.i
-.PHONY : layers/deconv_layer.cpp.i
-
-layers/deconv_layer.s: layers/deconv_layer.cpp.s
-.PHONY : layers/deconv_layer.s
-
-# target to generate assembly for a file
-layers/deconv_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/deconv_layer.cpp.s
-.PHONY : layers/deconv_layer.cpp.s
-
-layers/dropout_layer.o: layers/dropout_layer.cpp.o
-.PHONY : layers/dropout_layer.o
-
-# target to build an object file
-layers/dropout_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.o
-.PHONY : layers/dropout_layer.cpp.o
-
-layers/dropout_layer.i: layers/dropout_layer.cpp.i
-.PHONY : layers/dropout_layer.i
-
-# target to preprocess a source file
-layers/dropout_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.i
-.PHONY : layers/dropout_layer.cpp.i
-
-layers/dropout_layer.s: layers/dropout_layer.cpp.s
-.PHONY : layers/dropout_layer.s
-
-# target to generate assembly for a file
-layers/dropout_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dropout_layer.cpp.s
-.PHONY : layers/dropout_layer.cpp.s
-
-layers/dummy_data_layer.o: layers/dummy_data_layer.cpp.o
-.PHONY : layers/dummy_data_layer.o
-
-# target to build an object file
-layers/dummy_data_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.o
-.PHONY : layers/dummy_data_layer.cpp.o
-
-layers/dummy_data_layer.i: layers/dummy_data_layer.cpp.i
-.PHONY : layers/dummy_data_layer.i
-
-# target to preprocess a source file
-layers/dummy_data_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.i
-.PHONY : layers/dummy_data_layer.cpp.i
-
-layers/dummy_data_layer.s: layers/dummy_data_layer.cpp.s
-.PHONY : layers/dummy_data_layer.s
-
-# target to generate assembly for a file
-layers/dummy_data_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/dummy_data_layer.cpp.s
-.PHONY : layers/dummy_data_layer.cpp.s
-
-layers/eltwise_layer.o: layers/eltwise_layer.cpp.o
-.PHONY : layers/eltwise_layer.o
-
-# target to build an object file
-layers/eltwise_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.o
-.PHONY : layers/eltwise_layer.cpp.o
-
-layers/eltwise_layer.i: layers/eltwise_layer.cpp.i
-.PHONY : layers/eltwise_layer.i
-
-# target to preprocess a source file
-layers/eltwise_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.i
-.PHONY : layers/eltwise_layer.cpp.i
-
-layers/eltwise_layer.s: layers/eltwise_layer.cpp.s
-.PHONY : layers/eltwise_layer.s
-
-# target to generate assembly for a file
-layers/eltwise_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/eltwise_layer.cpp.s
-.PHONY : layers/eltwise_layer.cpp.s
-
-layers/euclidean_loss_layer.o: layers/euclidean_loss_layer.cpp.o
-.PHONY : layers/euclidean_loss_layer.o
-
-# target to build an object file
-layers/euclidean_loss_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.o
-.PHONY : layers/euclidean_loss_layer.cpp.o
-
-layers/euclidean_loss_layer.i: layers/euclidean_loss_layer.cpp.i
-.PHONY : layers/euclidean_loss_layer.i
-
-# target to preprocess a source file
-layers/euclidean_loss_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.i
-.PHONY : layers/euclidean_loss_layer.cpp.i
-
-layers/euclidean_loss_layer.s: layers/euclidean_loss_layer.cpp.s
-.PHONY : layers/euclidean_loss_layer.s
-
-# target to generate assembly for a file
-layers/euclidean_loss_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/euclidean_loss_layer.cpp.s
-.PHONY : layers/euclidean_loss_layer.cpp.s
-
-layers/exp_layer.o: layers/exp_layer.cpp.o
-.PHONY : layers/exp_layer.o
-
-# target to build an object file
-layers/exp_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.o
-.PHONY : layers/exp_layer.cpp.o
-
-layers/exp_layer.i: layers/exp_layer.cpp.i
-.PHONY : layers/exp_layer.i
-
-# target to preprocess a source file
-layers/exp_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.i
-.PHONY : layers/exp_layer.cpp.i
-
-layers/exp_layer.s: layers/exp_layer.cpp.s
-.PHONY : layers/exp_layer.s
-
-# target to generate assembly for a file
-layers/exp_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/exp_layer.cpp.s
-.PHONY : layers/exp_layer.cpp.s
-
-layers/filter_layer.o: layers/filter_layer.cpp.o
-.PHONY : layers/filter_layer.o
-
-# target to build an object file
-layers/filter_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.o
-.PHONY : layers/filter_layer.cpp.o
-
-layers/filter_layer.i: layers/filter_layer.cpp.i
-.PHONY : layers/filter_layer.i
-
-# target to preprocess a source file
-layers/filter_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.i
-.PHONY : layers/filter_layer.cpp.i
-
-layers/filter_layer.s: layers/filter_layer.cpp.s
-.PHONY : layers/filter_layer.s
-
-# target to generate assembly for a file
-layers/filter_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/filter_layer.cpp.s
-.PHONY : layers/filter_layer.cpp.s
-
-layers/flatten_layer.o: layers/flatten_layer.cpp.o
-.PHONY : layers/flatten_layer.o
-
-# target to build an object file
-layers/flatten_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.o
-.PHONY : layers/flatten_layer.cpp.o
-
-layers/flatten_layer.i: layers/flatten_layer.cpp.i
-.PHONY : layers/flatten_layer.i
-
-# target to preprocess a source file
-layers/flatten_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.i
-.PHONY : layers/flatten_layer.cpp.i
-
-layers/flatten_layer.s: layers/flatten_layer.cpp.s
-.PHONY : layers/flatten_layer.s
-
-# target to generate assembly for a file
-layers/flatten_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/flatten_layer.cpp.s
-.PHONY : layers/flatten_layer.cpp.s
-
-layers/hdf5_data_layer.o: layers/hdf5_data_layer.cpp.o
-.PHONY : layers/hdf5_data_layer.o
-
-# target to build an object file
-layers/hdf5_data_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.o
-.PHONY : layers/hdf5_data_layer.cpp.o
-
-layers/hdf5_data_layer.i: layers/hdf5_data_layer.cpp.i
-.PHONY : layers/hdf5_data_layer.i
-
-# target to preprocess a source file
-layers/hdf5_data_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.i
-.PHONY : layers/hdf5_data_layer.cpp.i
-
-layers/hdf5_data_layer.s: layers/hdf5_data_layer.cpp.s
-.PHONY : layers/hdf5_data_layer.s
-
-# target to generate assembly for a file
-layers/hdf5_data_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_data_layer.cpp.s
-.PHONY : layers/hdf5_data_layer.cpp.s
-
-layers/hdf5_output_layer.o: layers/hdf5_output_layer.cpp.o
-.PHONY : layers/hdf5_output_layer.o
-
-# target to build an object file
-layers/hdf5_output_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.o
-.PHONY : layers/hdf5_output_layer.cpp.o
-
-layers/hdf5_output_layer.i: layers/hdf5_output_layer.cpp.i
-.PHONY : layers/hdf5_output_layer.i
-
-# target to preprocess a source file
-layers/hdf5_output_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.i
-.PHONY : layers/hdf5_output_layer.cpp.i
-
-layers/hdf5_output_layer.s: layers/hdf5_output_layer.cpp.s
-.PHONY : layers/hdf5_output_layer.s
-
-# target to generate assembly for a file
-layers/hdf5_output_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hdf5_output_layer.cpp.s
-.PHONY : layers/hdf5_output_layer.cpp.s
-
-layers/hinge_loss_layer.o: layers/hinge_loss_layer.cpp.o
-.PHONY : layers/hinge_loss_layer.o
-
-# target to build an object file
-layers/hinge_loss_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.o
-.PHONY : layers/hinge_loss_layer.cpp.o
-
-layers/hinge_loss_layer.i: layers/hinge_loss_layer.cpp.i
-.PHONY : layers/hinge_loss_layer.i
-
-# target to preprocess a source file
-layers/hinge_loss_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.i
-.PHONY : layers/hinge_loss_layer.cpp.i
-
-layers/hinge_loss_layer.s: layers/hinge_loss_layer.cpp.s
-.PHONY : layers/hinge_loss_layer.s
-
-# target to generate assembly for a file
-layers/hinge_loss_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/hinge_loss_layer.cpp.s
-.PHONY : layers/hinge_loss_layer.cpp.s
-
-layers/im2col_layer.o: layers/im2col_layer.cpp.o
-.PHONY : layers/im2col_layer.o
-
-# target to build an object file
-layers/im2col_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.o
-.PHONY : layers/im2col_layer.cpp.o
-
-layers/im2col_layer.i: layers/im2col_layer.cpp.i
-.PHONY : layers/im2col_layer.i
-
-# target to preprocess a source file
-layers/im2col_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.i
-.PHONY : layers/im2col_layer.cpp.i
-
-layers/im2col_layer.s: layers/im2col_layer.cpp.s
-.PHONY : layers/im2col_layer.s
-
-# target to generate assembly for a file
-layers/im2col_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/im2col_layer.cpp.s
-.PHONY : layers/im2col_layer.cpp.s
-
-layers/image_data_layer.o: layers/image_data_layer.cpp.o
-.PHONY : layers/image_data_layer.o
-
-# target to build an object file
-layers/image_data_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.o
-.PHONY : layers/image_data_layer.cpp.o
-
-layers/image_data_layer.i: layers/image_data_layer.cpp.i
-.PHONY : layers/image_data_layer.i
-
-# target to preprocess a source file
-layers/image_data_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.i
-.PHONY : layers/image_data_layer.cpp.i
-
-layers/image_data_layer.s: layers/image_data_layer.cpp.s
-.PHONY : layers/image_data_layer.s
-
-# target to generate assembly for a file
-layers/image_data_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/image_data_layer.cpp.s
-.PHONY : layers/image_data_layer.cpp.s
-
-layers/infogain_loss_layer.o: layers/infogain_loss_layer.cpp.o
-.PHONY : layers/infogain_loss_layer.o
-
-# target to build an object file
-layers/infogain_loss_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.o
-.PHONY : layers/infogain_loss_layer.cpp.o
-
-layers/infogain_loss_layer.i: layers/infogain_loss_layer.cpp.i
-.PHONY : layers/infogain_loss_layer.i
-
-# target to preprocess a source file
-layers/infogain_loss_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.i
-.PHONY : layers/infogain_loss_layer.cpp.i
-
-layers/infogain_loss_layer.s: layers/infogain_loss_layer.cpp.s
-.PHONY : layers/infogain_loss_layer.s
-
-# target to generate assembly for a file
-layers/infogain_loss_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/infogain_loss_layer.cpp.s
-.PHONY : layers/infogain_loss_layer.cpp.s
-
-layers/inner_product_layer.o: layers/inner_product_layer.cpp.o
-.PHONY : layers/inner_product_layer.o
-
-# target to build an object file
-layers/inner_product_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.o
-.PHONY : layers/inner_product_layer.cpp.o
-
-layers/inner_product_layer.i: layers/inner_product_layer.cpp.i
-.PHONY : layers/inner_product_layer.i
-
-# target to preprocess a source file
-layers/inner_product_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.i
-.PHONY : layers/inner_product_layer.cpp.i
-
-layers/inner_product_layer.s: layers/inner_product_layer.cpp.s
-.PHONY : layers/inner_product_layer.s
-
-# target to generate assembly for a file
-layers/inner_product_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/inner_product_layer.cpp.s
-.PHONY : layers/inner_product_layer.cpp.s
-
-layers/log_layer.o: layers/log_layer.cpp.o
-.PHONY : layers/log_layer.o
-
-# target to build an object file
-layers/log_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.o
-.PHONY : layers/log_layer.cpp.o
-
-layers/log_layer.i: layers/log_layer.cpp.i
-.PHONY : layers/log_layer.i
-
-# target to preprocess a source file
-layers/log_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.i
-.PHONY : layers/log_layer.cpp.i
-
-layers/log_layer.s: layers/log_layer.cpp.s
-.PHONY : layers/log_layer.s
-
-# target to generate assembly for a file
-layers/log_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/log_layer.cpp.s
-.PHONY : layers/log_layer.cpp.s
-
-layers/loss_layer.o: layers/loss_layer.cpp.o
-.PHONY : layers/loss_layer.o
-
-# target to build an object file
-layers/loss_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.o
-.PHONY : layers/loss_layer.cpp.o
-
-layers/loss_layer.i: layers/loss_layer.cpp.i
-.PHONY : layers/loss_layer.i
-
-# target to preprocess a source file
-layers/loss_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.i
-.PHONY : layers/loss_layer.cpp.i
-
-layers/loss_layer.s: layers/loss_layer.cpp.s
-.PHONY : layers/loss_layer.s
-
-# target to generate assembly for a file
-layers/loss_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/loss_layer.cpp.s
-.PHONY : layers/loss_layer.cpp.s
-
-layers/lrn_layer.o: layers/lrn_layer.cpp.o
-.PHONY : layers/lrn_layer.o
-
-# target to build an object file
-layers/lrn_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.o
-.PHONY : layers/lrn_layer.cpp.o
-
-layers/lrn_layer.i: layers/lrn_layer.cpp.i
-.PHONY : layers/lrn_layer.i
-
-# target to preprocess a source file
-layers/lrn_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.i
-.PHONY : layers/lrn_layer.cpp.i
-
-layers/lrn_layer.s: layers/lrn_layer.cpp.s
-.PHONY : layers/lrn_layer.s
-
-# target to generate assembly for a file
-layers/lrn_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/lrn_layer.cpp.s
-.PHONY : layers/lrn_layer.cpp.s
-
-layers/memory_data_layer.o: layers/memory_data_layer.cpp.o
-.PHONY : layers/memory_data_layer.o
-
-# target to build an object file
-layers/memory_data_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.o
-.PHONY : layers/memory_data_layer.cpp.o
-
-layers/memory_data_layer.i: layers/memory_data_layer.cpp.i
-.PHONY : layers/memory_data_layer.i
-
-# target to preprocess a source file
-layers/memory_data_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.i
-.PHONY : layers/memory_data_layer.cpp.i
-
-layers/memory_data_layer.s: layers/memory_data_layer.cpp.s
-.PHONY : layers/memory_data_layer.s
-
-# target to generate assembly for a file
-layers/memory_data_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/memory_data_layer.cpp.s
-.PHONY : layers/memory_data_layer.cpp.s
-
-layers/multinomial_logistic_loss_layer.o: layers/multinomial_logistic_loss_layer.cpp.o
-.PHONY : layers/multinomial_logistic_loss_layer.o
-
-# target to build an object file
-layers/multinomial_logistic_loss_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.o
-.PHONY : layers/multinomial_logistic_loss_layer.cpp.o
-
-layers/multinomial_logistic_loss_layer.i: layers/multinomial_logistic_loss_layer.cpp.i
-.PHONY : layers/multinomial_logistic_loss_layer.i
-
-# target to preprocess a source file
-layers/multinomial_logistic_loss_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.i
-.PHONY : layers/multinomial_logistic_loss_layer.cpp.i
-
-layers/multinomial_logistic_loss_layer.s: layers/multinomial_logistic_loss_layer.cpp.s
-.PHONY : layers/multinomial_logistic_loss_layer.s
-
-# target to generate assembly for a file
-layers/multinomial_logistic_loss_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/multinomial_logistic_loss_layer.cpp.s
-.PHONY : layers/multinomial_logistic_loss_layer.cpp.s
-
-layers/mvn_layer.o: layers/mvn_layer.cpp.o
-.PHONY : layers/mvn_layer.o
-
-# target to build an object file
-layers/mvn_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.o
-.PHONY : layers/mvn_layer.cpp.o
-
-layers/mvn_layer.i: layers/mvn_layer.cpp.i
-.PHONY : layers/mvn_layer.i
-
-# target to preprocess a source file
-layers/mvn_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.i
-.PHONY : layers/mvn_layer.cpp.i
-
-layers/mvn_layer.s: layers/mvn_layer.cpp.s
-.PHONY : layers/mvn_layer.s
-
-# target to generate assembly for a file
-layers/mvn_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/mvn_layer.cpp.s
-.PHONY : layers/mvn_layer.cpp.s
-
-layers/neuron_layer.o: layers/neuron_layer.cpp.o
-.PHONY : layers/neuron_layer.o
-
-# target to build an object file
-layers/neuron_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.o
-.PHONY : layers/neuron_layer.cpp.o
-
-layers/neuron_layer.i: layers/neuron_layer.cpp.i
-.PHONY : layers/neuron_layer.i
-
-# target to preprocess a source file
-layers/neuron_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.i
-.PHONY : layers/neuron_layer.cpp.i
-
-layers/neuron_layer.s: layers/neuron_layer.cpp.s
-.PHONY : layers/neuron_layer.s
-
-# target to generate assembly for a file
-layers/neuron_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/neuron_layer.cpp.s
-.PHONY : layers/neuron_layer.cpp.s
-
-layers/pooling_layer.o: layers/pooling_layer.cpp.o
-.PHONY : layers/pooling_layer.o
-
-# target to build an object file
-layers/pooling_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.o
-.PHONY : layers/pooling_layer.cpp.o
-
-layers/pooling_layer.i: layers/pooling_layer.cpp.i
-.PHONY : layers/pooling_layer.i
-
-# target to preprocess a source file
-layers/pooling_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.i
-.PHONY : layers/pooling_layer.cpp.i
-
-layers/pooling_layer.s: layers/pooling_layer.cpp.s
-.PHONY : layers/pooling_layer.s
-
-# target to generate assembly for a file
-layers/pooling_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/pooling_layer.cpp.s
-.PHONY : layers/pooling_layer.cpp.s
-
-layers/power_layer.o: layers/power_layer.cpp.o
-.PHONY : layers/power_layer.o
-
-# target to build an object file
-layers/power_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.o
-.PHONY : layers/power_layer.cpp.o
-
-layers/power_layer.i: layers/power_layer.cpp.i
-.PHONY : layers/power_layer.i
-
-# target to preprocess a source file
-layers/power_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.i
-.PHONY : layers/power_layer.cpp.i
-
-layers/power_layer.s: layers/power_layer.cpp.s
-.PHONY : layers/power_layer.s
-
-# target to generate assembly for a file
-layers/power_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/power_layer.cpp.s
-.PHONY : layers/power_layer.cpp.s
-
-layers/prelu_layer.o: layers/prelu_layer.cpp.o
-.PHONY : layers/prelu_layer.o
-
-# target to build an object file
-layers/prelu_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.o
-.PHONY : layers/prelu_layer.cpp.o
-
-layers/prelu_layer.i: layers/prelu_layer.cpp.i
-.PHONY : layers/prelu_layer.i
-
-# target to preprocess a source file
-layers/prelu_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.i
-.PHONY : layers/prelu_layer.cpp.i
-
-layers/prelu_layer.s: layers/prelu_layer.cpp.s
-.PHONY : layers/prelu_layer.s
-
-# target to generate assembly for a file
-layers/prelu_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/prelu_layer.cpp.s
-.PHONY : layers/prelu_layer.cpp.s
-
-layers/reduction_layer.o: layers/reduction_layer.cpp.o
-.PHONY : layers/reduction_layer.o
-
-# target to build an object file
-layers/reduction_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.o
-.PHONY : layers/reduction_layer.cpp.o
-
-layers/reduction_layer.i: layers/reduction_layer.cpp.i
-.PHONY : layers/reduction_layer.i
-
-# target to preprocess a source file
-layers/reduction_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.i
-.PHONY : layers/reduction_layer.cpp.i
-
-layers/reduction_layer.s: layers/reduction_layer.cpp.s
-.PHONY : layers/reduction_layer.s
-
-# target to generate assembly for a file
-layers/reduction_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reduction_layer.cpp.s
-.PHONY : layers/reduction_layer.cpp.s
-
-layers/relu_layer.o: layers/relu_layer.cpp.o
-.PHONY : layers/relu_layer.o
-
-# target to build an object file
-layers/relu_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.o
-.PHONY : layers/relu_layer.cpp.o
-
-layers/relu_layer.i: layers/relu_layer.cpp.i
-.PHONY : layers/relu_layer.i
-
-# target to preprocess a source file
-layers/relu_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.i
-.PHONY : layers/relu_layer.cpp.i
-
-layers/relu_layer.s: layers/relu_layer.cpp.s
-.PHONY : layers/relu_layer.s
-
-# target to generate assembly for a file
-layers/relu_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/relu_layer.cpp.s
-.PHONY : layers/relu_layer.cpp.s
-
-layers/reshape_layer.o: layers/reshape_layer.cpp.o
-.PHONY : layers/reshape_layer.o
-
-# target to build an object file
-layers/reshape_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.o
-.PHONY : layers/reshape_layer.cpp.o
-
-layers/reshape_layer.i: layers/reshape_layer.cpp.i
-.PHONY : layers/reshape_layer.i
-
-# target to preprocess a source file
-layers/reshape_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.i
-.PHONY : layers/reshape_layer.cpp.i
-
-layers/reshape_layer.s: layers/reshape_layer.cpp.s
-.PHONY : layers/reshape_layer.s
-
-# target to generate assembly for a file
-layers/reshape_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/reshape_layer.cpp.s
-.PHONY : layers/reshape_layer.cpp.s
-
-layers/sigmoid_cross_entropy_loss_layer.o: layers/sigmoid_cross_entropy_loss_layer.cpp.o
-.PHONY : layers/sigmoid_cross_entropy_loss_layer.o
-
-# target to build an object file
-layers/sigmoid_cross_entropy_loss_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.o
-.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.o
-
-layers/sigmoid_cross_entropy_loss_layer.i: layers/sigmoid_cross_entropy_loss_layer.cpp.i
-.PHONY : layers/sigmoid_cross_entropy_loss_layer.i
-
-# target to preprocess a source file
-layers/sigmoid_cross_entropy_loss_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.i
-.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.i
-
-layers/sigmoid_cross_entropy_loss_layer.s: layers/sigmoid_cross_entropy_loss_layer.cpp.s
-.PHONY : layers/sigmoid_cross_entropy_loss_layer.s
-
-# target to generate assembly for a file
-layers/sigmoid_cross_entropy_loss_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_cross_entropy_loss_layer.cpp.s
-.PHONY : layers/sigmoid_cross_entropy_loss_layer.cpp.s
-
-layers/sigmoid_layer.o: layers/sigmoid_layer.cpp.o
-.PHONY : layers/sigmoid_layer.o
-
-# target to build an object file
-layers/sigmoid_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.o
-.PHONY : layers/sigmoid_layer.cpp.o
-
-layers/sigmoid_layer.i: layers/sigmoid_layer.cpp.i
-.PHONY : layers/sigmoid_layer.i
-
-# target to preprocess a source file
-layers/sigmoid_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.i
-.PHONY : layers/sigmoid_layer.cpp.i
-
-layers/sigmoid_layer.s: layers/sigmoid_layer.cpp.s
-.PHONY : layers/sigmoid_layer.s
-
-# target to generate assembly for a file
-layers/sigmoid_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/sigmoid_layer.cpp.s
-.PHONY : layers/sigmoid_layer.cpp.s
-
-layers/silence_layer.o: layers/silence_layer.cpp.o
-.PHONY : layers/silence_layer.o
-
-# target to build an object file
-layers/silence_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.o
-.PHONY : layers/silence_layer.cpp.o
-
-layers/silence_layer.i: layers/silence_layer.cpp.i
-.PHONY : layers/silence_layer.i
-
-# target to preprocess a source file
-layers/silence_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.i
-.PHONY : layers/silence_layer.cpp.i
-
-layers/silence_layer.s: layers/silence_layer.cpp.s
-.PHONY : layers/silence_layer.s
-
-# target to generate assembly for a file
-layers/silence_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/silence_layer.cpp.s
-.PHONY : layers/silence_layer.cpp.s
-
-layers/slice_layer.o: layers/slice_layer.cpp.o
-.PHONY : layers/slice_layer.o
-
-# target to build an object file
-layers/slice_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.o
-.PHONY : layers/slice_layer.cpp.o
-
-layers/slice_layer.i: layers/slice_layer.cpp.i
-.PHONY : layers/slice_layer.i
-
-# target to preprocess a source file
-layers/slice_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.i
-.PHONY : layers/slice_layer.cpp.i
-
-layers/slice_layer.s: layers/slice_layer.cpp.s
-.PHONY : layers/slice_layer.s
-
-# target to generate assembly for a file
-layers/slice_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/slice_layer.cpp.s
-.PHONY : layers/slice_layer.cpp.s
-
-layers/softmax_layer.o: layers/softmax_layer.cpp.o
-.PHONY : layers/softmax_layer.o
-
-# target to build an object file
-layers/softmax_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.o
-.PHONY : layers/softmax_layer.cpp.o
-
-layers/softmax_layer.i: layers/softmax_layer.cpp.i
-.PHONY : layers/softmax_layer.i
-
-# target to preprocess a source file
-layers/softmax_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.i
-.PHONY : layers/softmax_layer.cpp.i
-
-layers/softmax_layer.s: layers/softmax_layer.cpp.s
-.PHONY : layers/softmax_layer.s
-
-# target to generate assembly for a file
-layers/softmax_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_layer.cpp.s
-.PHONY : layers/softmax_layer.cpp.s
-
-layers/softmax_loss_layer.o: layers/softmax_loss_layer.cpp.o
-.PHONY : layers/softmax_loss_layer.o
-
-# target to build an object file
-layers/softmax_loss_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.o
-.PHONY : layers/softmax_loss_layer.cpp.o
-
-layers/softmax_loss_layer.i: layers/softmax_loss_layer.cpp.i
-.PHONY : layers/softmax_loss_layer.i
-
-# target to preprocess a source file
-layers/softmax_loss_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.i
-.PHONY : layers/softmax_loss_layer.cpp.i
-
-layers/softmax_loss_layer.s: layers/softmax_loss_layer.cpp.s
-.PHONY : layers/softmax_loss_layer.s
-
-# target to generate assembly for a file
-layers/softmax_loss_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/softmax_loss_layer.cpp.s
-.PHONY : layers/softmax_loss_layer.cpp.s
-
-layers/split_layer.o: layers/split_layer.cpp.o
-.PHONY : layers/split_layer.o
-
-# target to build an object file
-layers/split_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.o
-.PHONY : layers/split_layer.cpp.o
-
-layers/split_layer.i: layers/split_layer.cpp.i
-.PHONY : layers/split_layer.i
-
-# target to preprocess a source file
-layers/split_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.i
-.PHONY : layers/split_layer.cpp.i
-
-layers/split_layer.s: layers/split_layer.cpp.s
-.PHONY : layers/split_layer.s
-
-# target to generate assembly for a file
-layers/split_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/split_layer.cpp.s
-.PHONY : layers/split_layer.cpp.s
-
-layers/spp_layer.o: layers/spp_layer.cpp.o
-.PHONY : layers/spp_layer.o
-
-# target to build an object file
-layers/spp_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.o
-.PHONY : layers/spp_layer.cpp.o
-
-layers/spp_layer.i: layers/spp_layer.cpp.i
-.PHONY : layers/spp_layer.i
-
-# target to preprocess a source file
-layers/spp_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.i
-.PHONY : layers/spp_layer.cpp.i
-
-layers/spp_layer.s: layers/spp_layer.cpp.s
-.PHONY : layers/spp_layer.s
-
-# target to generate assembly for a file
-layers/spp_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/spp_layer.cpp.s
-.PHONY : layers/spp_layer.cpp.s
-
-layers/tanh_layer.o: layers/tanh_layer.cpp.o
-.PHONY : layers/tanh_layer.o
-
-# target to build an object file
-layers/tanh_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.o
-.PHONY : layers/tanh_layer.cpp.o
-
-layers/tanh_layer.i: layers/tanh_layer.cpp.i
-.PHONY : layers/tanh_layer.i
-
-# target to preprocess a source file
-layers/tanh_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.i
-.PHONY : layers/tanh_layer.cpp.i
-
-layers/tanh_layer.s: layers/tanh_layer.cpp.s
-.PHONY : layers/tanh_layer.s
-
-# target to generate assembly for a file
-layers/tanh_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/tanh_layer.cpp.s
-.PHONY : layers/tanh_layer.cpp.s
-
-layers/threshold_layer.o: layers/threshold_layer.cpp.o
-.PHONY : layers/threshold_layer.o
-
-# target to build an object file
-layers/threshold_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.o
-.PHONY : layers/threshold_layer.cpp.o
-
-layers/threshold_layer.i: layers/threshold_layer.cpp.i
-.PHONY : layers/threshold_layer.i
-
-# target to preprocess a source file
-layers/threshold_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.i
-.PHONY : layers/threshold_layer.cpp.i
-
-layers/threshold_layer.s: layers/threshold_layer.cpp.s
-.PHONY : layers/threshold_layer.s
-
-# target to generate assembly for a file
-layers/threshold_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/threshold_layer.cpp.s
-.PHONY : layers/threshold_layer.cpp.s
-
-layers/window_data_layer.o: layers/window_data_layer.cpp.o
-.PHONY : layers/window_data_layer.o
-
-# target to build an object file
-layers/window_data_layer.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.o
-.PHONY : layers/window_data_layer.cpp.o
-
-layers/window_data_layer.i: layers/window_data_layer.cpp.i
-.PHONY : layers/window_data_layer.i
-
-# target to preprocess a source file
-layers/window_data_layer.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.i
-.PHONY : layers/window_data_layer.cpp.i
-
-layers/window_data_layer.s: layers/window_data_layer.cpp.s
-.PHONY : layers/window_data_layer.s
-
-# target to generate assembly for a file
-layers/window_data_layer.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/layers/window_data_layer.cpp.s
-.PHONY : layers/window_data_layer.cpp.s
-
-net.o: net.cpp.o
-.PHONY : net.o
-
-# target to build an object file
-net.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.o
-.PHONY : net.cpp.o
-
-net.i: net.cpp.i
-.PHONY : net.i
-
-# target to preprocess a source file
-net.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.i
-.PHONY : net.cpp.i
-
-net.s: net.cpp.s
-.PHONY : net.s
-
-# target to generate assembly for a file
-net.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/net.cpp.s
-.PHONY : net.cpp.s
-
-solver.o: solver.cpp.o
-.PHONY : solver.o
-
-# target to build an object file
-solver.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.o
-.PHONY : solver.cpp.o
-
-solver.i: solver.cpp.i
-.PHONY : solver.i
-
-# target to preprocess a source file
-solver.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.i
-.PHONY : solver.cpp.i
-
-solver.s: solver.cpp.s
-.PHONY : solver.s
-
-# target to generate assembly for a file
-solver.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/solver.cpp.s
-.PHONY : solver.cpp.s
-
-syncedmem.o: syncedmem.cpp.o
-.PHONY : syncedmem.o
-
-# target to build an object file
-syncedmem.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.o
-.PHONY : syncedmem.cpp.o
-
-syncedmem.i: syncedmem.cpp.i
-.PHONY : syncedmem.i
-
-# target to preprocess a source file
-syncedmem.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.i
-.PHONY : syncedmem.cpp.i
-
-syncedmem.s: syncedmem.cpp.s
-.PHONY : syncedmem.s
-
-# target to generate assembly for a file
-syncedmem.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/syncedmem.cpp.s
-.PHONY : syncedmem.cpp.s
-
-util/benchmark.o: util/benchmark.cpp.o
-.PHONY : util/benchmark.o
-
-# target to build an object file
-util/benchmark.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.o
-.PHONY : util/benchmark.cpp.o
-
-util/benchmark.i: util/benchmark.cpp.i
-.PHONY : util/benchmark.i
-
-# target to preprocess a source file
-util/benchmark.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.i
-.PHONY : util/benchmark.cpp.i
-
-util/benchmark.s: util/benchmark.cpp.s
-.PHONY : util/benchmark.s
-
-# target to generate assembly for a file
-util/benchmark.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/benchmark.cpp.s
-.PHONY : util/benchmark.cpp.s
-
-util/cudnn.o: util/cudnn.cpp.o
-.PHONY : util/cudnn.o
-
-# target to build an object file
-util/cudnn.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.o
-.PHONY : util/cudnn.cpp.o
-
-util/cudnn.i: util/cudnn.cpp.i
-.PHONY : util/cudnn.i
-
-# target to preprocess a source file
-util/cudnn.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.i
-.PHONY : util/cudnn.cpp.i
-
-util/cudnn.s: util/cudnn.cpp.s
-.PHONY : util/cudnn.s
-
-# target to generate assembly for a file
-util/cudnn.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/cudnn.cpp.s
-.PHONY : util/cudnn.cpp.s
-
-util/db.o: util/db.cpp.o
-.PHONY : util/db.o
-
-# target to build an object file
-util/db.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.o
-.PHONY : util/db.cpp.o
-
-util/db.i: util/db.cpp.i
-.PHONY : util/db.i
-
-# target to preprocess a source file
-util/db.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.i
-.PHONY : util/db.cpp.i
-
-util/db.s: util/db.cpp.s
-.PHONY : util/db.s
-
-# target to generate assembly for a file
-util/db.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db.cpp.s
-.PHONY : util/db.cpp.s
-
-util/db_leveldb.o: util/db_leveldb.cpp.o
-.PHONY : util/db_leveldb.o
-
-# target to build an object file
-util/db_leveldb.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.o
-.PHONY : util/db_leveldb.cpp.o
-
-util/db_leveldb.i: util/db_leveldb.cpp.i
-.PHONY : util/db_leveldb.i
-
-# target to preprocess a source file
-util/db_leveldb.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.i
-.PHONY : util/db_leveldb.cpp.i
-
-util/db_leveldb.s: util/db_leveldb.cpp.s
-.PHONY : util/db_leveldb.s
-
-# target to generate assembly for a file
-util/db_leveldb.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_leveldb.cpp.s
-.PHONY : util/db_leveldb.cpp.s
-
-util/db_lmdb.o: util/db_lmdb.cpp.o
-.PHONY : util/db_lmdb.o
-
-# target to build an object file
-util/db_lmdb.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.o
-.PHONY : util/db_lmdb.cpp.o
-
-util/db_lmdb.i: util/db_lmdb.cpp.i
-.PHONY : util/db_lmdb.i
-
-# target to preprocess a source file
-util/db_lmdb.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.i
-.PHONY : util/db_lmdb.cpp.i
-
-util/db_lmdb.s: util/db_lmdb.cpp.s
-.PHONY : util/db_lmdb.s
-
-# target to generate assembly for a file
-util/db_lmdb.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/db_lmdb.cpp.s
-.PHONY : util/db_lmdb.cpp.s
-
-util/im2col.o: util/im2col.cpp.o
-.PHONY : util/im2col.o
-
-# target to build an object file
-util/im2col.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.o
-.PHONY : util/im2col.cpp.o
-
-util/im2col.i: util/im2col.cpp.i
-.PHONY : util/im2col.i
-
-# target to preprocess a source file
-util/im2col.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.i
-.PHONY : util/im2col.cpp.i
-
-util/im2col.s: util/im2col.cpp.s
-.PHONY : util/im2col.s
-
-# target to generate assembly for a file
-util/im2col.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/im2col.cpp.s
-.PHONY : util/im2col.cpp.s
-
-util/insert_splits.o: util/insert_splits.cpp.o
-.PHONY : util/insert_splits.o
-
-# target to build an object file
-util/insert_splits.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.o
-.PHONY : util/insert_splits.cpp.o
-
-util/insert_splits.i: util/insert_splits.cpp.i
-.PHONY : util/insert_splits.i
-
-# target to preprocess a source file
-util/insert_splits.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.i
-.PHONY : util/insert_splits.cpp.i
-
-util/insert_splits.s: util/insert_splits.cpp.s
-.PHONY : util/insert_splits.s
-
-# target to generate assembly for a file
-util/insert_splits.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/insert_splits.cpp.s
-.PHONY : util/insert_splits.cpp.s
-
-util/io.o: util/io.cpp.o
-.PHONY : util/io.o
-
-# target to build an object file
-util/io.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.o
-.PHONY : util/io.cpp.o
-
-util/io.i: util/io.cpp.i
-.PHONY : util/io.i
-
-# target to preprocess a source file
-util/io.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.i
-.PHONY : util/io.cpp.i
-
-util/io.s: util/io.cpp.s
-.PHONY : util/io.s
-
-# target to generate assembly for a file
-util/io.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/io.cpp.s
-.PHONY : util/io.cpp.s
-
-util/math_functions.o: util/math_functions.cpp.o
-.PHONY : util/math_functions.o
-
-# target to build an object file
-util/math_functions.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.o
-.PHONY : util/math_functions.cpp.o
-
-util/math_functions.i: util/math_functions.cpp.i
-.PHONY : util/math_functions.i
-
-# target to preprocess a source file
-util/math_functions.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.i
-.PHONY : util/math_functions.cpp.i
-
-util/math_functions.s: util/math_functions.cpp.s
-.PHONY : util/math_functions.s
-
-# target to generate assembly for a file
-util/math_functions.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/math_functions.cpp.s
-.PHONY : util/math_functions.cpp.s
-
-util/ocl_util.o: util/ocl_util.cpp.o
-.PHONY : util/ocl_util.o
-
-# target to build an object file
-util/ocl_util.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.o
-.PHONY : util/ocl_util.cpp.o
-
-util/ocl_util.i: util/ocl_util.cpp.i
-.PHONY : util/ocl_util.i
-
-# target to preprocess a source file
-util/ocl_util.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.i
-.PHONY : util/ocl_util.cpp.i
-
-util/ocl_util.s: util/ocl_util.cpp.s
-.PHONY : util/ocl_util.s
-
-# target to generate assembly for a file
-util/ocl_util.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_util.cpp.s
-.PHONY : util/ocl_util.cpp.s
-
-util/ocl_wrapper.o: util/ocl_wrapper.cpp.o
-.PHONY : util/ocl_wrapper.o
-
-# target to build an object file
-util/ocl_wrapper.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.o
-.PHONY : util/ocl_wrapper.cpp.o
-
-util/ocl_wrapper.i: util/ocl_wrapper.cpp.i
-.PHONY : util/ocl_wrapper.i
-
-# target to preprocess a source file
-util/ocl_wrapper.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.i
-.PHONY : util/ocl_wrapper.cpp.i
-
-util/ocl_wrapper.s: util/ocl_wrapper.cpp.s
-.PHONY : util/ocl_wrapper.s
-
-# target to generate assembly for a file
-util/ocl_wrapper.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/ocl_wrapper.cpp.s
-.PHONY : util/ocl_wrapper.cpp.s
-
-util/upgrade_proto.o: util/upgrade_proto.cpp.o
-.PHONY : util/upgrade_proto.o
-
-# target to build an object file
-util/upgrade_proto.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.o
-.PHONY : util/upgrade_proto.cpp.o
-
-util/upgrade_proto.i: util/upgrade_proto.cpp.i
-.PHONY : util/upgrade_proto.i
-
-# target to preprocess a source file
-util/upgrade_proto.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.i
-.PHONY : util/upgrade_proto.cpp.i
-
-util/upgrade_proto.s: util/upgrade_proto.cpp.s
-.PHONY : util/upgrade_proto.s
-
-# target to generate assembly for a file
-util/upgrade_proto.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/CMakeFiles/caffe.dir/build.make src/caffe/CMakeFiles/caffe.dir/util/upgrade_proto.cpp.s
-.PHONY : util/upgrade_proto.cpp.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... caffe"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... proto"
-	@echo "... rebuild_cache"
-	@echo "... __/__/include/caffe/proto/caffe.pb.o"
-	@echo "... __/__/include/caffe/proto/caffe.pb.i"
-	@echo "... __/__/include/caffe/proto/caffe.pb.s"
-	@echo "... blob.o"
-	@echo "... blob.i"
-	@echo "... blob.s"
-	@echo "... common.o"
-	@echo "... common.i"
-	@echo "... common.s"
-	@echo "... data_transformer.o"
-	@echo "... data_transformer.i"
-	@echo "... data_transformer.s"
-	@echo "... device.o"
-	@echo "... device.i"
-	@echo "... device.s"
-	@echo "... internal_thread.o"
-	@echo "... internal_thread.i"
-	@echo "... internal_thread.s"
-	@echo "... layer_factory.o"
-	@echo "... layer_factory.i"
-	@echo "... layer_factory.s"
-	@echo "... layers/absval_layer.o"
-	@echo "... layers/absval_layer.i"
-	@echo "... layers/absval_layer.s"
-	@echo "... layers/accuracy_layer.o"
-	@echo "... layers/accuracy_layer.i"
-	@echo "... layers/accuracy_layer.s"
-	@echo "... layers/argmax_layer.o"
-	@echo "... layers/argmax_layer.i"
-	@echo "... layers/argmax_layer.s"
-	@echo "... layers/base_conv_layer.o"
-	@echo "... layers/base_conv_layer.i"
-	@echo "... layers/base_conv_layer.s"
-	@echo "... layers/base_data_layer.o"
-	@echo "... layers/base_data_layer.i"
-	@echo "... layers/base_data_layer.s"
-	@echo "... layers/bnll_layer.o"
-	@echo "... layers/bnll_layer.i"
-	@echo "... layers/bnll_layer.s"
-	@echo "... layers/concat_layer.o"
-	@echo "... layers/concat_layer.i"
-	@echo "... layers/concat_layer.s"
-	@echo "... layers/contrastive_loss_layer.o"
-	@echo "... layers/contrastive_loss_layer.i"
-	@echo "... layers/contrastive_loss_layer.s"
-	@echo "... layers/conv_layer.o"
-	@echo "... layers/conv_layer.i"
-	@echo "... layers/conv_layer.s"
-	@echo "... layers/cudnn_conv_layer.o"
-	@echo "... layers/cudnn_conv_layer.i"
-	@echo "... layers/cudnn_conv_layer.s"
-	@echo "... layers/cudnn_pooling_layer.o"
-	@echo "... layers/cudnn_pooling_layer.i"
-	@echo "... layers/cudnn_pooling_layer.s"
-	@echo "... layers/cudnn_relu_layer.o"
-	@echo "... layers/cudnn_relu_layer.i"
-	@echo "... layers/cudnn_relu_layer.s"
-	@echo "... layers/cudnn_sigmoid_layer.o"
-	@echo "... layers/cudnn_sigmoid_layer.i"
-	@echo "... layers/cudnn_sigmoid_layer.s"
-	@echo "... layers/cudnn_softmax_layer.o"
-	@echo "... layers/cudnn_softmax_layer.i"
-	@echo "... layers/cudnn_softmax_layer.s"
-	@echo "... layers/cudnn_tanh_layer.o"
-	@echo "... layers/cudnn_tanh_layer.i"
-	@echo "... layers/cudnn_tanh_layer.s"
-	@echo "... layers/data_layer.o"
-	@echo "... layers/data_layer.i"
-	@echo "... layers/data_layer.s"
-	@echo "... layers/deconv_layer.o"
-	@echo "... layers/deconv_layer.i"
-	@echo "... layers/deconv_layer.s"
-	@echo "... layers/dropout_layer.o"
-	@echo "... layers/dropout_layer.i"
-	@echo "... layers/dropout_layer.s"
-	@echo "... layers/dummy_data_layer.o"
-	@echo "... layers/dummy_data_layer.i"
-	@echo "... layers/dummy_data_layer.s"
-	@echo "... layers/eltwise_layer.o"
-	@echo "... layers/eltwise_layer.i"
-	@echo "... layers/eltwise_layer.s"
-	@echo "... layers/euclidean_loss_layer.o"
-	@echo "... layers/euclidean_loss_layer.i"
-	@echo "... layers/euclidean_loss_layer.s"
-	@echo "... layers/exp_layer.o"
-	@echo "... layers/exp_layer.i"
-	@echo "... layers/exp_layer.s"
-	@echo "... layers/filter_layer.o"
-	@echo "... layers/filter_layer.i"
-	@echo "... layers/filter_layer.s"
-	@echo "... layers/flatten_layer.o"
-	@echo "... layers/flatten_layer.i"
-	@echo "... layers/flatten_layer.s"
-	@echo "... layers/hdf5_data_layer.o"
-	@echo "... layers/hdf5_data_layer.i"
-	@echo "... layers/hdf5_data_layer.s"
-	@echo "... layers/hdf5_output_layer.o"
-	@echo "... layers/hdf5_output_layer.i"
-	@echo "... layers/hdf5_output_layer.s"
-	@echo "... layers/hinge_loss_layer.o"
-	@echo "... layers/hinge_loss_layer.i"
-	@echo "... layers/hinge_loss_layer.s"
-	@echo "... layers/im2col_layer.o"
-	@echo "... layers/im2col_layer.i"
-	@echo "... layers/im2col_layer.s"
-	@echo "... layers/image_data_layer.o"
-	@echo "... layers/image_data_layer.i"
-	@echo "... layers/image_data_layer.s"
-	@echo "... layers/infogain_loss_layer.o"
-	@echo "... layers/infogain_loss_layer.i"
-	@echo "... layers/infogain_loss_layer.s"
-	@echo "... layers/inner_product_layer.o"
-	@echo "... layers/inner_product_layer.i"
-	@echo "... layers/inner_product_layer.s"
-	@echo "... layers/log_layer.o"
-	@echo "... layers/log_layer.i"
-	@echo "... layers/log_layer.s"
-	@echo "... layers/loss_layer.o"
-	@echo "... layers/loss_layer.i"
-	@echo "... layers/loss_layer.s"
-	@echo "... layers/lrn_layer.o"
-	@echo "... layers/lrn_layer.i"
-	@echo "... layers/lrn_layer.s"
-	@echo "... layers/memory_data_layer.o"
-	@echo "... layers/memory_data_layer.i"
-	@echo "... layers/memory_data_layer.s"
-	@echo "... layers/multinomial_logistic_loss_layer.o"
-	@echo "... layers/multinomial_logistic_loss_layer.i"
-	@echo "... layers/multinomial_logistic_loss_layer.s"
-	@echo "... layers/mvn_layer.o"
-	@echo "... layers/mvn_layer.i"
-	@echo "... layers/mvn_layer.s"
-	@echo "... layers/neuron_layer.o"
-	@echo "... layers/neuron_layer.i"
-	@echo "... layers/neuron_layer.s"
-	@echo "... layers/pooling_layer.o"
-	@echo "... layers/pooling_layer.i"
-	@echo "... layers/pooling_layer.s"
-	@echo "... layers/power_layer.o"
-	@echo "... layers/power_layer.i"
-	@echo "... layers/power_layer.s"
-	@echo "... layers/prelu_layer.o"
-	@echo "... layers/prelu_layer.i"
-	@echo "... layers/prelu_layer.s"
-	@echo "... layers/reduction_layer.o"
-	@echo "... layers/reduction_layer.i"
-	@echo "... layers/reduction_layer.s"
-	@echo "... layers/relu_layer.o"
-	@echo "... layers/relu_layer.i"
-	@echo "... layers/relu_layer.s"
-	@echo "... layers/reshape_layer.o"
-	@echo "... layers/reshape_layer.i"
-	@echo "... layers/reshape_layer.s"
-	@echo "... layers/sigmoid_cross_entropy_loss_layer.o"
-	@echo "... layers/sigmoid_cross_entropy_loss_layer.i"
-	@echo "... layers/sigmoid_cross_entropy_loss_layer.s"
-	@echo "... layers/sigmoid_layer.o"
-	@echo "... layers/sigmoid_layer.i"
-	@echo "... layers/sigmoid_layer.s"
-	@echo "... layers/silence_layer.o"
-	@echo "... layers/silence_layer.i"
-	@echo "... layers/silence_layer.s"
-	@echo "... layers/slice_layer.o"
-	@echo "... layers/slice_layer.i"
-	@echo "... layers/slice_layer.s"
-	@echo "... layers/softmax_layer.o"
-	@echo "... layers/softmax_layer.i"
-	@echo "... layers/softmax_layer.s"
-	@echo "... layers/softmax_loss_layer.o"
-	@echo "... layers/softmax_loss_layer.i"
-	@echo "... layers/softmax_loss_layer.s"
-	@echo "... layers/split_layer.o"
-	@echo "... layers/split_layer.i"
-	@echo "... layers/split_layer.s"
-	@echo "... layers/spp_layer.o"
-	@echo "... layers/spp_layer.i"
-	@echo "... layers/spp_layer.s"
-	@echo "... layers/tanh_layer.o"
-	@echo "... layers/tanh_layer.i"
-	@echo "... layers/tanh_layer.s"
-	@echo "... layers/threshold_layer.o"
-	@echo "... layers/threshold_layer.i"
-	@echo "... layers/threshold_layer.s"
-	@echo "... layers/window_data_layer.o"
-	@echo "... layers/window_data_layer.i"
-	@echo "... layers/window_data_layer.s"
-	@echo "... net.o"
-	@echo "... net.i"
-	@echo "... net.s"
-	@echo "... solver.o"
-	@echo "... solver.i"
-	@echo "... solver.s"
-	@echo "... syncedmem.o"
-	@echo "... syncedmem.i"
-	@echo "... syncedmem.s"
-	@echo "... util/benchmark.o"
-	@echo "... util/benchmark.i"
-	@echo "... util/benchmark.s"
-	@echo "... util/cudnn.o"
-	@echo "... util/cudnn.i"
-	@echo "... util/cudnn.s"
-	@echo "... util/db.o"
-	@echo "... util/db.i"
-	@echo "... util/db.s"
-	@echo "... util/db_leveldb.o"
-	@echo "... util/db_leveldb.i"
-	@echo "... util/db_leveldb.s"
-	@echo "... util/db_lmdb.o"
-	@echo "... util/db_lmdb.i"
-	@echo "... util/db_lmdb.s"
-	@echo "... util/im2col.o"
-	@echo "... util/im2col.i"
-	@echo "... util/im2col.s"
-	@echo "... util/insert_splits.o"
-	@echo "... util/insert_splits.i"
-	@echo "... util/insert_splits.s"
-	@echo "... util/io.o"
-	@echo "... util/io.i"
-	@echo "... util/io.s"
-	@echo "... util/math_functions.o"
-	@echo "... util/math_functions.i"
-	@echo "... util/math_functions.s"
-	@echo "... util/ocl_util.o"
-	@echo "... util/ocl_util.i"
-	@echo "... util/ocl_util.s"
-	@echo "... util/ocl_wrapper.o"
-	@echo "... util/ocl_wrapper.i"
-	@echo "... util/ocl_wrapper.s"
-	@echo "... util/upgrade_proto.o"
-	@echo "... util/upgrade_proto.i"
-	@echo "... util/upgrade_proto.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake b/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 7bb0014c..00000000
--- a/src/gtest/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# Relative path conversion top directories.
-SET(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
-SET(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/yugao/caffe-merge-junli/caffe-yb/caffe")
-
-# Force unix paths in dependencies.
-SET(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-SET(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-SET(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-SET(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-SET(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake b/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake
deleted file mode 100644
index 76e46409..00000000
--- a/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake
+++ /dev/null
@@ -1,32 +0,0 @@
-# The set of languages for which implicit dependencies are needed:
-SET(CMAKE_DEPENDS_LANGUAGES
-  "CXX"
-  )
-# The set of files for implicit dependencies of each language:
-SET(CMAKE_DEPENDS_CHECK_CXX
-  "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp" "/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o"
-  )
-SET(CMAKE_CXX_COMPILER_ID "GNU")
-
-# Preprocessor definitions for this target.
-SET(CMAKE_TARGET_DEFINITIONS
-  "GTEST_USE_OWN_TR1_TUPLE"
-  )
-
-# Targets to which this target links.
-SET(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# The include file search paths:
-SET(CMAKE_C_TARGET_INCLUDE_PATH
-  "src"
-  "/usr/local/include"
-  "include"
-  "/usr/local/cuda/include"
-  "/usr/local/include/opencv"
-  "/usr/include/atlas"
-  "."
-  )
-SET(CMAKE_CXX_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_Fortran_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
-SET(CMAKE_ASM_TARGET_INCLUDE_PATH ${CMAKE_C_TARGET_INCLUDE_PATH})
diff --git a/src/gtest/CMakeFiles/gtest.dir/build.make b/src/gtest/CMakeFiles/gtest.dir/build.make
deleted file mode 100644
index b41ed414..00000000
--- a/src/gtest/CMakeFiles/gtest.dir/build.make
+++ /dev/null
@@ -1,106 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Remove some rules from gmake that .SUFFIXES does not remove.
-SUFFIXES =
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E remove -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The program to use to edit the cache.
-CMAKE_EDIT_COMMAND = /usr/bin/ccmake
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# Include any dependencies generated for this target.
-include src/gtest/CMakeFiles/gtest.dir/depend.make
-
-# Include the progress variables for this target.
-include src/gtest/CMakeFiles/gtest.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include src/gtest/CMakeFiles/gtest.dir/flags.make
-
-src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o: src/gtest/CMakeFiles/gtest.dir/flags.make
-src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o: src/gtest/gtest-all.cpp
-	$(CMAKE_COMMAND) -E cmake_progress_report /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles $(CMAKE_PROGRESS_1)
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Building CXX object src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++   $(CXX_DEFINES) $(CXX_FLAGS) -o CMakeFiles/gtest.dir/gtest-all.cpp.o -c /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp
-
-src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/gtest.dir/gtest-all.cpp.i"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -E /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp > CMakeFiles/gtest.dir/gtest-all.cpp.i
-
-src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/gtest.dir/gtest-all.cpp.s"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && /usr/bin/c++  $(CXX_DEFINES) $(CXX_FLAGS) -S /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/gtest-all.cpp -o CMakeFiles/gtest.dir/gtest-all.cpp.s
-
-src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires:
-.PHONY : src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires
-
-src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires
-	$(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides.build
-.PHONY : src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides
-
-src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.provides.build: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o
-
-# Object files for target gtest
-gtest_OBJECTS = \
-"CMakeFiles/gtest.dir/gtest-all.cpp.o"
-
-# External object files for target gtest
-gtest_EXTERNAL_OBJECTS =
-
-lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o
-lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/build.make
-lib/libgtest.a: src/gtest/CMakeFiles/gtest.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --red --bold "Linking CXX static library ../../lib/libgtest.a"
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -P CMakeFiles/gtest.dir/cmake_clean_target.cmake
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/gtest.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-src/gtest/CMakeFiles/gtest.dir/build: lib/libgtest.a
-.PHONY : src/gtest/CMakeFiles/gtest.dir/build
-
-src/gtest/CMakeFiles/gtest.dir/requires: src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o.requires
-.PHONY : src/gtest/CMakeFiles/gtest.dir/requires
-
-src/gtest/CMakeFiles/gtest.dir/clean:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest && $(CMAKE_COMMAND) -P CMakeFiles/gtest.dir/cmake_clean.cmake
-.PHONY : src/gtest/CMakeFiles/gtest.dir/clean
-
-src/gtest/CMakeFiles/gtest.dir/depend:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest /home/yugao/caffe-merge-junli/caffe-yb/caffe /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/gtest.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : src/gtest/CMakeFiles/gtest.dir/depend
-
diff --git a/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake b/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake
deleted file mode 100644
index 694feb83..00000000
--- a/src/gtest/CMakeFiles/gtest.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,10 +0,0 @@
-FILE(REMOVE_RECURSE
-  "CMakeFiles/gtest.dir/gtest-all.cpp.o"
-  "../../lib/libgtest.pdb"
-  "../../lib/libgtest.a"
-)
-
-# Per-language clean rules from dependency scanning.
-FOREACH(lang CXX)
-  INCLUDE(CMakeFiles/gtest.dir/cmake_clean_${lang}.cmake OPTIONAL)
-ENDFOREACH(lang)
diff --git a/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake b/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake
deleted file mode 100644
index 2c9ec14f..00000000
--- a/src/gtest/CMakeFiles/gtest.dir/cmake_clean_target.cmake
+++ /dev/null
@@ -1,3 +0,0 @@
-FILE(REMOVE_RECURSE
-  "../../lib/libgtest.a"
-)
diff --git a/src/gtest/CMakeFiles/gtest.dir/depend.make b/src/gtest/CMakeFiles/gtest.dir/depend.make
deleted file mode 100644
index 37ac348d..00000000
--- a/src/gtest/CMakeFiles/gtest.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for gtest.
-# This may be replaced when dependencies are built.
diff --git a/src/gtest/CMakeFiles/gtest.dir/flags.make b/src/gtest/CMakeFiles/gtest.dir/flags.make
deleted file mode 100644
index 8b4ef992..00000000
--- a/src/gtest/CMakeFiles/gtest.dir/flags.make
+++ /dev/null
@@ -1,8 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# compile CXX with /usr/bin/c++
-CXX_FLAGS =  -fPIC -Wall -Wno-sign-compare -Wno-uninitialized -O3 -DNDEBUG -I/home/yugao/caffe-merge-junli/caffe-yb/caffe/src -isystem /usr/local/include -isystem /home/yugao/caffe-merge-junli/caffe-yb/caffe/include -isystem /usr/local/cuda/include -isystem /usr/local/include/opencv -isystem /usr/include/atlas -I/home/yugao/caffe-merge-junli/caffe-yb/caffe   
-
-CXX_DEFINES = -DGTEST_USE_OWN_TR1_TUPLE
-
diff --git a/src/gtest/CMakeFiles/gtest.dir/link.txt b/src/gtest/CMakeFiles/gtest.dir/link.txt
deleted file mode 100644
index e5645cfb..00000000
--- a/src/gtest/CMakeFiles/gtest.dir/link.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-/usr/bin/ar cr ../../lib/libgtest.a  CMakeFiles/gtest.dir/gtest-all.cpp.o
-/usr/bin/ranlib ../../lib/libgtest.a
diff --git a/src/gtest/CMakeFiles/gtest.dir/progress.make b/src/gtest/CMakeFiles/gtest.dir/progress.make
deleted file mode 100644
index 143c9b1b..00000000
--- a/src/gtest/CMakeFiles/gtest.dir/progress.make
+++ /dev/null
@@ -1,2 +0,0 @@
-CMAKE_PROGRESS_1 = 65
-
diff --git a/src/gtest/CMakeFiles/progress.marks b/src/gtest/CMakeFiles/progress.marks
deleted file mode 100644
index 573541ac..00000000
--- a/src/gtest/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-0
diff --git a/src/gtest/Makefile b/src/gtest/Makefile
deleted file mode 100644
index d1a96ceb..00000000
--- a/src/gtest/Makefile
+++ /dev/null
@@ -1,212 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 2.8
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Remove some rules from gmake that .SUFFIXES does not remove.
-SUFFIXES =
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E remove -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The program to use to edit the cache.
-CMAKE_EDIT_COMMAND = /usr/bin/ccmake
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
-	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: install/local
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: install/strip
-.PHONY : install/strip/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest/CMakeFiles/progress.marks
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-src/gtest/CMakeFiles/gtest.dir/rule:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/gtest/CMakeFiles/gtest.dir/rule
-.PHONY : src/gtest/CMakeFiles/gtest.dir/rule
-
-# Convenience name for target.
-gtest: src/gtest/CMakeFiles/gtest.dir/rule
-.PHONY : gtest
-
-# fast build rule for target.
-gtest/fast:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/build
-.PHONY : gtest/fast
-
-gtest-all.o: gtest-all.cpp.o
-.PHONY : gtest-all.o
-
-# target to build an object file
-gtest-all.cpp.o:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.o
-.PHONY : gtest-all.cpp.o
-
-gtest-all.i: gtest-all.cpp.i
-.PHONY : gtest-all.i
-
-# target to preprocess a source file
-gtest-all.cpp.i:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.i
-.PHONY : gtest-all.cpp.i
-
-gtest-all.s: gtest-all.cpp.s
-.PHONY : gtest-all.s
-
-# target to generate assembly for a file
-gtest-all.cpp.s:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/gtest/CMakeFiles/gtest.dir/build.make src/gtest/CMakeFiles/gtest.dir/gtest-all.cpp.s
-.PHONY : gtest-all.cpp.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... gtest"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... gtest-all.o"
-	@echo "... gtest-all.i"
-	@echo "... gtest-all.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/src/gtest/cmake_install.cmake b/src/gtest/cmake_install.cmake
deleted file mode 100644
index 14c33dd5..00000000
--- a/src/gtest/cmake_install.cmake
+++ /dev/null
@@ -1,34 +0,0 @@
-# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/gtest
-
-# Set the install prefix
-IF(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install")
-ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX)
-STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  IF(BUILD_TYPE)
-    STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  ELSE(BUILD_TYPE)
-    SET(CMAKE_INSTALL_CONFIG_NAME "Release")
-  ENDIF(BUILD_TYPE)
-  MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-
-# Set the component getting installed.
-IF(NOT CMAKE_INSTALL_COMPONENT)
-  IF(COMPONENT)
-    MESSAGE(STATUS "Install component: \"${COMPONENT}\"")
-    SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  ELSE(COMPONENT)
-    SET(CMAKE_INSTALL_COMPONENT)
-  ENDIF(COMPONENT)
-ENDIF(NOT CMAKE_INSTALL_COMPONENT)
-
-# Install shared libraries without execute permission?
-IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  SET(CMAKE_INSTALL_SO_NO_EXE "1")
-ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-

From 6b9fe7a5719828d3935f8729c9824e67aecce6b7 Mon Sep 17 00:00:00 2001
From: Junli Gu <junli.gu@amd.com>
Date: Thu, 17 Sep 2015 14:28:14 -0700
Subject: [PATCH 114/124] integrate Mauricio's code review suggestions

---
 include/caffe/util/im2col.hpp      | 30 -----------
 src/caffe/layers/dropout_layer.cpp |  2 +
 src/caffe/ocl/im2col.cl            | 75 +++-----------------------
 src/caffe/ocl/random.cl            | 36 ++++---------
 src/caffe/util/im2col.cpp          | 84 +-----------------------------
 src/caffe/util/math_functions.cpp  | 21 +++-----
 6 files changed, 26 insertions(+), 222 deletions(-)

diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index 531b11ad..327d7415 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -52,26 +52,6 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     Dtype* data_col, const int col_offset);
 
-template <typename Dtype>
-void im2col_gpu(const Dtype* data_im, const int channels, const int height,
-    const int width, const int kernel_h, const int kernel_w, const int pad_h,
-    const int pad_w, const int stride_h, const int stride_w, Dtype* data_col);
-
-template <typename Dtype>
-void col2im_gpu(const Dtype* data_col, const int channels, const int height,
-    const int width, const int patch_h, const int patch_w, const int pad_h,
-    const int pad_w, const int stride_h, const int stride_w, Dtype* data_im);
-
-template <typename Dtype>
-void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, Dtype* data_col, const int col_offset);
-
-template <typename Dtype>
-void im2col_16_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, Dtype* data_col, const int col_offset);
-
 template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
     const int channels, const int height, const int width, const int ksize,
@@ -88,16 +68,6 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
     const int channels, const int height, const int width, const int ksize,
     const int pad, const int stride, Dtype* data_im, const int img_offset,
     int optnum);
-
-template <typename Dtype>
-void col2im_gpu_ocl(cl_mem data_col, const int channels, const int height,
-    const int width, const int ksize, const int pad, const int stride,
-    Dtype* data_im, cl_kernel Kernel);
-
-template <typename Dtype>
-void im2col_gpu_ocl(cl_mem data_im, const int channels, const int height,
-    const int width, const int ksize, const int pad, const int stride,
-    Dtype* data_col, cl_kernel Kernel);
 #endif
 }  // namespace caffe
 
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 2cb50ead..905ed6ec 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -82,6 +82,7 @@ void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     // NOLINT_NEXT_LINE(whitespace/operators)
     DropoutForward(count, bottom_data, mask, uint_thres_, scale_, top_data);
   } else {
+    if(bottom_data != top_data)
     caffe_gpu_copy(count, bottom_data, top_data);
   }
 }
@@ -99,6 +100,7 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       // NOLINT_NEXT_LINE(whitespace/operators)
       DropoutBackward(count, top_diff, mask, uint_thres_, scale_, bottom_diff);
     } else {
+      if(bottom_diff != top_diff)
       caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff);
     }
   }
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index 09f240cf..d03463ee 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -23,39 +23,6 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  **************************************************************************************/
-
-template <class T>
-__kernel void im2col(const int n, __global T* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset) {
-  int index=get_global_id(0);
-  data_im = data_im + img_offset;
-  data_col = data_col + col_offset;
-  if(index < n) {
-    int w_out=index %width_col;
-    index /= width_col;
-    int h_out=index%height_col;
-    int channel_in = index/height_col;
-    int channel_out=channel_in *ksize *ksize;
-    int h_in = h_out *stride-pad;
-    int w_in = w_out *stride-pad;
-    data_col +=(channel_out *height_col + h_out) *width_col + w_out;
-    data_im +=(channel_in * height + h_in) *width + w_in;
-    int i=0,j=0;
-    for(i=0;i<ksize;++i) {
-      for(j=0;j<ksize;++j) {
-        int h = h_in+i;
-        int w = w_in+j;
-        if(h >= 0 && w >= 0 && h < height && w < width)
-        *data_col=data_im[i * width + j];
-        else *data_col=0;
-        data_col +=height_col *width_col;
-      }
-    }
-  }
-}
-
-template __attribute__((mangled_name(im2colfloat))) __kernel void im2col(const int n, __global float* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset);
-template __attribute__((mangled_name(im2coldouble))) __kernel void im2col(const int n, __global double* data_im, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset);
-
 template <class T>
 __kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) {
 
@@ -93,7 +60,7 @@ template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_op
 template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum);
 
 template <class T>
-__kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const int img_offset,
+__kernel void im2col(const int n, __global const T* data_im, const int img_offset,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
@@ -127,17 +94,17 @@ __kernel void im2col_gpu_kernel(const int n, __global const T* data_im, const in
   }
 }
 
-template __attribute__((mangled_name(im2col_gpu_kernel_float))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
+template __attribute__((mangled_name(im2col_float))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
     const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     const int height_col, const int width_col, __global float* data_col, const int col_offset);
-template __attribute__((mangled_name(im2col_gpu_kernel_double))) void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
+template __attribute__((mangled_name(im2col_double))) void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
     const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     const int height_col, const int width_col, __global double* data_col, const int col_offset);
 
 template <class T>
-__kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const int col_offset,
+__kernel void col2im(const int n, __global const T* data_col, const int col_offset,
     const int height, const int width, const int channels,
     const int patch_h, const int patch_w,
     const int pad_h, const int pad_w,
@@ -171,46 +138,16 @@ __kernel void col2im_gpu_kernel(const int n, __global const T* data_col, const i
   }
 }
 
-template __attribute__((mangled_name(col2im_gpu_kernel_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
+template __attribute__((mangled_name(col2im_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
     const int height, const int width, const int channels,
     const int patch_h, const int patch_w,const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,const int height_col, const int width_col,
     __global float* data_im, const int img_offset);
-template __attribute__((mangled_name(col2im_gpu_kernel_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
+template __attribute__((mangled_name(col2im_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
     const int col_offset, const int height, const int width, const int channels,
     const int patch_h, const int patch_w, const int pad_h, const int pad_w,
     const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
 
-template <class T>
-__kernel void col2im(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset) {
-  int index = get_global_id(0);
-  data_col = data_col + col_offset;
-  data_im = data_im + img_offset;
-  if(index < n) {
-    T val = 0;
-    int w = index % width + pad;
-    int h = (index / width) % height + pad;
-    int c = index / (width * height);
-    // compute the start and end of the output
-    int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-    int w_col_end = min(w / stride + 1, width_col);
-    int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-    int h_col_end = min(h / stride + 1, height_col);
-    // equivalent implementation
-    int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col;
-    int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
-    int coeff_w_col = (1 - stride * height_col * width_col);
-    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
-      }
-    }
-    data_im[index] = val;
-  }
-}
-template __attribute__((mangled_name(col2imfloat))) __kernel void col2im(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset);
-template __attribute__((mangled_name(col2imdouble))) __kernel void col2im(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset);
-
 template <class T>
 __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) {
   int index = get_global_id(0);
diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl
index 7f8bc5b3..468240f0 100644
--- a/src/caffe/ocl/random.cl
+++ b/src/caffe/ocl/random.cl
@@ -1,33 +1,12 @@
-/*************************************************************************************
- * Copyright (c) 2015, Advanced Micro Devices, Inc.  
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification,
- * are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this 
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice, 
- * this list of conditions and the following disclaimer in the documentation and/or
- *  other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
- * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- **************************************************************************************/
 
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 
-//beginning of the looooooong gpu_random_generator kernel 
-//we use the open sourced threefry's GPU implementation
+//Note: random generator has two parts
+//first part: the open sourced threefy random generator kernel from DE Shaw Research
+//second part. we wrapp the kernel up to generate uniform, bernoulli and gaussion distribution generators.
+
+//begin: the open sourced random generator from DE Shaw Research
+//https://www.deshawresearch.com/resources_random123.html
 typedef uint uint32_t;
 
 struct r123array4x32 {
@@ -803,6 +782,7 @@ inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds,
   }
   return X;
 }
+//end: the open sourced random generator from DE Shaw Research
 
 template <class T>
 __kernel void PRNG_threefry4x32_bernoulli(
@@ -847,6 +827,8 @@ template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_t
 
 //end of the looooooong gpu_random_generator kernel 
 
+//We wrapp the kernel up to generate uniform, bernoulli and gaussion distribution generators.
+
 template <class T>
 __kernel void PRNG_threefry4x32_uniform(
     __global float4 *randomnumber,
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 89985534..6899d15a 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -153,7 +153,7 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     Dtype* data_col, const int col_offset) {
-  std::string kernel_name = "im2col_gpu_kernel" + get_dtype_suffix<Dtype>();
+  std::string kernel_name = "im2col" + get_dtype_suffix<Dtype>();
   cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
   int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
@@ -200,7 +200,7 @@ void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
     const int width,  const int patch_h, const int patch_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     Dtype* data_im, const int img_offset) {
-  std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
+  std::string kernel_name = "col2im" + get_dtype_suffix<Dtype>();
   cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
   int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
@@ -241,46 +241,6 @@ template void col2im_gpu<double>(const double* data_col, const int col_offset,
     const int patch_w, const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im, const int img_offset);
 
-template <typename Dtype>
-void im2col_gpu(cl_kernel Kernel, const Dtype* data_im, const int img_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, Dtype* data_col, const int col_offset) {
-
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
-  int num_kernels = channels * height_col * width_col;
-
-  cl_int ret;
-  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im);
-  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset);
-  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &ksize);
-  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pad);
-  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &stride);
-  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &height_col);
-  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &width_col);
-  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &data_col);
-  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &col_offset);
-
-  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-  size_t uiLocal_Work_Size[] = { 256 };
-  OCL_CHECK(
-      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-  clFinish(amdDevice.CommandQueue);
-}
-
-template void im2col_gpu<float>(cl_kernel Kernel, const float* data_im,
-    const int img_offset, const int channels, const int height, const int width,
-    const int ksize, const int pad, const int stride, float* data_col,
-    const int col_offset);
-template void im2col_gpu<double>(cl_kernel Kernel, const double* data_im,
-    const int img_offset, const int channels, const int height, const int width,
-    const int ksize, const int pad, const int stride, double* data_col,
-    const int col_offset);
-
 template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
     const int channels, const int height, const int width, const int ksize,
@@ -327,45 +287,5 @@ template void im2col_gpu_opt<double>(const double* data_im,
     const int ksize, const int pad, const int stride, double* data_col,
     const int col_offset, int optnum);
 
-template <typename Dtype>
-void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, Dtype* data_im, const int img_offset) {
-  std::string kernel_name = "col2im_gpu_kernel" + get_dtype_suffix<Dtype>();
-  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
-  int num_kernels = channels * height * width;
-
-  cl_int ret;
-  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);
-  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col);
-  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset);
-  ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
-  ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
-  ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
-  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
-  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
-  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
-  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
-  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
-  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im);
-  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset);
-  OCL_CHECK(ret);
-
-  size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
-  size_t uiLocal_Work_Size[] = { 256 };
-  OCL_CHECK(
-      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
-          uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL));
-}
-
-template void col2im_gpu<float>(const float* data_col, const int col_offset,
-    const int channels, const int height, const int width, const int psize,
-    const int pad, const int stride, float* data_im, const int img_offset);
-template void col2im_gpu<double>(const double* data_col, const int col_offset,
-    const int channels, const int height, const int width, const int psize,
-    const int pad, const int stride, double* data_im, const int img_offset);
 #endif
 }  // namespace caffe
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index aebeb5ed..93af3e23 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -463,7 +463,6 @@ void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
   int lda = (TransA == CblasNoTrans) ? K : M;
   int ldb = (TransB == CblasNoTrans) ? N : K;
   int ldc = N;
-  //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) );
   CLBLAS_CHECK(
       clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
           (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C,
@@ -640,10 +639,12 @@ void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
 
 template <>
 void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y) {
+  NOT_IMPLEMENTED;
 }
 
 template <>
 void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y) {
+  NOT_IMPLEMENTED;
 }
 
 template <>
@@ -656,19 +657,10 @@ void caffe_gpu_abs<double>(const int n, const double* x, double* y) {
   caffe_gpu_abs_ocl(n, x, y);
 }
 
-
-//template <typename Dtype>
 void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) {
   clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0,
       NULL, NULL);
-// OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem)X, (cl_mem)Y, 0, 0, N, 0, NULL, NULL));
 }
-/*
- template void caffe_gpu_memcpy<long>(const size_t N, const long* X, long* Y);
- template void caffe_gpu_memcpy<unsigned long>(const size_t N, const unsigned long* X, unsigned long* Y);
- template void caffe_gpu_memcpy<int>(const size_t N, const int* X, int* Y);
- template void caffe_gpu_memcpy<unsigned int>(const size_t N, const unsigned int* X, unsigned int* Y);
- */
 template <>
 void caffe_gpu_memcpy<float>(const size_t N, const float* X, float* Y) {
   OCL_CHECK(
@@ -766,7 +758,6 @@ template <>
 void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
     double * out) {
   //need to pass in scratchBuff
-  //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
   cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
       (n * sizeof(double)), NULL, NULL);
   cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
@@ -796,7 +787,6 @@ void caffe_gpu_dot<float>(const int n, const float* x, size_t offx, const float*
 template <>
 void caffe_gpu_dot<double>(const int n, const double* x, size_t offx, const double* y, size_t offy, double * out) {
   //need to pass in scratchBuff
-  //AMDBLAS_CHECK(clAmdBlasDdot(n, out, 0, x, 0, 1, y, 0, 1, scratch_buf, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
   cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
       (n * sizeof(double)), NULL, NULL);
   cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
@@ -896,6 +886,7 @@ void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
 
 template <typename Dtype>
 void set_kernel(const int n, const Dtype alpha, Dtype* y) {
+  NOT_IMPLEMENTED;
 }
 
 template <>
@@ -1003,21 +994,23 @@ void caffe_gpu_powx<double>(const int N, const double* a, const double alpha,
 }
 
 void popc_kernel(const int n, const float* a, const float* b, uint8_t* y) {
+  NOT_IMPLEMENTED;
 }
 
 void popcll_kernel(const int n, const double* a, const double* b, uint8_t* y) {
+  NOT_IMPLEMENTED;
 }
 
 template <>
 uint32_t caffe_gpu_hamming_distance<float>(const int n, const float* x,
     const float* y) {
-  return 0;
+  NOT_IMPLEMENTED;
 }
 
 template <>
 uint32_t caffe_gpu_hamming_distance<double>(const int n, const double* x,
     const double* y) {
-  return 0;
+  NOT_IMPLEMENTED;
 }
 
 void caffe_gpu_rng_uniform(const int n, unsigned int* r) {

From efd5dba821070af28951b90293792d528b590f70 Mon Sep 17 00:00:00 2001
From: Junli Gu <gujunli@gmail.com>
Date: Thu, 17 Sep 2015 14:57:01 -0700
Subject: [PATCH 115/124] Update README.md

---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index a1bf49d6..ebc83a1a 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,27 @@
-#OpenCL caffe
+#OpenCL Caffe
 
-This is an OpenCL implementation of caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome.
+This is an OpenCL implementation of Caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete Caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome.
 
 OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language for heterogeneous platforms. OpenCL is supported by a variety of commercial chip manufacturers. 
 
 #Design features
-  -All caffe layers ported to OpenCL
+  -All Caffe layers ported to OpenCL
 
   -Performance improvement by batched implementation for conv layer based on clBLAS
 
-  -User can choose optimal batch number depending on H/W, image size and minibatch size
+  -The user can choose the optimal batch number depending on H/W properties, image size and minibatch size
 
   -Supports OpenCL 2.0, 1.2
   
-  -Implemented in C++ and OpenCL, maintaining the same interfaces as original caffe to make it easy for caffe users
+  -Implemented in C++ and OpenCL, maintaining the same interfaces as the original Caffe
 
   -Users can directly run DNN models: AlexNet, VGG-16 and VGG-19
 
-Note: More features will be added in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered to be added in the future.
+Note: More features are planned in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered for future addition.
 
 #Performance
 
-We will keep updating the latest performance as we make optimizations. Fury results are preliminary and are actively being improved.
+We intend to keep updating the latest performance as we make optimizations. Fury results are preliminary and are actively being improved.
 
 * Training speed (Model: AlexNet, minibatch size 128)
 
@@ -48,7 +48,7 @@ We thank Mauricio Breternitz, Hanjin Chu and Greg Stoner for their technical sug
  As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together.
 
 #License
-Original caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or whichever your preferred license.
+The original Caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or whichever your preferred license.
 
 # Original Caffe information
 ## Caffe

From 660df235de85a15e1cd4482e35706334b3e6016a Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Fri, 18 Sep 2015 15:27:25 +0800
Subject: [PATCH 116/124] comment on where the code is modified for OpenCL port

---
 src/caffe/layers/absval_layer.cpp           | 3 ++-
 src/caffe/layers/bnll_layer.cpp             | 3 ++-
 src/caffe/layers/concat_layer.cpp           | 3 ++-
 src/caffe/layers/contrastive_loss_layer.cpp | 3 ++-
 src/caffe/layers/conv_layer.cpp             | 4 ++--
 src/caffe/layers/deconv_layer.cpp           | 2 ++
 src/caffe/layers/dropout_layer.cpp          | 3 ++-
 src/caffe/layers/eltwise_layer.cpp          | 3 ++-
 src/caffe/layers/euclidean_loss_layer.cpp   | 3 ++-
 src/caffe/layers/exp_layer.cpp              | 3 ++-
 src/caffe/layers/filter_layer.cpp           | 3 ++-
 src/caffe/layers/hdf5_data_layer.cpp        | 3 ++-
 src/caffe/layers/hdf5_output_layer.cpp      | 3 ++-
 src/caffe/layers/im2col_layer.cpp           | 2 ++
 src/caffe/layers/inner_product_layer.cpp    | 2 ++
 src/caffe/layers/log_layer.cpp              | 2 ++
 src/caffe/layers/lrn_layer.cpp              | 2 ++
 src/caffe/layers/mvn_layer.cpp              | 2 ++
 src/caffe/layers/pooling_layer.cpp          | 5 ++---
 src/caffe/layers/power_layer.cpp            | 4 ++--
 src/caffe/layers/prelu_layer.cpp            | 2 ++
 src/caffe/layers/reduction_layer.cpp        | 2 ++
 src/caffe/layers/relu_layer.cpp             | 2 ++
 src/caffe/layers/sigmoid_layer.cpp          | 2 ++
 src/caffe/layers/silence_layer.cpp          | 2 ++
 src/caffe/layers/slice_layer.cpp            | 3 ++-
 src/caffe/layers/softmax_layer.cpp          | 4 ++--
 src/caffe/layers/softmax_loss_layer.cpp     | 4 ++--
 src/caffe/layers/split_layer.cpp            | 4 ++--
 src/caffe/layers/tanh_layer.cpp             | 2 ++
 src/caffe/layers/threshold_layer.cpp        | 2 ++
 31 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp
index 6e06b558..20898f15 100644
--- a/src/caffe/layers/absval_layer.cpp
+++ b/src/caffe/layers/absval_layer.cpp
@@ -36,7 +36,7 @@ void AbsValLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void AbsValLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -57,6 +57,7 @@ void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(AbsValLayer);
diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp
index ed9cc1d4..68a19265 100644
--- a/src/caffe/layers/bnll_layer.cpp
+++ b/src/caffe/layers/bnll_layer.cpp
@@ -39,7 +39,7 @@ void BNLLLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -62,6 +62,7 @@ void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     BNLLBackward(count, top_diff, bottom_data, bottom_diff);
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(BNLLLayer);
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 5cceb9ff..5def30d4 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -95,7 +95,7 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -139,6 +139,7 @@ void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     offset_concat_axis += bottom_concat_axis;
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(ConcatLayer);
diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp
index 6dda7d61..3410b927 100644
--- a/src/caffe/layers/contrastive_loss_layer.cpp
+++ b/src/caffe/layers/contrastive_loss_layer.cpp
@@ -100,7 +100,7 @@ void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 #ifndef CPU_ONLY
 template <typename Dtype>
 void ContrastiveLossLayer<Dtype>::Forward_gpu(
@@ -156,6 +156,7 @@ void ContrastiveLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(ContrastiveLossLayer);
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index b64eb1aa..30da288a 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -69,7 +69,7 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 #ifndef CPU_ONLY
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -227,7 +227,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
   }
 
 }
-// end: code written/modified by AMD
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(ConvolutionLayer);
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index 5b0eeb03..ddf906b7 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -70,6 +70,7 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -124,6 +125,7 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
   }
 }
+// end: code modified for OpenCL port
 
 #else 
 STUB_GPU(DeconvolutionLayer);
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index 905ed6ec..21699414 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -67,7 +67,7 @@ void DropoutLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
-// begin: code is written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void DropoutLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -105,6 +105,7 @@ void DropoutLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(DropoutLayer);
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index 971703f4..84cc279c 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -154,7 +154,7 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 #ifndef CPU_ONLY
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -241,6 +241,7 @@ void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(EltwiseLayer);
diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp
index 2130c6f4..ea78484b 100644
--- a/src/caffe/layers/euclidean_loss_layer.cpp
+++ b/src/caffe/layers/euclidean_loss_layer.cpp
@@ -44,7 +44,7 @@ void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -72,6 +72,7 @@ void EuclideanLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(EuclideanLossLayer);
diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp
index 3fe7cde4..ad40bb1b 100644
--- a/src/caffe/layers/exp_layer.cpp
+++ b/src/caffe/layers/exp_layer.cpp
@@ -61,7 +61,7 @@ void ExpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 #ifndef CPU_ONLY
 template <typename Dtype>
 void ExpLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -95,6 +95,7 @@ void ExpLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     caffe_gpu_scal(count, inner_scale_, bottom_diff);
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(ExpLayer);
diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp
index 2cd9957d..884764b4 100644
--- a/src/caffe/layers/filter_layer.cpp
+++ b/src/caffe/layers/filter_layer.cpp
@@ -117,7 +117,7 @@ void FilterLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 #ifndef CPU_ONLY
 template <typename Dtype>
 void FilterLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -178,6 +178,7 @@ void FilterLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
+// end: code modified for OpenCL port
 #else
 STUB_GPU(FilterLayer);
 #endif
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 28eee444..c87304b0 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -158,7 +158,7 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   }
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 #ifndef CPU_ONLY
 template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -197,6 +197,7 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     }
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU_FORWARD(HDF5DataLayer, Forward);
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index 11d01647..0005fb94 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -68,7 +68,7 @@ void HDF5OutputLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void HDF5OutputLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -103,6 +103,7 @@ void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   return;
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(HDF5OutputLayer);
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index 38e1fd20..36245446 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -88,6 +88,7 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -112,6 +113,7 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         bottom[0]->offset(n));
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(Im2colLayer);
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index b40e3e7d..cfa4246a 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -120,6 +120,7 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -157,6 +158,7 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., bottom[0]->mutable_gpu_diff(), 0);
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(InnerProductLayer);
diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp
index 5dbbca74..a01c9c18 100644
--- a/src/caffe/layers/log_layer.cpp
+++ b/src/caffe/layers/log_layer.cpp
@@ -80,6 +80,7 @@ void LogLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void LogLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -126,6 +127,7 @@ void LogLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   }
   caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(LogLayer);
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index da3d1fc3..0c91435b 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -252,6 +252,7 @@ void LRNLayer<Dtype>::WithinChannelBackward(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void LRNLayer<Dtype>::CrossChannelForward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -310,6 +311,7 @@ void LRNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     LOG(FATAL) << "Unknown normalization region.";
   }
 }
+// end: code modified for OpenCL port
 #else
 STUB_GPU(LRNLayer);
 STUB_GPU_FORWARD(LRNLayer, CrossChannelForward);
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index 2c4acb14..d64f5670 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -123,6 +123,7 @@ void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -228,6 +229,7 @@ void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff);
   }
 }
+// end: code modified for OpenCL port
 
 #else 
 STUB_GPU(MVNLayer);
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 0becf164..812ffbb3 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -314,7 +314,7 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void PoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -408,8 +408,7 @@ void PoolingLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     LOG(FATAL) << "Unknown pooling method.";
   }
 }
-
-// end: code written/modified by AMD
+// end: code modified for OpenCL port
 #else
 STUB_GPU(PoolingLayer);
 #endif
diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp
index a0f5ccee..c3cb1759 100644
--- a/src/caffe/layers/power_layer.cpp
+++ b/src/caffe/layers/power_layer.cpp
@@ -95,8 +95,8 @@ void PowerLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-// begin: code written/modified by AMD
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void PowerLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -169,7 +169,7 @@ void PowerLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff);
   }
 }
-// end: code written/modified by AMD
+// begin: code modified for OpenCL port
 #else
 STUB_GPU(PowerLayer);
 #endif
diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp
index 75aa3968..55f2e303 100644
--- a/src/caffe/layers/prelu_layer.cpp
+++ b/src/caffe/layers/prelu_layer.cpp
@@ -129,6 +129,7 @@ void PReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void PReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -197,6 +198,7 @@ void PReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         slope_data, div_factor);
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(PReLULayer);
diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp
index 0358d83a..ace74b28 100644
--- a/src/caffe/layers/reduction_layer.cpp
+++ b/src/caffe/layers/reduction_layer.cpp
@@ -125,6 +125,7 @@ void ReductionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void ReductionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -212,6 +213,7 @@ void ReductionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     ++top_diff;
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(ReductionLayer);
diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp
index 132d7b4b..3d2eaf2e 100644
--- a/src/caffe/layers/relu_layer.cpp
+++ b/src/caffe/layers/relu_layer.cpp
@@ -35,6 +35,7 @@ void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void ReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -57,6 +58,7 @@ void ReLULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope);
   }
 }
+// end: code modified for OpenCL port
 
 #else 
 STUB_GPU(ReLULayer);
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index 737bff74..b820e8ff 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -40,6 +40,7 @@ void SigmoidLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 
 template <typename Dtype>
 void SigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -63,6 +64,7 @@ void SigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     SigmoidBackward(count, top_diff, top_data, bottom_diff);
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(SigmoidLayer);
diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp
index a6c30fbb..4436584b 100644
--- a/src/caffe/layers/silence_layer.cpp
+++ b/src/caffe/layers/silence_layer.cpp
@@ -17,6 +17,7 @@ void SilenceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void SilenceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -33,6 +34,7 @@ void SilenceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(SilenceLayer);
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index cd19fdb5..de21e936 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -112,6 +112,7 @@ void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
@@ -151,7 +152,7 @@ void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     offset_slice_axis += top_slice_axis;
   }
 }
-
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(SliceLayer);
diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp
index 366946bd..1269b058 100644
--- a/src/caffe/layers/softmax_layer.cpp
+++ b/src/caffe/layers/softmax_layer.cpp
@@ -91,7 +91,7 @@ void SoftmaxLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -148,7 +148,7 @@ void SoftmaxLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff);
 
 }
-// end: code written/modified by AMD
+// end: code modified for OpenCL port
 #else
 STUB_GPU(SoftmaxLayer);
 #endif
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 2241bd6c..ef03ec7e 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -124,7 +124,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   }
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 #ifndef CPU_ONLY
 template <typename Dtype>
 void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
@@ -191,7 +191,7 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
   }
 }
-// end: code written/modified by AMD
+// end: code modified for OpenCL port
 #else
 STUB_GPU(SoftmaxWithLossLayer);
 #endif
diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp
index 57677b5b..e92f7bf2 100644
--- a/src/caffe/layers/split_layer.cpp
+++ b/src/caffe/layers/split_layer.cpp
@@ -60,7 +60,7 @@ void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   }
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
@@ -80,7 +80,7 @@ void SplitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff);
   }
 }
-// end: code written/modified by AMD
+// begin: code modified for OpenCL port
 #else
 STUB_GPU(SplitLayer);
 #endif
diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp
index 7a15809d..f62092b2 100644
--- a/src/caffe/layers/tanh_layer.cpp
+++ b/src/caffe/layers/tanh_layer.cpp
@@ -38,6 +38,7 @@ void TanHLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void TanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -60,6 +61,7 @@ void TanHLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     TanHBackward(count, top_diff, top_data, bottom_diff);
   }
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU(TanHLayer);
diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp
index a4c543ee..eebc379a 100644
--- a/src/caffe/layers/threshold_layer.cpp
+++ b/src/caffe/layers/threshold_layer.cpp
@@ -25,6 +25,7 @@ void ThresholdLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 }
 
 #ifndef CPU_ONLY
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
@@ -34,6 +35,7 @@ void ThresholdLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   // NOLINT_NEXT_LINE(whitespace/operators)
   ThresholdForward(count, threshold_, bottom_data, top_data);
 }
+// end: code modified for OpenCL port
 
 #else
 STUB_GPU_FORWARD(ThresholdLayer, Forward);

From ab0b360f331ecc0289d7ae06a773fa6ec220200f Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Fri, 18 Sep 2015 15:30:10 +0800
Subject: [PATCH 117/124] Go through 1x1 convolution

---
 include/caffe/common.hpp                  |  2 +-
 src/caffe/layers/base_conv_layer.cpp      | 18 +++++++++---------
 src/caffe/layers/conv_layer.cpp           |  2 +-
 src/caffe/test/test_convolution_layer.cpp |  3 ++-
 src/caffe/util/math_functions.cpp         | 12 ++++++------
 5 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 7aed6007..0b455c59 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -82,7 +82,7 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-#define use_packing_scheme 1
+#define use_packing_scheme 0
 /* global_packing_N defines packing number of the use_packing scheme
  for intial design, we use the same packing number for all conv layers*/
 #define global_packing_N 16
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 04cd38dd..6c66ac12 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -291,15 +291,15 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
       conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
     }
     col_buff = col_buffer_.gpu_data();
-  }
+  } 
 
   for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm < Dtype
-        > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_
+     caffe_gpu_gemm < Dtype > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_
             / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights, weight_offset_
-            * g, col_buff, col_offset_ * g, (Dtype) 0., output, top_offset_
+            * g, col_buff, is_1x1_ * bottom_offset_ + col_offset_ * g, (Dtype) 0., output, top_offset_
             + output_offset_ * g);
   }
+  
 }
 
 template <typename Dtype>
@@ -316,13 +316,14 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
   if (is_1x1_) {
     col_buff = input;
   }
+ 
   for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm < Dtype
-        > (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
+      caffe_gpu_gemm < Dtype> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_
             / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype) 1., weights, weight_offset_
-            * g, output, top_offset_ + output_offset_ * g, (Dtype) 0., col_buff, col_offset_
+            * g, output, top_offset_ + output_offset_ * g, (Dtype) 0., col_buff, is_1x1_ * bottom_offset_ + col_offset_
             * g);
   }
+  
   if (!is_1x1_) {
     conv_col2im_gpu(col_buff, input);
   }
@@ -339,8 +340,7 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
   for (int g = 0; g < group_; ++g) {
     caffe_gpu_gemm < Dtype
         > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_
-            / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype) 1., output, top_offset_, (Dtype*) col_buff, col_offset_
-            * g, (Dtype) 1., (Dtype*) weights, weight_offset_ * g);
+            / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype) 1., output, top_offset_ + output_offset_*g, (Dtype*) col_buff, is_1x1_*bottom_offset_ + col_offset_ * g, (Dtype) 1., (Dtype*) weights, weight_offset_ * g);
   }
 }
 
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index b64eb1aa..ed9950c4 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -124,7 +124,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_opt2(
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu_org(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  const Dtype* weight = this->blobs_[0]->gpu_data();
+   const Dtype* weight = this->blobs_[0]->gpu_data();
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
     //CHECK_BLOB_DATA(bottom[i],10,"bottom");
diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp
index 67d41fff..576095c1 100644
--- a/src/caffe/test/test_convolution_layer.cpp
+++ b/src/caffe/test/test_convolution_layer.cpp
@@ -122,10 +122,11 @@ class ConvolutionLayerTest : public MultiDeviceTest<TypeParam> {
   }
 
   virtual ~ConvolutionLayerTest() {
-    delete blob_bottom_;
+   /* delete blob_bottom_;
     delete blob_bottom_2_;
     delete blob_top_;
     delete blob_top_2_;
+   */
   }
 
   virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) {
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index aebeb5ed..4f7e5c03 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -483,8 +483,8 @@ void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
   int ldb = (TransB == CblasNoTrans) ? N : K;
   int ldc = N;
   CLBLAS_CHECK(
-      clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-          (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C,
+      clblasDgemm(amdDevice.col, transB, transA, N, M, K,  alpha,
+          (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda,  beta, (cl_mem) C,
           0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL));
 }
 
@@ -523,8 +523,8 @@ cl_event caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
   int ldb = (TransB == CblasNoTrans) ? N : K;
   int ldc = N;
   CLBLAS_CHECK(
-      clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-          (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+      clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha,
+          (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, beta,
           (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL,
           &event));
   return event;
@@ -566,8 +566,8 @@ cl_event caffe_gpu_gemm<double>(cl_command_queue *queue,
   int ldb = (TransB == CblasNoTrans) ? N : K;
   int ldc = N;
   CLBLAS_CHECK(
-      clblasDgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha,
-          (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta,
+      clblasDgemm(amdDevice.col, transB, transA, N, M, K,  alpha,
+          (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, beta,
           (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event));
   return event;
 }

From 3acadc06d0966bfe1329787f39443b82ef3bb658 Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Fri, 18 Sep 2015 20:53:14 +0800
Subject: [PATCH 118/124] Go through conv_layer

---
 include/caffe/util/ocl_util.hpp      |  3 +--
 include/caffe/util/ocl_wrapper.hpp   |  2 +-
 src/caffe/blob.cpp                   |  4 +--
 src/caffe/common.cpp                 |  1 +
 src/caffe/layers/base_conv_layer.cpp |  4 +--
 src/caffe/layers/conv_layer.cpp      |  2 --
 src/caffe/ocl/im2col.cl              |  8 +++---
 src/caffe/syncedmem.cpp              |  2 +-
 src/caffe/util/math_functions.cpp    | 40 +++++++++++++++++++---------
 src/caffe/util/ocl_util.cpp          |  4 ++-
 src/caffe/util/ocl_wrapper.cpp       | 14 ++++++----
 11 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp
index dcdf1057..3027019f 100644
--- a/include/caffe/util/ocl_util.hpp
+++ b/include/caffe/util/ocl_util.hpp
@@ -32,8 +32,7 @@ namespace caffe {
 template <typename Dtype>
 void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset = 0);
 
-void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
-    const int count);
+void ocl_memset(cl_mem buffer, const int value, const int count);
 
 void eventCallback(cl_event event, cl_int event_status, void * user_data);
 #endif
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 61d6162e..308292c8 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -218,7 +218,7 @@ void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
 void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed = 0);
 
 template <typename Dtype>
-void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup);
+void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup, unsigned int  _seed = 0);
 
 template <typename Dtype>
 void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V);
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 089899fc..ece07d14 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -426,10 +426,10 @@ void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
   switch (Caffe::mode()) {
   case Caffe::GPU:
     if (copy_diff) {
-      caffe_copy(count_, source.gpu_diff(),
+      caffe_gpu_copy(count_, source.gpu_diff(),
           static_cast<Dtype*>(diff_->mutable_gpu_data()));
     } else {
-      caffe_copy(count_, source.gpu_data(),
+      caffe_gpu_copy(count_, source.gpu_data(),
           static_cast<Dtype*>(data_->mutable_gpu_data()));
     }
     break;
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index f42a4204..9ed4207a 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -105,6 +105,7 @@ void Caffe::set_random_seed(const unsigned int seed) {
 	// RNG seed
 	Get().random_generator_.reset(new RNG(seed));
         caffe_gpu_uniform(0, NULL, seed);
+        caffe_gpu_uniform((float*)NULL, 0, (float)0.0, (float)1.0, seed);
 }
 
 void Caffe::SetDevice(const int device_id) {
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 6c66ac12..a233e6c9 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -224,8 +224,7 @@ void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
     col_buff = col_buffer_.cpu_data();
   }
   for (int g = 0; g < group_; ++g) {
-    caffe_cpu_gemm < Dtype
-        > (CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_
+    caffe_cpu_gemm <Dtype> (CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_
             / group_, (Dtype) 1., weights + weight_offset_ * g, col_buff
             + col_offset_ * g, (Dtype) 0., output + output_offset_ * g);
   }
@@ -292,7 +291,6 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
     }
     col_buff = col_buffer_.gpu_data();
   } 
-
   for (int g = 0; g < group_; ++g) {
      caffe_gpu_gemm < Dtype > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_
             / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights, weight_offset_
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index c720bbb9..27777122 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -127,8 +127,6 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(
    const Dtype* weight = this->blobs_[0]->gpu_data();
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
-    //CHECK_BLOB_DATA(bottom[i],10,"bottom");
-
     Dtype* top_data = top[i]->mutable_gpu_data();
     for (int n = 0; n < this->num_; ++n) {
       //two intermediate variables to pass offset
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index d03463ee..2d9032db 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -94,11 +94,11 @@ __kernel void im2col(const int n, __global const T* data_im, const int img_offse
   }
 }
 
-template __attribute__((mangled_name(im2col_float))) void im2col_gpu_kernel<float>(const int n, __global const float* data_im,
+template __attribute__((mangled_name(im2col_float))) void im2col<float>(const int n, __global const float* data_im,
     const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     const int height_col, const int width_col, __global float* data_col, const int col_offset);
-template __attribute__((mangled_name(im2col_double))) void im2col_gpu_kernel<double>(const int n, __global const double* data_im,
+template __attribute__((mangled_name(im2col_double))) void im2col<double>(const int n, __global const double* data_im,
     const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     const int height_col, const int width_col, __global double* data_col, const int col_offset);
@@ -138,12 +138,12 @@ __kernel void col2im(const int n, __global const T* data_col, const int col_offs
   }
 }
 
-template __attribute__((mangled_name(col2im_float))) __kernel void col2im_gpu_kernel(const int n, __global const float* data_col, const int col_offset,
+template __attribute__((mangled_name(col2im_float))) __kernel void col2im(const int n, __global const float* data_col, const int col_offset,
     const int height, const int width, const int channels,
     const int patch_h, const int patch_w,const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,const int height_col, const int width_col,
     __global float* data_im, const int img_offset);
-template __attribute__((mangled_name(col2im_double))) __kernel void col2im_gpu_kernel(const int n, __global const double* data_col,
+template __attribute__((mangled_name(col2im_double))) __kernel void col2im(const int n, __global const double* data_col,
     const int col_offset, const int height, const int width, const int channels,
     const int patch_h, const int patch_w, const int pad_h, const int pad_w,
     const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index a3fa9973..76d3f2ea 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -115,7 +115,7 @@ inline void SyncedMemory::to_gpu() {
       fprintf(stderr, "Failed to create memory object\n");
       break;
     }
-    ocl_memset(oclmem_kernel, tmpMem, (int) 0, (int) (size_ / sizeof(int)));
+    ocl_memset(tmpMem, (int) 0, (int) (size_ / sizeof(int)));
     gpu_ptr_ = (void*) tmpMem;
     head_ = HEAD_AT_GPU;
     break;
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index cfd7709e..155aac45 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -110,6 +110,16 @@ void caffe_set(const int N, const double alpha, double* Y) {
   }
 }
 
+template <>
+void caffe_copy<float>(const int N, const float* X, float* Y) {
+  cblas_scopy(N, X, 1, Y, 1);
+}
+
+template <>
+void caffe_copy<double>(const int N, const double* X, double* Y) {
+  cblas_dcopy(N, X, 1, Y, 1);
+}
+
 template <>
 void caffe_add_scalar(const int N, const float alpha, float* Y) {
   for (int i = 0; i < N; ++i) {
@@ -124,16 +134,6 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) {
   }
 }
 
-template <>
-void caffe_copy<float>(const int N, const float* X, float* Y) {
-  cblas_scopy(N, X, 1, Y, 1);
-}
-
-template <>
-void caffe_copy<double>(const int N, const double* X, double* Y) {
-  cblas_dcopy(N, X, 1, Y, 1);
-}
-
 template <>
 void caffe_scal<float>(const int N, const float alpha, float *X) {
   cblas_sscal(N, alpha, X, 1);
@@ -226,13 +226,14 @@ void caffe_log<double>(const int n, const double* a, double* y) {
   vdLn(n, a, y);
 }
 
+
 template <typename Dtype>
 void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
   if (X != Y) {
     if (Caffe::mode() == Caffe::GPU) {
 #ifndef CPU_ONLY
       // NOLINT_NEXT_LINE(caffe/alt_fn)
-      //CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
+     // caffe_gpu_copy(N, X, Y);
 #else
       NO_GPU;
 #endif
@@ -242,6 +243,7 @@ void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
   }
 }
 
+
 template void caffe_copy<int>(const int N, const int* X, int* Y);
 template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
     unsigned int* Y);
@@ -675,6 +677,20 @@ void caffe_gpu_memcpy<double>(const size_t N, const double* X, double* Y) {
           N, 0, NULL, NULL));
 }
 
+template <typename Dtype>
+void caffe_gpu_copy(const int N, const Dtype* X, Dtype* Y) {
+  if (X != Y) {
+     OCL_CHECK(
+       clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0,
+          N * sizeof(Dtype), 0, NULL, NULL));
+  }
+}
+template void caffe_gpu_copy<float>(const int N, const float* X, float* Y);
+template void caffe_gpu_copy<double>(const int N, const double* X, double* Y);
+template void caffe_gpu_copy<int>(const int N, const int* X, int* Y);
+template void caffe_gpu_copy<unsigned int>(const int N, const unsigned int* X, unsigned int* Y);
+
+/*
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
   if (X != Y) {
@@ -692,7 +708,7 @@ void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
             &(amdDevice.CommandQueue), 0, NULL, NULL));
   }
 }
-
+*/
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, const int offx, float* Y, const int offy) {
   if (X != Y) {
diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp
index 0b151e5a..bc2aea35 100644
--- a/src/caffe/util/ocl_util.cpp
+++ b/src/caffe/util/ocl_util.cpp
@@ -60,8 +60,10 @@ template void ocl_memset<int>(int* buffer, const int value, const int count, con
 template void ocl_memset<float>(float* buffer, const float value, const int count, const int buf_offset);
 template void ocl_memset<double>(double* buffer, const double value, const int count, const int buf_offset);
 
-void ocl_memset(cl_kernel Kernel, cl_mem buffer, const int value,
+void ocl_memset(cl_mem buffer, const int value,
     const int count) {
+  std::string kernel_name = std::string("OCL_memset2");
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
   cl_int err;
   err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer);
   err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value);
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 7ffadc72..5bb4f2fa 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -120,12 +120,16 @@ template void get_max_gpu<double>(cl_kernel Kernel, const int num,
 		const int dim, const double* bottom_data, double* scale_data);
 
 template <typename Dtype>
-void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup)
+void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup, unsigned int seed_)
 {
-        std::string kernel_name = "RNGUniform" + get_dtype_suffix<Dtype>();
+        static unsigned c = 0;
+        if ((n == 0) || (a == NULL)) {
+            c = seed_;
+            return;
+        }
+	std::string kernel_name = "RNGUniform" + get_dtype_suffix<Dtype>();
         cl_kernel ker_rand = amdDevice.GetKernel(kernel_name);
 
-        static unsigned c = 0;
         unsigned nrounds = 20;
         array4x32  rndctr4;
         rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++;
@@ -144,8 +148,8 @@ void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup)
         size_t localws[1] = {256};
         OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) );
 }
-template void caffe_gpu_uniform<float>(float* a, const unsigned int n, float inf, float sup);
-template void caffe_gpu_uniform<double>(double* a, const unsigned int n, double inf, double sup);
+template void caffe_gpu_uniform<float>(float* a, const unsigned int n, float inf, float sup, unsigned int seed_);
+template void caffe_gpu_uniform<double>(double* a, const unsigned int n, double inf, double sup, unsigned int seed_);
 
 void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed)
 {

From 606117dde3ebe9a2b402a195e2d6201b6b133aba Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sat, 19 Sep 2015 00:08:22 +0800
Subject: [PATCH 119/124] Go through GPUMathFunctionsTest

---
 include/caffe/common.hpp               |  2 +-
 include/caffe/util/ocl_wrapper.hpp     |  3 ++
 include/caffe/vision_layers.hpp        |  4 +--
 src/caffe/layers/conv_layer.cpp        |  8 ++---
 src/caffe/ocl/util.cl                  | 11 +++++++
 src/caffe/solver.cpp                   |  2 +-
 src/caffe/test/test_math_functions.cpp |  2 +-
 src/caffe/util/math_functions.cpp      | 41 ++++----------------------
 src/caffe/util/ocl_wrapper.cpp         | 18 +++++++++++
 9 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 0b455c59..7aed6007 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -82,7 +82,7 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-#define use_packing_scheme 0
+#define use_packing_scheme 1
 /* global_packing_N defines packing number of the use_packing scheme
  for intial design, we use the same packing number for all conv layers*/
 #define global_packing_N 16
diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp
index 308292c8..0ce3a184 100644
--- a/include/caffe/util/ocl_wrapper.hpp
+++ b/include/caffe/util/ocl_wrapper.hpp
@@ -226,6 +226,9 @@ void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V);
 template <typename Dtype>
 void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y);
 
+template <typename Dtype>
+void caffe_gpu_signbit(const int N, const Dtype* X, Dtype * Y);
+
 template <typename Dtype>
 void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y);
 
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index e2a9b190..c3a73014 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -237,9 +237,9 @@ class ConvolutionLayer: public BaseConvolutionLayer<Dtype> {
         const vector<Blob<Dtype>*>& top);
     virtual void Backward_gpu_org(const vector<Blob<Dtype>*>& top,
         const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
-    virtual void Forward_gpu_opt2(const vector<Blob<Dtype>*>& bottom,
+    virtual void Forward_gpu_batched(const vector<Blob<Dtype>*>& bottom,
         const vector<Blob<Dtype>*>& top);
-    virtual void Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
+    virtual void Backward_gpu_batched(const vector<Blob<Dtype>*>& top,
         const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 #endif
 };
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 27777122..4f0175e0 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -75,7 +75,7 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   if (use_packing_scheme && global_packing_N > 1)
-    Forward_gpu_opt2(bottom, top);
+    Forward_gpu_batched(bottom, top);
   else
     Forward_gpu_org(bottom, top);
 }
@@ -84,13 +84,13 @@ template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   if (use_packing_scheme && global_packing_N > 1)
-    Backward_gpu_opt2(top, propagate_down, bottom);
+    Backward_gpu_batched(top, propagate_down, bottom);
   else
     Backward_gpu_org(top, propagate_down, bottom);
 }
 
 template <typename Dtype>
-void ConvolutionLayer<Dtype>::Forward_gpu_opt2(
+void ConvolutionLayer<Dtype>::Forward_gpu_batched(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   for (int i = 0; i < bottom.size(); ++i) {
@@ -146,7 +146,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(
 }
 
 template <typename Dtype>
-void ConvolutionLayer<Dtype>::Backward_gpu_opt2(const vector<Blob<Dtype>*>& top,
+void ConvolutionLayer<Dtype>::Backward_gpu_batched(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl
index eced284b..222e4ed9 100644
--- a/src/caffe/ocl/util.cl
+++ b/src/caffe/ocl/util.cl
@@ -57,6 +57,17 @@ __kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) {
 template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y);
 template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y);
 
+template <class T>
+__kernel void caffe_gpu_sgnbit(const int N, __global T* X, __global T* Y) {
+  int gdx = get_global_id(0);
+  if(gdx < N) {
+    Y[gdx] =(X[gdx] < 0.0);
+  }
+}
+
+template __attribute__((mangled_name(caffe_gpu_sgnbit_float))) __kernel void caffe_gpu_sgnbit(const int N, __global float* X, __global float* Y);
+template __attribute__((mangled_name(caffe_gpu_sgnbit_double))) __kernel void caffe_gpu_sgnbit(const int N, __global double* X, __global double* Y);
+
 template <class T>
 __kernel void caffe_gpu_sign_with_offset(const int N, __global T* X, const int offx,  __global T* Y, const int offy) {
   X += offx;
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 20af4160..b9ed1050 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -677,7 +677,7 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   case Caffe::GPU: {
 #ifndef CPU_ONLY
     // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
+    caffe_gpu_copy(net_params[param_id]->count(),
         this->history_[param_id]->gpu_data(),
         this->update_[param_id]->mutable_gpu_data());
 
diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp
index a095b544..a0f88065 100644
--- a/src/caffe/test/test_math_functions.cpp
+++ b/src/caffe/test/test_math_functions.cpp
@@ -232,7 +232,7 @@ TYPED_TEST(GPUMathFunctionsTest, TestCopy) {
   const int n = this->blob_bottom_->count();
   const TypeParam* bottom_data = this->blob_bottom_->gpu_data();
   TypeParam* top_data = this->blob_top_->mutable_gpu_data();
-  caffe_copy(n, bottom_data, top_data);
+  caffe_gpu_copy(n, bottom_data, top_data);
   bottom_data = this->blob_bottom_->cpu_data();
   top_data = this->blob_top_->mutable_cpu_data();
   for (int i = 0; i < n; ++i) {
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 155aac45..e45fd564 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -110,6 +110,7 @@ void caffe_set(const int N, const double alpha, double* Y) {
   }
 }
 
+/*
 template <>
 void caffe_copy<float>(const int N, const float* X, float* Y) {
   cblas_scopy(N, X, 1, Y, 1);
@@ -119,7 +120,7 @@ template <>
 void caffe_copy<double>(const int N, const double* X, double* Y) {
   cblas_dcopy(N, X, 1, Y, 1);
 }
-
+*/
 template <>
 void caffe_add_scalar(const int N, const float alpha, float* Y) {
   for (int i = 0; i < N; ++i) {
@@ -208,7 +209,7 @@ void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
     return;
   }
   for (int i = 0; i < N; ++i) {
-    Y[i] = alpha;
+	    Y[i] = alpha;
   }
 }
 
@@ -226,24 +227,13 @@ void caffe_log<double>(const int n, const double* a, double* y) {
   vdLn(n, a, y);
 }
 
-
 template <typename Dtype>
 void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
   if (X != Y) {
-    if (Caffe::mode() == Caffe::GPU) {
-#ifndef CPU_ONLY
-      // NOLINT_NEXT_LINE(caffe/alt_fn)
-     // caffe_gpu_copy(N, X, Y);
-#else
-      NO_GPU;
-#endif
-    } else {
       memcpy(Y, X, sizeof(Dtype) * N);  // NOLINT(caffe/alt_fn)
-    }
   }
 }
 
-
 template void caffe_copy<int>(const int N, const int* X, int* Y);
 template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
     unsigned int* Y);
@@ -387,7 +377,7 @@ void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
 
 template void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
 template void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
-//
+
 template <>
 float caffe_cpu_dot<float>(const int n, const float* x, const float* y) {
   return cblas_sdot(n, x, 1, y, 1);
@@ -641,12 +631,12 @@ void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
 
 template <>
 void caffe_gpu_sgnbit<float>(const int n, const float* x, float* y) {
-  NOT_IMPLEMENTED;
+  caffe_gpu_signbit(n, x, y);
 }
 
 template <>
 void caffe_gpu_sgnbit<double>(const int n, const double* x, double* y) {
-  NOT_IMPLEMENTED;
+  caffe_gpu_signbit(n, x, y);
 }
 
 template <>
@@ -690,25 +680,6 @@ template void caffe_gpu_copy<double>(const int N, const double* X, double* Y);
 template void caffe_gpu_copy<int>(const int N, const int* X, int* Y);
 template void caffe_gpu_copy<unsigned int>(const int N, const unsigned int* X, unsigned int* Y);
 
-/*
-template <>
-void caffe_gpu_copy<float>(const int N, const float* X, float* Y) {
-  if (X != Y) {
-    CLBLAS_CHECK(
-        clblasScopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-            &(amdDevice.CommandQueue), 0, NULL, NULL));
-  }
-}
-
-template <>
-void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
-  if (X != Y) {
-    CLBLAS_CHECK(
-        clblasDcopy(N, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1,
-            &(amdDevice.CommandQueue), 0, NULL, NULL));
-  }
-}
-*/
 template <>
 void caffe_gpu_copy<float>(const int N, const float* X, const int offx, float* Y, const int offy) {
   if (X != Y) {
diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp
index 5bb4f2fa..0b4cbf6f 100644
--- a/src/caffe/util/ocl_wrapper.cpp
+++ b/src/caffe/util/ocl_wrapper.cpp
@@ -1568,6 +1568,24 @@ template void caffe_gpu_add<float>(const int n, const float* in1,
 template void caffe_gpu_add<double>(const int n, const double* in1,
     const double* in2, double* y);
 
+template <typename Dtype>
+void caffe_gpu_signbit(const int N, const Dtype* X, Dtype * Y) {
+  std::string kernel_name = "caffe_gpu_sgnbit" + get_dtype_suffix<Dtype>();
+  cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
+  cl_int ret;
+  ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N);
+  ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X);
+  ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y);
+  OCL_CHECK(ret);
+  size_t Global_Work_Size[] = { (size_t) N };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void caffe_gpu_signbit<float>(const int N, const float* X, float * Y);
+template void caffe_gpu_signbit<double>(const int N, const double* X, double * Y);
+
 template <typename Dtype>
 void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) {
   std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix<Dtype>();

From ecbd837091ef06a6f0ab2a1a2e1b1308fe965e83 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Sat, 19 Sep 2015 14:00:51 +0800
Subject: [PATCH 120/124] fixed im2col_opt paramters

---
 include/caffe/util/im2col.hpp   |  8 ++--
 include/caffe/vision_layers.hpp |  6 ++-
 src/caffe/ocl/im2col.cl         | 46 ++++++++++++----------
 src/caffe/util/im2col.cpp       | 69 +++++++++++++++++++--------------
 4 files changed, 72 insertions(+), 57 deletions(-)

diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index 327d7415..9c6de363 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -54,8 +54,8 @@ void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels,
 
 template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, Dtype* data_col, const int col_offset,
+    const int channels, const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset,
     int optnum);
 
 template <typename Dtype>
@@ -65,8 +65,8 @@ void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset,
 
 template <typename Dtype>
 void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, Dtype* data_im, const int img_offset,
+    const int channels, const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset,
     int optnum);
 #endif
 }  // namespace caffe
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index c3a73014..381b983b 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -117,12 +117,14 @@ class BaseConvolutionLayer: public Layer<Dtype> {
   protected:
     inline void conv_im2col_gpu_opt(const Dtype* data) {
       im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_,
-          conv_in_width_, kernel_w_, pad_w_, stride_h_, (Dtype*) transMem, 0,
+          conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
+          stride_w_, (Dtype*) transMem, 0,
           opt_num2);
     }
     inline void conv_col2im_gpu_opt(Dtype* data) {
       col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_,
-          conv_in_width_, kernel_h_, pad_h_, stride_w_, data, bottom_offset_,
+          conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
+          stride_w_, data, bottom_offset_,
           opt_num2);
     }
   private:
diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index 2d9032db..26152470 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -24,7 +24,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  **************************************************************************************/
 template <class T>
-__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) {
+__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) {
 
   int index = get_global_id(0);
 
@@ -34,20 +34,20 @@ __kernel void im2col_opt(const int n, __global T* data_im, const int channels, c
   int x_out = index % width_col;
   int y_out = (index / width_col) % height_col;
   int channel_in = (index / width_col / height_col) % channels;
-  int channel_out = channel_in * ksize * ksize;
+  int channel_out = channel_in * kernel_h * kernel_w;
   int im_id = index / width_col / height_col / channels;
 
-  int y_in = y_out * stride - pad;
-  int x_in = x_out * stride - pad;
+  int y_in = y_out * stride_h - pad_h;
+  int x_in = x_out * stride_w - pad_w;
   int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col;
   int offset_im = im_id * channels * height * width + channel_in * height * width;
 
-  for(int k_h = 0; k_h < ksize; k_h++) {
-    for(int k_w = 0; k_w < ksize; k_w++) {
+  for(int k_h = 0; k_h < kernel_h; k_h++) {
+    for(int k_w = 0; k_w < kernel_w; k_w++) {
       int x_im = x_in + k_w;
       int y_im = y_in + k_h;
       int index_im = y_im * width + x_im;
-      int index_col = (k_h * ksize + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
+      int index_col = (k_h * kernel_h + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
       if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
       data_col[offset_col + index_col] = data_im[offset_im + index_im];
       else
@@ -56,8 +56,8 @@ __kernel void im2col_opt(const int n, __global T* data_im, const int channels, c
   }
 }
 
-template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum);
-template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum);
+template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum);
+template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int tride_h, const int stride_w, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum);
 
 template <class T>
 __kernel void im2col(const int n, __global const T* data_im, const int img_offset,
@@ -149,25 +149,27 @@ template __attribute__((mangled_name(col2im_double))) __kernel void col2im(const
     const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset);
 
 template <class T>
-__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) {
+__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, 
+const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) {
   int index = get_global_id(0);
   data_col = data_col + col_offset;
   data_im = data_im + img_offset;
   if(index < n) {
     T val = 0;
-    int w = index % width + pad;
-    int h = (index / width) % height + pad;
+    int w = index % width + pad_w;
+    int h = (index / width) % height + pad_h;
     int c = index / (width * height) % channels;
     int im = index / width / height / channels;
     // compute the start and end of the output
-    int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
-    int w_col_end = min(w / stride + 1, width_col);
-    int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
-    int h_col_end = min(h / stride + 1, height_col);
+    int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+    int w_col_end = min(w / stride_w + 1, width_col);
+    int h_col_start = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+    int h_col_end = min(h / stride_h + 1, height_col);
     // equivalent implementation
-    int offset = (c * ksize * ksize + h * ksize + w) * height_col * width_col * optnum + im * height_col * width_col;
-    int coeff_h_col = (1 - stride * ksize * height_col * optnum) * width_col;
-    int coeff_w_col = (1 - stride * height_col * width_col * optnum);
+    int offset = (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col * optnum + im * height_col * width_col;
+    int coeff_h_col = (1 - stride_h * kernel_h * height_col * optnum) * width_col;
+    int coeff_w_col = (1 - stride_w * height_col * width_col * optnum);
     for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
       for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
         val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
@@ -176,8 +178,10 @@ __kernel void col2im_opt(const int n, __global T* data_col, const int col_offset
     data_im[index] = val;
   }
 }
-template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum);
-template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int ksize, const int pad, const int stride, const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum);
+template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum);
+template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum);
 
 template <class T>
 __kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum) {
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 6899d15a..241062a1 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -106,13 +106,13 @@ template void col2im_cpu<double>(const double* data_col, const int channels,
 #ifndef CPU_ONLY
 template <typename Dtype>
 void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, Dtype* data_im, const int img_offset,
+    const int channels, const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset,
     int optnum) {
   std::string kernel_name = "col2im_opt" + get_dtype_suffix<Dtype>();
   cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
   int num_kernels = channels * height * width;
 
   cl_int ret;
@@ -122,14 +122,17 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
   ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height);
   ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width);
   ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels);
-  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
-  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
-  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
-  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
-  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
-  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_im);
-  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &img_offset);
-  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_h);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &kernel_w);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col);
+  ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im);
+  ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset);
+  ret |= clSetKernelArg(Kernel, 16, sizeof(cl_int), (void*) &optnum);
   OCL_CHECK(ret);
 
   size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
@@ -140,12 +143,14 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
 }
 
 template void col2im_gpu_opt<float>(const float* data_col, const int col_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, float* data_im, const int img_offset,
+    const int channels, const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    float* data_im, const int img_offset,
     int optnum);
 template void col2im_gpu_opt<double>(const double* data_col,
     const int col_offset, const int channels, const int height, const int width,
-    const int ksize, const int pad, const int stride, double* data_im,
+   const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,   double* data_im,
     const int img_offset, int optnum);
 
 template <typename Dtype>
@@ -243,15 +248,15 @@ template void col2im_gpu<double>(const double* data_col, const int col_offset,
 
 template <typename Dtype>
 void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, Dtype* data_col, const int col_offset,
+    const int channels, const int height, const int width,const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset,
     int optnum) {
 
   std::string kernel_name = "im2col_opt" + get_dtype_suffix<Dtype>();
   cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
 
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
   int num_kernels = optnum * channels * height_col * width_col;
 
   cl_int ret;
@@ -261,14 +266,17 @@ void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
   ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset);
   ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height);
   ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width);
-  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &ksize);
-  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad);
-  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &stride);
-  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &height_col);
-  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &width_col);
-  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &data_col);
-  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &col_offset);
-  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &optnum);
+  ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_h);
+  ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &kernel_w);
+  ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h);
+  ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w);
+  ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h);
+  ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w);
+  ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col);
+  ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col);
+  ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_col);
+  ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &col_offset);
+  ret |= clSetKernelArg(Kernel, 16, sizeof(cl_int), (void*) &optnum);
   OCL_CHECK(ret);
 
   size_t uiGlobal_Work_Size[] = { (size_t) num_kernels };
@@ -279,12 +287,13 @@ void im2col_gpu_opt(const Dtype* data_im, const int img_offset,
 }
 
 template void im2col_gpu_opt<float>(const float* data_im, const int img_offset,
-    const int channels, const int height, const int width, const int ksize,
-    const int pad, const int stride, float* data_col, const int col_offset,
+    const int channels, const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_col, const int col_offset,
     int optnum);
 template void im2col_gpu_opt<double>(const double* data_im,
     const int img_offset, const int channels, const int height, const int width,
-    const int ksize, const int pad, const int stride, double* data_col,
+    const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col,
     const int col_offset, int optnum);
 
 #endif

From 7ac0a963a9f8e48af895be4e684026d33d1be9f7 Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Sat, 19 Sep 2015 14:55:44 +0800
Subject: [PATCH 121/124] fixed im2col

---
 src/caffe/ocl/im2col.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl
index 26152470..f1a97eab 100644
--- a/src/caffe/ocl/im2col.cl
+++ b/src/caffe/ocl/im2col.cl
@@ -47,7 +47,7 @@ __kernel void im2col_opt(const int n, __global T* data_im, const int channels, c
       int x_im = x_in + k_w;
       int y_im = y_in + k_h;
       int index_im = y_im * width + x_im;
-      int index_col = (k_h * kernel_h + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
+      int index_col = (k_h * kernel_w + k_w) * optnum * height_col * width_col + y_out * width_col + x_out;
       if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width)
       data_col[offset_col + index_col] = data_im[offset_im + index_im];
       else
@@ -168,7 +168,7 @@ const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
     int h_col_end = min(h / stride_h + 1, height_col);
     // equivalent implementation
     int offset = (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col * optnum + im * height_col * width_col;
-    int coeff_h_col = (1 - stride_h * kernel_h * height_col * optnum) * width_col;
+    int coeff_h_col = (1 - stride_h * kernel_w * height_col * optnum) * width_col;
     int coeff_w_col = (1 - stride_w * height_col * width_col * optnum);
     for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
       for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {

From a894a29847b4fe2282d8e817b682cf206e869e5e Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Sun, 20 Sep 2015 01:14:36 +0800
Subject: [PATCH 122/124] direct is_1_1 conv to original scheme

---
 src/caffe/layers/base_conv_layer.cpp | 29 ++++------------------------
 src/caffe/layers/conv_layer.cpp      |  4 ++--
 2 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index a233e6c9..e8241555 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -349,21 +349,16 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
       > (CblasNoTrans, num_output_, N_, (Dtype) 1., input, top_offset_, N_, reinterpret_cast<const Dtype*>(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, bias, (size_t) 0, 1);
 }
 
-// begin: code written/modified by AMD
+// begin: code modified for OpenCL port
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
     const Dtype* weight, Dtype* output, bool skip_im2col) {
   cl_command_queue Queue;
   const Dtype* col_buff = input;
-  if (!is_1x1_) {
     if (!skip_im2col) {
       conv_im2col_gpu_opt(input);
     }
     col_buff = col_buffer_.gpu_data();
-  } else {
-    caffe_gpu_memcpy(K_ * N_ * opt_num2 * sizeof(Dtype), col_buff,
-        (Dtype*) transMem);
-  }
 #ifdef multiQ
   for (int g = 0; g < group_; ++g) {
     if(g == 0) Queue = amdDevice.CommandQueue;
@@ -402,11 +397,6 @@ template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
     const Dtype* weights, Dtype* input) {
   cl_command_queue Queue;
-  if (is_1x1_) {
-    caffe_gpu_memcpy(
-        height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype), input,
-        (Dtype*) transMem);
-  }
   for (int g = 0; g < group_; ++g) {
 #ifdef multiQ
     if(g == 0) Queue = amdDevice.CommandQueue;
@@ -426,26 +416,15 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm_opt(const Dtype* output,
   }
 #endif
 
-  if (!is_1x1_) {
     conv_col2im_gpu_opt(input);
-  } else {
-    caffe_gpu_memcpy(
-        height_ * width_ * conv_in_channels_ * opt_num2 * sizeof(Dtype),
-        (Dtype*) transMem, input);
-  }
 }
 
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
     const Dtype* output, Dtype* weights) {
   cl_command_queue Queue;
-  if (!is_1x1_) {
-    conv_im2col_gpu_opt(input);
-  } else {
-    caffe_gpu_memcpy(K_ * N_ * group_ * opt_num2 * sizeof(Dtype), input,
-        (Dtype*) transMem);
-  }
-  opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
+   conv_im2col_gpu_opt(input);
+   opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0,
       opt_num2);
 
   for (int g = 0; g < group_; ++g) {
@@ -468,7 +447,7 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm_opt(const Dtype* input,
   }
 }
 
-// end: code is written/modified by AMD
+// end: code is modified for OpenCL
 #endif  // !CPU_ONLY
 
 INSTANTIATE_CLASS (BaseConvolutionLayer);
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 4f0175e0..99897e67 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -74,7 +74,7 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  if (use_packing_scheme && global_packing_N > 1)
+  if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1)
     Forward_gpu_batched(bottom, top);
   else
     Forward_gpu_org(bottom, top);
@@ -83,7 +83,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (use_packing_scheme && global_packing_N > 1)
+  if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1)
     Backward_gpu_batched(top, propagate_down, bottom);
   else
     Backward_gpu_org(top, propagate_down, bottom);

From 1511d4e6da461f8b05c24530785842863f9650ec Mon Sep 17 00:00:00 2001
From: Yibing <Yibing.Liu@amd.com>
Date: Sun, 20 Sep 2015 01:31:14 +0800
Subject: [PATCH 123/124] Removed unused variable in base_conv_layer

---
 Makefile.config                      | 7 +++++++
 src/caffe/layers/base_conv_layer.cpp | 4 +---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Makefile.config b/Makefile.config
index 829e2732..eea4c1f3 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -1,6 +1,13 @@
 ## Refer to http://caffe.berkeleyvision.org/installation.html
 # Contributions simplifying and improving our build system are welcome!
 
+# Use OpenCL
+  USE_OPENCL := 1
+# OpenCL directory
+  OCL_DIR := /opt/AMDAPPSDK-2.9-1
+# clBLAS directory
+  CLBLAS_DIR := /opt/clBLAS-2.1
+
 # cuDNN acceleration switch (uncomment to build with cuDNN).
 # USE_CUDNN := 1
 
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index e8241555..5d99e04d 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -353,12 +353,10 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_gemm_opt(const Dtype* input,
     const Dtype* weight, Dtype* output, bool skip_im2col) {
-  cl_command_queue Queue;
-  const Dtype* col_buff = input;
+    cl_command_queue Queue;
     if (!skip_im2col) {
       conv_im2col_gpu_opt(input);
     }
-    col_buff = col_buffer_.gpu_data();
 #ifdef multiQ
   for (int g = 0; g < group_; ++g) {
     if(g == 0) Queue = amdDevice.CommandQueue;

From 3318335404761a1e381f6daa64f0719e3e88e88c Mon Sep 17 00:00:00 2001
From: Junli <Junli.Gu@amd.com>
Date: Sun, 20 Sep 2015 14:16:35 +0800
Subject: [PATCH 124/124] pass col2im_opt unit test

---
 include/caffe/common.hpp                        |   2 +-
 .../test/.test_gradient_check_util.hpp.swo      | Bin 0 -> 16384 bytes
 src/caffe/layers/conv_layer.cpp                 |  16 ++++++++--------
 src/caffe/util/im2col.cpp                       |   2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)
 create mode 100644 include/caffe/test/.test_gradient_check_util.hpp.swo

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 7aed6007..0b455c59 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -82,7 +82,7 @@ private:\
 //the following are macro defines for optimization schmes in conv layer
 /*ifdef: use proposed img_packing scheme;
  ifndef: use proposed packing im2col + sgemm scheme*/
-#define use_packing_scheme 1
+#define use_packing_scheme 0
 /* global_packing_N defines packing number of the use_packing scheme
  for intial design, we use the same packing number for all conv layers*/
 #define global_packing_N 16
diff --git a/include/caffe/test/.test_gradient_check_util.hpp.swo b/include/caffe/test/.test_gradient_check_util.hpp.swo
new file mode 100644
index 0000000000000000000000000000000000000000..e3ebfc99f1458d91db2d1bab13d6b930f447c17f
GIT binary patch
literal 16384
zcmeHNO^h5z6)ubeBw&8R0VIA@_6n;P&)){5NbB{WWj$V7tQ~BxBfzm*HQhBcweIO|
zbyx3tjFTLYI3kb`mxL=qzzq-*7nB@w3KBmON^TqwmvG?%93b$$s_NO9+4b5;K|*r1
z^v!fvy{dZk)vNbj&s4wj?4`^6>DK2Nt`9Tz-Shvr@z{x<9s9vo8Jko?kx}^ruJz03
zo7Z~Rd+XOWE*>TNg6XgwiNQcN$Fdm8=1!F+%9Tv|?PqeCZfrHJM3tAikYe1_vea#r
zMoAS*E88|j^brmV5i6PMFdE6|c35eZv_^S;lpem$Gw^6KaDsjAsg(}Red*Lm{?R9I
zJeoB4S$hUN1D*lTfM>un;2H1?cm^I(26TR$y@<IVbMyb2`+nEL_rKkB*_Gd1sDInl
zKj+GSxKRJ@h3DgLfO;HP?Y}$&o&nE*XTUSy8So5v20R0v0ndPEz%$?(_#ZGJV#Yp>
zeqW{+as2;ZzyJT?LyY|d_%ZMu;0~|{^nt6u7l1CX1Uvye4*dOtjQtJx5%3K_0at-d
z;GZWL`z7!Ka1MC;1B|^1+ynZ+bHGP{W5Cbf&sYJR0{-{}W4{9K0vhN7e|?;>-vYk@
zUId!Jr-8TL$JqCQmw_GNQ^23!%h;>HD?kA(0dKsAv2OvlfEAzvECGLcjIo!1Zvr{+
z6<`zS08O9)JPEvh9P<Ot0-pi?g&#Nm1iS`(4|o<h348+h81QTSxbh3&+rXEBPXcd&
zZ{qx=BYptkxXe+Au#`IJCt1JQjj}4$!E%F_T4)u8ktp@rc|FNxH|U_)N{uTm`iTsq
zyb5DA7zE49XBl<(MRelCKWk+9Ii84#ESg<vSq95D)veYL!<hzytl)u4HCK@9oW<AL
zf_SS`cO``RnKNpce;tGD%1CF$T0hH@F3*cB7ek?C7-xHF(9g8a#;#3++anvc5IuLy
zn6CY-rd^|I8S2cQ@}Rl2+dLPA7)vb+Tivpy7gSN|1|O(lRY;yyn(vK-wjz8_@>I&0
z>&y<$p`1!dUZgR<D~pV)RB80|JWzjfzHza)@ujeLHPBfenl%8`W->R4w@}5mrfcJ<
zo^s~|EQ9sKnrlA8pR2MgV&yaF1#L8xbz`Cn+~1_<*#Yg+S{1i?L8oC_#6k;*Ic*ke
z7b-9_G@G{sr<&<N4R|0EIi`a)d#aSRlBC}3DmJDHt9bw3sX~-x2|uhKJK4jQ{<I?s
z)?x@bTn3ah*lwm`zUj^bZP0NbSvX@~T{X{j<#m+~6B%rg<xjVRnSFz4InEPUEMKEt
zmBPZf-CvA7Z<#ia{I*JOhlNOoGCWXh3#znK!xYv=ER`_3IYB)tWI4)``O;F$oVJt`
zB^!Luyk{tffizbR@NP<-9a!BrrGBQrv!2E^A%PuamXOod92$(%F4ox9GG1kPve!{s
zsWh*&8_+Du9Qjw^U2JRIIrdLuTWT80Y;0pu5(Z=qvOj5>8``zP^p{pRKBgj(kZN#r
z<I>bcQ8lh;?Z_3Fp2{j3!H1Tz<P)h|d_74(7qoN!fOh?Y8{q<OW0=80sL#HvyD}=2
z93C;r%F?P<!VlwMn&f&U`JhT8t+JHI8HP(U&E=guQIXQggvS!f?kPR0yEBwPZ8WTp
zu<>lb1rf2A6}MY_d!*2fed4f`Dv3?6Be5&tuFxlT`2K<%*jrfkfk?1dtu+!O|J`;q
zs#3F~*4lc(huEFe(LrN4Y9}&kLf1Pm{(Y^nmuPZmhx-B=9Dzphljr!()Tp9t3=2Z|
z#ad@67B`TsVXJ|)stOcYLs{m{CL)i7F$FQ7gouu9y1NtZEMsLlRF^V0z})_D8o_K+
zm>;i78?WfJqw5=1gNT;P0x@P>`_m>(W8WMWmwMI-(5UYKr*^*{T+`m2k*G?oc8_4y
zhF1hua?`aJa>if?tmCT*X7i~+!#KI-%G9t;gJ$=-)HiaMW0+=BfupBO!Q;TzuzZ<}
z<}kZG)LtHj)OyKFBuOST7#Cn>u<waVla7#Yprim#up)Su5k;GLYKK?gm8~g=0cAmR
zzQp*@1|>XkX57ZGLb{Y{>6(zRaUjkFt<_HZzOQ?7i$kuAobRrRF$<T%**>i9S#D-p
zuQQFpXJTfZKv3x_l35q^g8B}r?0$d@rni*%QXmcJ6AvZ`2lQD#*oYi3>X<HT=ZhiR
zt}ZhYQ=tR#8d57ycMwn)J3&RKWzD~~T|<~GTx7H3)sva;U^<y;yxSWogeei3#Iod+
zISK@-6ba8b%k(If2skPwSHSxd`%YkVhi{^}^X3e*eC-^+H;_V8`nxNVO5UR)BX?#-
zZaTWDp+HDCHlyKXE+aLVU@19L7-OkZZFnJv#`O)1#gPw!GI`ZUtXpMKBB1O7E7(T&
zPpdEq>_BVjHV-$pJ0BZc3}V}}!O{r9>O~x5gYd1Vz*jLAiMne&vx(Bvm<+#6gCN4f
z@EN1DuFM4@w|T*Wer(B|Fz2UCQ5%t$4w9cbP+_dxR_)iV*}SwnQxc7Z9?cerWSAA$
zH{<TiXlp}rmfd`|=ER3dHf#e!q#{(qmSQ<jspK2$7cTU|?cVk6aP!*w`AfYk+o8?x
zZ){)M3NMCGVvs`kAtdSj|EG9&|2;tO|MlxAJ^zl%{N)+&40r}S1D*lTfM>un;2H1?
bcm_NJo&nFm|04rSGR1$s=7;MC|5ETDvG6#O

literal 0
HcmV?d00001

diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 99897e67..4bfd4dba 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -74,19 +74,19 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1)
-    Forward_gpu_batched(bottom, top);
-  else
+  //if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1)
+    //Forward_gpu_batched(bottom, top);
+  //else
     Forward_gpu_org(bottom, top);
 }
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1)
+  //if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1)
     Backward_gpu_batched(top, propagate_down, bottom);
-  else
-    Backward_gpu_org(top, propagate_down, bottom);
+  //else
+    //Backward_gpu_org(top, propagate_down, bottom);
 }
 
 template <typename Dtype>
@@ -132,6 +132,7 @@ void ConvolutionLayer<Dtype>::Forward_gpu_org(
       //two intermediate variables to pass offset
       this->bottom_offset_ = bottom[i]->offset(n);
       this->top_offset_ = top[i]->offset(n);
+      this->col_offset_ = this->K_ * this->N_;
       this->forward_gpu_gemm(bottom_data, weight, top_data);
 
       if (this->bias_term_) {
@@ -156,7 +157,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu_batched(const vector<Blob<Dtype>*>& t
     // Bias gradient, if necessary.
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
-      ocl_memset(bias_diff, (Dtype)(0.), this->blobs_[1]->count());
       for (int n = 0; n < this->num_; ++n) {
         this->top_offset_ = top[i]->offset(n);
         this->backward_gpu_bias(bias_diff, top_diff);
@@ -186,7 +186,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu_batched(const vector<Blob<Dtype>*>& t
       }
     }
   }
-
 }
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
@@ -203,6 +202,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu_org(const vector<Blob<Dtype>*>& top,
         //
         this->top_offset_ = top[i]->offset(n);
         this->bottom_offset_ = bottom[i]->offset(n);
+        this->col_offset_ = this->K_ * this->N_;
         this->backward_gpu_bias(bias_diff, top_diff);
       }
     }
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 241062a1..e9c07970 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -113,7 +113,7 @@ void col2im_gpu_opt(const Dtype* data_col, const int col_offset,
   cl_kernel Kernel = amdDevice.GetKernel(kernel_name);
   int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
   int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  int num_kernels = channels * height * width;
+  int num_kernels = channels * height * width * optnum;
 
   cl_int ret;
   ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels);